From 37b9da32d3b911091deb254f6cba2a137c471287 Mon Sep 17 00:00:00 2001 From: Marc Vertes Date: Thu, 12 Oct 2023 10:51:58 +0200 Subject: move to a direct byte code compiler (#8) * chore: refactor to keep only the new parser and bytecode vm * scanner: remove Token.value field * scanner: remove scanner.kind field * chore: move language specification in lang package This avoid a cyclic dependency in scanner_test which can now use the golang/GoSpec language specification for Go. * clean code * scanner: export scanner fields Also parser now generate function calls, including externals. * chore: fix lint issues * parser: handle strings * wip * parser: implement support for 'if, else, else if' statements Resolving labels in the compiler still in progress. * parser: support if statements, improve compiler * improve handling of functions * improve support of local variables * scanner: trim leading and trailing spaces * fixes to make fibonacci work * parser: improve README, fix function parameters parsing --- parser/parse.go | 519 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 324 insertions(+), 195 deletions(-) (limited to 'parser/parse.go') diff --git a/parser/parse.go b/parser/parse.go index ca89467..5f0b643 100644 --- a/parser/parse.go +++ b/parser/parse.go @@ -1,244 +1,373 @@ package parser import ( + "fmt" + "log" + "strconv" + "strings" + + "github.com/gnolang/parscan/lang" "github.com/gnolang/parscan/scanner" ) -const ( - Stmt = 1 << iota - ExprSep - Call - Index - Decl - MultiOp -) +type Parser struct { + *scanner.Scanner -type NodeSpec struct { - Kind // AST node kind - Flags uint // composable properties used for AST generation - Order int // operator precedence order + symbols map[string]*symbol + function *symbol + scope string + fname string + labelCount map[string]int } -type Parser struct { - *scanner.Scanner - Spec map[string]NodeSpec +func (p *Parser) Scan(s string, endSemi bool) (Tokens, error) { + return p.Scanner.Scan(s, endSemi) } -func (p *Parser) Parse(src string, ctx *Node) (nodes []*Node, err error) { - tokens, err := p.Scan(src) - if err != nil { - return +type Tokens []scanner.Token + +func (toks Tokens) String() (s string) { + for _, t := range toks { + s += fmt.Sprintf("%#v ", t.Str) } - return p.ParseTokens(tokens, ctx) + return s } -func (p *Parser) ParseTokens(tokens []*scanner.Token, ctx *Node) (nodes []*Node, err error) { - // TODO: error handling. - var root *Node // current root node - var expr *Node // current expression root node - var prev, c *Node // previous and current nodes - var lce *Node // last complete expression node - unaryOp := map[*Node]bool{} // unaryOp indicates if a node is an unary operator. 
- prevToken := map[*Node]*scanner.Token{} - - for i, t := range tokens { - prev = c - c = &Node{ - Token: t, - Kind: p.Spec[t.Name()].Kind, +func (toks Tokens) Index(id lang.TokenId) int { + for i, t := range toks { + if t.Id == id { + return i } - if i > 0 { - prevToken[c] = tokens[i-1] + } + return -1 +} + +func (toks Tokens) LastIndex(id lang.TokenId) int { + for i := len(toks) - 1; i >= 0; i-- { + if toks[i].Id == id { + return i } - if c.Kind == Comment { - continue + } + return -1 +} + +func (toks Tokens) Split(id lang.TokenId) (result []Tokens) { + for { + i := toks.Index(id) + if i < 0 { + return append(result, toks) } - if t.IsOperator() && (i == 0 || tokens[i-1].IsOperator()) { - unaryOp[c] = true + result = append(result, toks[:i]) + toks = toks[i+1:] + } +} + +func (p *Parser) Parse(src string) (out Tokens, err error) { + log.Printf("Parse src: %#v\n", src) + in, err := p.Scan(src, true) + if err != nil { + return out, err + } + log.Println("Parse in:", in) + for len(in) > 0 { + endstmt := in.Index(lang.Semicolon) + if endstmt == -1 { + return out, scanner.ErrBlock } - if c.Kind == Undefined { - switch t.Kind() { - case scanner.Number: - c.Kind = LiteralNumber - case scanner.Identifier: - c.Kind = Ident + + // Skip over simple init statements for some tokens (if, for, ...) + if lang.HasInit[in[0].Id] { + log.Println("may have init") + for in[endstmt-1].Id != lang.BraceBlock { + e2 := in[endstmt+1:].Index(lang.Semicolon) + if e2 == -1 { + return out, scanner.ErrBlock + } + endstmt += 1 + e2 } } + o, err := p.ParseStmt(in[:endstmt]) + if err != nil { + return out, err + } + out = append(out, o...) + in = in[endstmt+1:] + } + return out, err +} - if root == nil { - if p.isSep(c) { - continue - } - lce = nil - root = c - if p.isExpr(c) { - expr = c - } +func (p *Parser) ParseStmt(in Tokens) (out Tokens, err error) { + log.Println("ParseStmt in:", in) + if len(in) == 0 { + return nil, nil + } + switch t := in[0]; t.Id { + case lang.Func: + return p.ParseFunc(in) + case lang.If: + return p.ParseIf(in) + case lang.Return: + return p.ParseReturn(in) + default: + return p.ParseExpr(in) + } +} + +func (p *Parser) ParseFunc(in Tokens) (out Tokens, err error) { + // TODO: handle anonymous functions (no function name) + // TODO: handle receiver (methods) + // TODO: handle parametric types (generics) + // TODO: handle variadic parameters + fname := in[1].Str + ofname := p.fname + p.fname = fname + ofunc := p.function + s, _, ok := p.getSym(fname, p.scope) + if !ok { + s = &symbol{} + p.symbols[p.scope+fname] = s + } + p.pushScope(fname) + defer func() { + p.fname = ofname // TODO remove if favor of function. + p.function = ofunc + p.popScope() + }() + + out = Tokens{ + {Id: lang.Enter, Str: "enter " + p.scope}, + {Id: lang.Goto, Str: "goto " + fname + "_end"}, // Skip function definition. + {Id: lang.Label, Pos: in[0].Pos, Str: fname}, + } + + bi := in.Index(lang.BraceBlock) + if bi < 0 { + return out, fmt.Errorf("no function body") + } + typ, err := p.ParseType(in[:bi]) + if err != nil { + return out, err + } + s.kind = symFunc + s.Type = typ + p.function = s + + log.Println("body:", in[len(in)-1].Block()) + toks, err := p.Parse(in[len(in)-1].Block()) + if err != nil { + return out, err + } + out = append(out, toks...) 
+ out = append(out, + scanner.Token{Id: lang.Label, Str: fname + "_end"}, + scanner.Token{Id: lang.Exit}, + ) + log.Println("symbols", p.symbols) + return out, err +} + +func (p *Parser) ParseIf(in Tokens) (out Tokens, err error) { + prefix := p.fname + "_if" + strconv.Itoa(p.labelCount[p.scope+p.fname]) + p.labelCount[p.scope+p.fname]++ + sc := 0 + ssc := strconv.Itoa(sc) + // We start from the end of the statement and examine tokens backward to + // get the destination labels already computed when jumps are set. + i := len(in) - 1 + for i > 0 { + if in[i].Id != lang.BraceBlock { + return nil, fmt.Errorf("expected '{', got %v", in[i]) + } + pre, err := p.Parse(in[i].Block()) + if err != nil { + return nil, err + } + //pre := append(Tokens{{Id: lang.Label, Str: prefix + "_b" + ssc}}, blockout...) + if sc > 0 { + pre = append(pre, scanner.Token{Id: lang.Goto, Str: "goto " + prefix + "_e0"}) + } + pre = append(pre, scanner.Token{Id: lang.Label, Str: prefix + "_e" + ssc}) + out = append(pre, out...) + + i-- + ifp := in[:i].LastIndex(lang.If) + + if in[i].Id == lang.Else { // Step over final 'else'. + i-- + sc++ + ssc = strconv.Itoa(sc) continue } - if t.IsBlock() { - if expr != nil { - if p.hasProp(c, ExprSep) && p.isExprSep(root) { - // A bracket block may end a previous expression. - root.Child = append(root.Child, expr) - expr = nil - } else if p.hasProp(c, Call) && !p.hasProp(root, Decl) && p.canCallToken(tokens[i-1]) { - // Handle (possibly nested) call expressions. - if lce == nil || lce != expr { // TODO(marc): not general, fix it. - lce = prev - } - lce.Child = []*Node{{Token: lce.Token, Child: lce.Child, Kind: lce.Kind}} - lce.Token = scanner.NewToken("Call", c.Pos()) - lce.Kind = ExprCall - } - } - tcont := t.Content() - s := tcont[t.Start() : len(tcont)-t.End()] - n2, err := p.Parse(s, c) - if err != nil { + pre = Tokens{} + var init, cond Tokens + initcond := in[ifp+1 : i+1] + if ii := initcond.Index(lang.Semicolon); ii < 0 { + cond = initcond + } else { + init = initcond[:ii] + cond = initcond[ii+1:] + } + if len(init) > 0 { + if init, err = p.ParseStmt(init); err != nil { return nil, err } - c.Child = append(c.Child, n2...) - } - - // Process the end of an expression or a statement. - if t.IsSeparator() { - if t.Content() == "," && ctx.Kind != BlockParen { - // ignore comma separator in field lists - } else if expr != nil && p.hasProp(root, Stmt) { - root.Child = append(root.Child, expr) - if p.hasProp(expr, ExprSep) { - nodes = append(nodes, root) - root = nil - } - expr = nil - } else { - if expr != nil { - root = expr - } - nodes = append(nodes, root) - expr = nil - root = nil - } - continue + pre = append(pre, init...) } + if cond, err = p.ParseExpr(cond); err != nil { + return nil, err + } + pre = append(pre, cond...) + pre = append(pre, scanner.Token{Id: lang.JumpFalse, Str: "JumpFalse " + prefix + "_e" + ssc}) + out = append(pre, out...) + i = ifp + if i > 1 && in[i].Id == lang.If && in[i-1].Id == lang.Else { // Step over 'else if'. + i -= 2 + } + sc++ + ssc = strconv.Itoa(sc) + } + log.Println("prefix:", prefix) + log.Println("if tokens:", out) + return out, err +} - // We assume from now that current node is part of an expression subtree. - if expr == nil { - if p.isStatement(root) { - expr = c - continue +func (p *Parser) ParseReturn(in Tokens) (out Tokens, err error) { + if len(in) > 1 { + if out, err = p.ParseExpr(in[1:]); err != nil { + return out, err + } + } + + // TODO: the function symbol should be already present in the parser context. 
+ // otherwise no way to handle anonymous func. + s := p.function + in[0].Beg = s.Type.NumOut() + in[0].End = s.Type.NumIn() + log.Println("ParseReturn:", p.fname, in[0]) + out = append(out, in[0]) + return out, err +} + +func (p *Parser) ParseExpr(in Tokens) (out Tokens, err error) { + log.Println("ParseExpr in:", in) + var ops Tokens + var vl int + // + // Process tokens from last to first, the goal is to reorder the tokens in + // a stack machine processing order, so it can be directly interpreted. + // + for i := len(in) - 1; i >= 0; i-- { + t := in[i] + // temporary assumptions: binary operators, returning 1 value + switch t.Id { + case lang.Ident, lang.Int, lang.String: + out = append(out, t) + vl++ + case lang.Define, lang.Add, lang.Sub, lang.Assign, lang.Equal, lang.Less: + // TODO: handle operator precedence to swap operators / operands if necessary + if vl < 2 { + ops = append(ops, t) + break } - expr = root - } - - // Update the expression subtree according to binary operator precedence rules. - // - operators are binary infix by default. - // - if an operator follows another, then it's unary prefix. - // - if an expression starts by an operator, then it's unary prefix. - // - non operator nodes have a default precedence of 0. - // TODO: handle postfix unary (i.e. ++) and ternary (i.e. ?:) - // - ep := p.Spec[expr.Content()].Order - cp := p.Spec[c.Content()].Order - a := expr - if unaryOp[c] { - cp = 0 - } - if cp != 0 { - if cp > ep { - // Perform an operator permutation at expr root as required by precedence. - // TODO(marc): maybe it can be generalized in below else branch. - expr, c = c, expr - a = expr // Temporary ancestor: its children may have to be permuted. - } else { - // Findout if an operator permutation is necessary in subtree. - c1 := expr - for { - a = c1 - if unaryOp[c1] { - c1, c = c, c1 - a = c1 - if c == expr { - expr = a - } - break - } - if len(c1.Child) < 2 { - break - } - c1 = c1.Child[1] - if !c1.IsOperator() || unaryOp[c1] || cp > p.Spec[c1.Content()].Order { + case lang.ParenBlock: + // If the previous token is an arithmetic, logic or assign operator then + // this parenthesis block is an enclosed expr, otherwise a call expr. + if i == 0 || in[i-1].Id.IsOperator() { + out = append(out, t) + vl++ + break + } + // The call expression can be a function call, a conversion, + // a type assersion (including for type switch) + // func call: push args and func address then call + out = append(out, t) + vl++ + if t2 := in[i-1]; t2.Id == lang.Ident { + if s, sc, ok := p.getSym(t2.Str, p.scope); ok { + log.Println("callExpr:", t2.Str, p.scope, s, ok, sc) + if s.kind == symValue { + // Store the number of input parameters in the token Beg field. + ops = append(ops, scanner.Token{Str: "callX", Id: lang.CallX, Pos: t.Pos, Beg: p.numItems(t.Block(), lang.Comma)}) break } } - // No permutation occured. Append current to last visited ancestor. 
- if len(a.Child) > 1 { - a.Child = a.Child[:1] - c.Child = append(c.Child, c1) - } - } - } else if ep != 0 { - for len(a.Child) > 1 { - a = a.Child[1] } + ops = append(ops, scanner.Token{Str: "call", Id: lang.Call, Pos: t.Pos}) } - a.Child = append(a.Child, c) - if p.hasProp(a, Call) { - lce = a - } - } - if root != nil && p.isStatement(root) { - if expr != nil { - root.Child = append(root.Child, expr) + if ol := len(ops); ol > 0 && vl > ol { + op := ops[ol-1] + ops = ops[:ol-1] + out = append(out, op) + vl-- } - } else if expr != nil { - root = expr } - if root != nil { - // /* - if p.hasProp(root, MultiOp) { - for { - if !p.fixMultiOp(root, prevToken) { - break - } + out = append(out, ops...) // TODO: verify that ops should be added in this order. + + log.Println("ParseExpr out:", out, "vl:", vl, "ops:", ops) + // The tokens are now properly ordered, process nested blocks. + for i := len(out) - 1; i >= 0; i-- { + t := out[i] + var toks Tokens + switch t.Id { + case lang.ParenBlock, lang.BracketBlock: + if toks, err = p.ParseExprStr(t.Block()); err != nil { + return out, err } + default: + continue } - // */ - nodes = append(nodes, root) + + // replace block token by its parsed result. + log.Println("toks:", toks) + out2 := append(Tokens{}, out[:i]...) + out2 = append(out2, toks...) + out = append(out2, out[i+1:]...) } - return nodes, err + log.Println("Final out:", out) + return out, err } -func (p *Parser) fixMultiOp(root *Node, prevToken map[*Node]*scanner.Token) bool { - for i, c := range root.Child { - for j, cc := range c.Child { - if pt := prevToken[cc]; pt != nil && pt.Content() == "," { - c.RemoveChild(j) - root.InsertChild(cc, i) - return true - } +func (p *Parser) ParseExprStr(s string) (tokens Tokens, err error) { + if tokens, err = p.Scan(s, false); err != nil { + return + } + var result Tokens + for _, sub := range tokens.Split(lang.Comma) { + toks, err := p.ParseExpr(sub) + if err != nil { + return result, err } + result = append(toks, result...) } - return false + return result, err } -func (p *Parser) hasProp(n *Node, prop uint) bool { return p.Spec[n.Name()].Flags&prop != 0 } -func (p *Parser) isStatement(n *Node) bool { return p.Spec[n.Content()].Flags&Stmt != 0 } -func (p *Parser) isExprSep(n *Node) bool { return p.Spec[n.Content()].Flags&ExprSep != 0 } -func (p *Parser) isExpr(n *Node) bool { return !p.isStatement(n) && !p.isExprSep(n) } -func (p *Parser) isSep(n *Node) bool { return n.Token.Kind() == scanner.Separator } -func (p *Parser) IsBlock(n *Node) bool { return n.Token.Kind() == scanner.Block } - -func (p *Parser) precedenceToken(t *scanner.Token) int { - s := t.Content() - if l := t.Start(); l > 0 { - s = s[:l] +func (p *Parser) numItems(s string, sep lang.TokenId) int { + tokens, err := p.Scan(s, false) + if err != nil { + return -1 } - return p.Spec[s].Order + r := 0 + for _, t := range tokens.Split(sep) { + if len(t) == 0 { + continue + } + r++ + } + return r +} + +func (p *Parser) pushScope(name string) { + p.scope = strings.TrimPrefix(p.scope+"/"+name+"/", "/") } -func (p *Parser) canCallToken(t *scanner.Token) bool { - return p.precedenceToken(t) == 0 || p.Spec[t.Name()].Flags&Call != 0 +func (p *Parser) popScope() { + p.scope = strings.TrimSuffix(p.scope, "/") + j := strings.LastIndex(p.scope, "/") + if j == -1 { + p.scope = "" + return + } + p.scope = p.scope[:j] } -- cgit v1.2.3
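The new Tokens helpers (Index, LastIndex, Split) introduced by this patch drive both statement splitting in Parse (on semicolons) and argument splitting in ParseExprStr and numItems (on commas). A minimal standalone sketch of that splitting technique, using a simplified stand-in token type rather than the real scanner.Token:

package main

import "fmt"

// token is a simplified stand-in for scanner.Token; only the fields
// needed to illustrate splitting are kept (illustrative, not the parscan API).
type token struct {
	id  string
	str string
}

// split cuts a token slice at every separator, in the same spirit as the
// Tokens.Split helper added by this patch (used there to break a call's
// argument list on commas before each piece is parsed on its own).
func split(toks []token, sep string) (result [][]token) {
	for {
		i := -1
		for j, t := range toks {
			if t.id == sep {
				i = j
				break
			}
		}
		if i < 0 {
			return append(result, toks)
		}
		result = append(result, toks[:i])
		toks = toks[i+1:]
	}
}

func main() {
	args := []token{{"int", "1"}, {"comma", ","}, {"ident", "n"}, {"comma", ","}, {"int", "3"}}
	for _, a := range split(args, "comma") {
		fmt.Println(a)
	}
}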
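ParseFunc wraps the compiled body in a small scaffold: an "enter" for the function scope, a jump over the definition so linear top-level execution skips it, an entry label used as the call target, then an end label and an "exit". A sketch of that layout with plain strings standing in for the emitted tokens; the function name "fib" and scope "fib/" are illustrative only:

package main

import "fmt"

// wrapFunc mirrors the scaffolding that ParseFunc emits around a compiled
// function body in this patch: enter scope, goto over the definition,
// entry label, body, end label, exit.
func wrapFunc(name, scope string, body []string) []string {
	out := []string{
		"enter " + scope,
		"goto " + name + "_end", // skip over the definition at top level
		"label " + name,         // call target
	}
	out = append(out, body...)
	out = append(out,
		"label "+name+"_end",
		"exit",
	)
	return out
}

func main() {
	for _, ins := range wrapFunc("fib", "fib/", []string{"...compiled body..."}) {
		fmt.Println(ins)
	}
}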
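ParseIf processes the statement backwards so that every jump it emits targets a label that is already known. For a simple if/else the resulting flat stream is: condition, JumpFalse into the else branch, then-block, goto the common exit, else branch, exit label. A sketch of that layout, keeping the prefix+"_e"+n label convention from the patch with "_e0" as the common exit (a consequence of the backward numbering); the "f_if0" prefix and the string instructions are stand-ins, not real scanner tokens:

package main

import "fmt"

// lowerIf shows the flat label/jump layout produced for
// "if cond { then } else { else }" by the backward pass above.
func lowerIf(cond, thenBlk, elseBlk []string) []string {
	const prefix = "f_if0"
	out := append([]string{}, cond...)
	out = append(out, "JumpFalse "+prefix+"_e1") // condition false: go to the else branch
	out = append(out, thenBlk...)
	out = append(out, "goto "+prefix+"_e0") // skip the else branch
	out = append(out, "label "+prefix+"_e1")
	out = append(out, elseBlk...)
	out = append(out, "label "+prefix+"_e0")
	return out
}

func main() {
	for _, ins := range lowerIf([]string{"<cond>"}, []string{"<then>"}, []string{"<else>"}) {
		fmt.Println(ins)
	}
}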
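ParseExpr walks the expression from right to left and reorders tokens so operands come before the operator that consumes them, which is the order a stack-based VM can execute directly; operator precedence is still a TODO in the patch. A stripped-down sketch of that reordering, with plain strings instead of scanner tokens and only a few binary operators:

package main

import "fmt"

// reorder emits operands as they are met (right to left) and pops a pending
// operator as soon as more values than pending operators have been seen,
// yielding the operand-first order consumed by the bytecode VM. Precedence
// is not handled, matching the TODO in the patch.
func reorder(in []string) (out []string) {
	isOp := func(s string) bool {
		return s == "+" || s == "-" || s == "*" || s == "=" || s == "<" || s == "=="
	}
	var ops []string
	vl := 0 // number of values emitted so far
	for i := len(in) - 1; i >= 0; i-- {
		if t := in[i]; isOp(t) {
			ops = append(ops, t)
		} else {
			out = append(out, t)
			vl++
		}
		// Pop one pending operator once enough values are available.
		if ol := len(ops); ol > 0 && vl > ol {
			out = append(out, ops[ol-1])
			ops = ops[:ol-1]
			vl--
		}
	}
	return append(out, ops...)
}

func main() {
	fmt.Println(reorder([]string{"a", "=", "b", "+", "c"})) // [c b + a =]
}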
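pushScope and popScope keep the current scope as a slash-terminated prefix so that symbols can be keyed by scope plus name (for example, a parameter of fib would be stored under a key like "fib/n"). A small sketch reusing the same TrimPrefix/TrimSuffix approach; the parser struct and symbol map here are illustrative, not the actual parscan types:

package main

import (
	"fmt"
	"strings"
)

type parser struct {
	scope   string
	symbols map[string]string
}

// pushScope appends a new scope segment, keeping the slash-terminated form
// used as a symbol key prefix.
func (p *parser) pushScope(name string) {
	p.scope = strings.TrimPrefix(p.scope+"/"+name+"/", "/")
}

// popScope drops the innermost scope segment.
func (p *parser) popScope() {
	p.scope = strings.TrimSuffix(p.scope, "/")
	j := strings.LastIndex(p.scope, "/")
	if j == -1 {
		p.scope = ""
		return
	}
	p.scope = p.scope[:j]
}

func main() {
	p := &parser{symbols: map[string]string{}}
	p.pushScope("fib")
	p.symbols[p.scope+"n"] = "local parameter" // keyed as "fib/n"
	fmt.Println(p.scope)                        // fib/
	p.popScope()
	fmt.Printf("%q\n", p.scope) // ""
}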