From 947873b34aabe46dfb9f8d06214736cb11b5a6b2 Mon Sep 17 00:00:00 2001 From: Marc Vertes Date: Wed, 9 Aug 2023 11:47:39 +0200 Subject: codegen: add a bytecode generator (#5) * codegen: add a bytecode generator * cleaning scanner, parser and vm1. --- parser/node.go | 2 -- parser/parse.go | 50 +++++++++++++++++++++---------------------- parser/parse_test.go | 4 +--- parser/readme.md | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 30 deletions(-) create mode 100644 parser/readme.md (limited to 'parser') diff --git a/parser/node.go b/parser/node.go index f81285a..d2f13ef 100644 --- a/parser/node.go +++ b/parser/node.go @@ -6,8 +6,6 @@ type Node struct { Child []*Node // sub-tree nodes scanner.Token // token at origin of the node Kind // Node kind, depends on the language spec - - unary bool // TODO(marc): get rid of it. } // TODO: remove it in favor of Walk2 diff --git a/parser/parse.go b/parser/parse.go index cfbff27..4f186bd 100644 --- a/parser/parse.go +++ b/parser/parse.go @@ -1,8 +1,6 @@ package parser -import ( - "github.com/gnolang/parscan/scanner" -) +import "github.com/gnolang/parscan/scanner" const ( Stmt = 1 << iota @@ -31,25 +29,29 @@ func (p *Parser) Parse(src string) (n []*Node, err error) { return p.ParseTokens(tokens) } -func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { +func (p *Parser) ParseTokens(tokens []scanner.Token) (roots []*Node, err error) { // TODO: error handling. - var root *Node // current root node - var expr *Node // current expression root node - var prev, c *Node - var lce *Node - for i, t := range token { + var root *Node // current root node + var expr *Node // current expression root node + var prev, c *Node // previous and current nodes + var lce *Node // last complete expression node + unaryOp := map[*Node]bool{} // unaryOp indicates if a node is an unary operator. 
+ + for i, t := range tokens { prev = c c = &Node{ Token: t, Kind: p.Spec[t.Name()].Kind, - unary: t.Kind() == scanner.Operator && (i == 0 || token[i-1].Kind() == scanner.Operator), } - if c.Kind == 0 { + if t.IsOperator() && (i == 0 || tokens[i-1].IsOperator()) { + unaryOp[c] = true + } + if c.Kind == Undefined { switch t.Kind() { case scanner.Number: - c.Kind = p.Spec["#num"].Kind + c.Kind = NumberLit case scanner.Identifier: - c.Kind = p.Spec["#id"].Kind + c.Kind = Ident } } @@ -65,20 +67,20 @@ func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { continue } - if t.Kind() == scanner.Block { + if t.IsBlock() { if expr != nil { - if p.hasBlockProp(c, ExprSep) && p.isExprSep(root) { + if p.hasProp(c, ExprSep) && p.isExprSep(root) { // A bracket block may end a previous expression. root.Child = append(root.Child, expr) expr = nil - } else if p.hasBlockProp(c, Call) && !p.hasProp(root, Decl) && p.canCallToken(token[i-1]) { + } else if p.hasProp(c, Call) && !p.hasProp(root, Decl) && p.canCallToken(tokens[i-1]) { // Handle (possibly nested) call expressions. if lce == nil || lce != expr { // TODO(marc): not general, fix it. lce = prev } lce.Child = []*Node{{Token: lce.Token, Child: lce.Child, Kind: lce.Kind}} lce.Token = scanner.NewToken("Call", c.Pos()) - lce.Kind = p.Spec["#call"].Kind + lce.Kind = CallExpr } } tcont := t.Content() @@ -91,10 +93,10 @@ func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { } // Process the end of an expression or a statement. 
- if t.Kind() == scanner.Separator { + if t.IsSeparator() { if expr != nil && p.hasProp(root, Stmt) { root.Child = append(root.Child, expr) - if p.hasBlockProp(expr, ExprSep) { + if p.hasProp(expr, ExprSep) { roots = append(roots, root) root = nil } @@ -129,7 +131,7 @@ func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { ep := p.Spec[expr.Content()].Order cp := p.Spec[c.Content()].Order a := expr - if c.unary { + if unaryOp[c] { cp = 0 } if cp != 0 { @@ -143,7 +145,7 @@ func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { c1 := expr for { a = c1 - if c1.unary { + if unaryOp[c1] { c1, c = c, c1 a = c1 if c == expr { @@ -155,7 +157,7 @@ func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { break } c1 = c1.Child[1] - if c1.Token.Kind() != scanner.Operator || c1.unary || cp > p.Spec[c1.Content()].Order { + if !c1.IsOperator() || unaryOp[c1] || cp > p.Spec[c1.Content()].Order { break } } @@ -194,9 +196,7 @@ func (p *Parser) isExprSep(n *Node) bool { return p.Spec[n.Content()].F func (p *Parser) isExpr(n *Node) bool { return !p.isStatement(n) && !p.isExprSep(n) } func (p *Parser) isSep(n *Node) bool { return n.Token.Kind() == scanner.Separator } func (p *Parser) IsBlock(n *Node) bool { return n.Token.Kind() == scanner.Block } -func (p *Parser) hasBlockProp(n *Node, prop uint) bool { - return p.Spec[n.Content()[:n.Start()]].Flags&prop != 0 -} + func (p *Parser) precedenceToken(t scanner.Token) int { s := t.Content() if l := t.Start(); l > 0 { diff --git a/parser/parse_test.go b/parser/parse_test.go index 3bf6955..838e1c8 100644 --- a/parser/parse_test.go +++ b/parser/parse_test.go @@ -57,19 +57,17 @@ var GoParser = &Parser{ "<": {InfOp, 0, 6}, ":=": {DefOp, 0, 7}, "=": {AssignOp, 0, 7}, - "#Call": {CallExpr, 0, 0}, "if": {IfStmt, Stmt | ExprSep, 0}, "func": {FuncDecl, Decl | Call, 0}, "return": {ReturnStmt, Stmt, 0}, "{..}": {StmtBloc, ExprSep, 0}, - "{": {StmtBloc, ExprSep, 0}, - "(": 
{ParBloc, Call, 0}, "(..)": {ParBloc, Call, 0}, }, } func TestParse(t *testing.T) { for _, test := range goTests { + test := test t.Run("", func(t *testing.T) { var err error errStr := "" diff --git a/parser/readme.md b/parser/readme.md new file mode 100644 index 0000000..19d8778 --- /dev/null +++ b/parser/readme.md @@ -0,0 +1,60 @@ +# Parser + +A parser takes an array of tokens (produced by the scanner) as input and +returns a node representing a syntax tree. A node is an object +containing a kind, the corresponding token and the ordered references to +descendant nodes. + +A goal is to make the parser generic enough so it can generate syntax +trees for most existing programming languages (no claim of generality +yet), provided a small set of generating rules per language, and a small +set of validating rules (yet to be defined) to detect invalid +constructs. + +The input tokens are particular in the sense that they include classical +lexical items such as words, separators, numbers, but also strings and +nested blocks, which are resolved at scanning stage rather than parsing +stage. See the scanner for more details. + +The language specification includes the following: + +- a scanner specification, to produce the set of possible tokens. +- a map of node specification per token name. The node specification + defines some parameters influencing how the tree is generated. + +## Development status + +A successful test must be provided to check the status.
+ +- [x] binary operator expressions +- [x] unary operator (prefix) expressions +- [ ] unary operator (suffix) expressions +- [x] operator precedence rules +- [x] parenthesis in expressions +- [ ] semi-colon automatic insertion rules +- [x] call expressions +- [ ] nested calls +- [x] index expressions +- [x] single assignments +- [ ] multi assignments +- [x] simple `if` statement (no `else`) +- [ ] full `if` statement (including `else`, `else if`) +- [x] init expressions in `if` statements +- [x] statement blocks +- [ ] comments +- [ ] for statement +- [ ] switch statement +- [ ] select statement +- [x] return statement +- [x] function declaration +- [ ] method declaration +- [ ] anonymous functions (closures) +- [ ] type declaration +- [ ] var, const, type single declaration +- [ ] var, const, type multi declaration +- [ ] type parametric expressions +- [x] literal numbers (see scanner) +- [x] literal strings +- [ ] composite literal +- [ ] import statements +- [ ] go.mod syntax -- cgit v1.2.3