From 37b9da32d3b911091deb254f6cba2a137c471287 Mon Sep 17 00:00:00 2001 From: Marc Vertes Date: Thu, 12 Oct 2023 10:51:58 +0200 Subject: move to a direct byte code compiler (#8) * chore: refactor to keep only the new parser and bytecode vm * scanner: remove Token.value field * scanner: remove scanner.kind field * chore: move language specification in lang package This avoid a cyclic dependency in scanner_test which can now use the golang/GoSpec language specification for Go. * clean code * scanner: export scanner fields Also parser now generate function calls, including externals. * chore: fix lint issues * parser: handle strings * wip * parser: implement support for 'if, else, else if' statements Resolving labels in the compiler still in progress. * parser: support if statements, improve compiler * improve handling of functions * improve support of local variables * scanner: trim leading and trailing spaces * fixes to make fibonacci work * parser: improve README, fix function parameters parsing --- .gitignore | 2 +- _samples/add | 2 + _samples/fib | 6 + _samples/p00 | 1 + _samples/p01 | 2 + _samples/p02 | 7 + _samples/p03 | 3 + _samples/p04 | 4 + _samples/p05 | 10 + _samples/p06 | 7 + _samples/p07 | 5 + codegen/compiler.go | 242 --------------------- codegen/compiler_test.go | 58 ----- codegen/expression.go | 41 ---- codegen/interpreter.go | 53 ----- codegen/interpreter_test.go | 48 ---- lang/golang/go.go | 183 ++++++++-------- lang/spec.go | 43 ++++ lang/token.go | 110 ++++++++++ main.go | 23 +- parser/README.md | 157 +++++++++----- parser/compiler.go | 250 +++++++++++++++++++++ parser/dot.go | 89 -------- parser/interpreter.go | 52 +++++ parser/interpreter_test.go | 66 ++++++ parser/kind.go | 56 ----- parser/node.go | 50 ----- parser/parse.go | 519 +++++++++++++++++++++++++++----------------- parser/parse_test.go | 257 ---------------------- parser/symbol.go | 67 ++++++ parser/type.go | 117 ++++++++++ scanner/scan.go | 240 ++++++++------------ scanner/scan_test.go | 101 ++------- testdata/add | 2 - testdata/fib | 6 - testdata/p00 | 1 - testdata/p01 | 2 - testdata/p02 | 7 - testdata/p03 | 3 - testdata/p04 | 4 - testdata/p05 | 10 - testdata/p06 | 7 - testdata/p07 | 5 - vm/README.md | 38 ++++ vm/vm.go | 229 +++++++++++++++++++ vm/vm_test.go | 155 +++++++++++++ vm0/README.md | 27 --- vm0/func.go | 75 ------- vm0/vm.go | 157 -------------- vm0/vm_test.go | 26 --- vm1/README.md | 38 ---- vm1/vm.go | 196 ----------------- vm1/vm_test.go | 122 ----------- 53 files changed, 1815 insertions(+), 2166 deletions(-) create mode 100644 _samples/add create mode 100644 _samples/fib create mode 100644 _samples/p00 create mode 100644 _samples/p01 create mode 100644 _samples/p02 create mode 100644 _samples/p03 create mode 100644 _samples/p04 create mode 100644 _samples/p05 create mode 100644 _samples/p06 create mode 100644 _samples/p07 delete mode 100644 codegen/compiler.go delete mode 100644 codegen/compiler_test.go delete mode 100644 codegen/expression.go delete mode 100644 codegen/interpreter.go delete mode 100644 codegen/interpreter_test.go create mode 100644 lang/spec.go create mode 100644 lang/token.go create mode 100644 parser/compiler.go delete mode 100644 parser/dot.go create mode 100644 parser/interpreter.go create mode 100644 parser/interpreter_test.go delete mode 100644 parser/kind.go delete mode 100644 parser/node.go delete mode 100644 parser/parse_test.go create mode 100644 parser/symbol.go create mode 100644 parser/type.go delete mode 100644 testdata/add delete mode 100644 testdata/fib delete 
mode 100644 testdata/p00 delete mode 100644 testdata/p01 delete mode 100644 testdata/p02 delete mode 100644 testdata/p03 delete mode 100644 testdata/p04 delete mode 100644 testdata/p05 delete mode 100644 testdata/p06 delete mode 100644 testdata/p07 create mode 100644 vm/README.md create mode 100644 vm/vm.go create mode 100644 vm/vm_test.go delete mode 100644 vm0/README.md delete mode 100644 vm0/func.go delete mode 100644 vm0/vm.go delete mode 100644 vm0/vm_test.go delete mode 100644 vm1/README.md delete mode 100644 vm1/vm.go delete mode 100644 vm1/vm_test.go diff --git a/.gitignore b/.gitignore index 9529633..e2bf6fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ *.out *.test -gint +parscan diff --git a/_samples/add b/_samples/add new file mode 100644 index 0000000..a403485 --- /dev/null +++ b/_samples/add @@ -0,0 +1,2 @@ +func add(a int, b int) int { return a + b } +add(4, 3) diff --git a/_samples/fib b/_samples/fib new file mode 100644 index 0000000..654c5c0 --- /dev/null +++ b/_samples/fib @@ -0,0 +1,6 @@ +func fib(i int) int { + if i < 2 { return i } + return fib(i-2) + fib(i-1) +} + +println(fib(6)) diff --git a/_samples/p00 b/_samples/p00 new file mode 100644 index 0000000..19f5084 --- /dev/null +++ b/_samples/p00 @@ -0,0 +1 @@ +1+2 diff --git a/_samples/p01 b/_samples/p01 new file mode 100644 index 0000000..baafaa9 --- /dev/null +++ b/_samples/p01 @@ -0,0 +1,2 @@ +s := "Hello" +println(s, 5+2) diff --git a/_samples/p02 b/_samples/p02 new file mode 100644 index 0000000..1aeb4a9 --- /dev/null +++ b/_samples/p02 @@ -0,0 +1,7 @@ +a := 1 + +if a < 2 { + println("yep") +} + +println("ok") diff --git a/_samples/p03 b/_samples/p03 new file mode 100644 index 0000000..39a3cda --- /dev/null +++ b/_samples/p03 @@ -0,0 +1,3 @@ +func f(a int, b int) int { return a + b } + +println("f:", f(3, 4), f(5, 6)) diff --git a/_samples/p04 b/_samples/p04 new file mode 100644 index 0000000..f2a85c8 --- /dev/null +++ b/_samples/p04 @@ -0,0 +1,4 @@ +func f() int { println(a) } + +a := 4 +f() diff --git a/_samples/p05 b/_samples/p05 new file mode 100644 index 0000000..51c2c9b --- /dev/null +++ b/_samples/p05 @@ -0,0 +1,10 @@ +func f(i int) int { + if i < 2 { + if i == 1 { + return + } } + println("i", i) +} + +f(1) +println("bye") diff --git a/_samples/p06 b/_samples/p06 new file mode 100644 index 0000000..88029cc --- /dev/null +++ b/_samples/p06 @@ -0,0 +1,7 @@ +func f(i int) { + if i < 2 { return } + println("i > 1:", i) +} + +f(1) +println("bye") diff --git a/_samples/p07 b/_samples/p07 new file mode 100644 index 0000000..2fa7ee0 --- /dev/null +++ b/_samples/p07 @@ -0,0 +1,5 @@ +func f1() { println("in f1") } + +func f2() { println("in f2"); f1() } + +f2() diff --git a/codegen/compiler.go b/codegen/compiler.go deleted file mode 100644 index 02b67cb..0000000 --- a/codegen/compiler.go +++ /dev/null @@ -1,242 +0,0 @@ -package codegen - -import ( - "fmt" - "log" - "strings" - - "github.com/gnolang/parscan/parser" - "github.com/gnolang/parscan/vm1" -) - -type symbol struct { - index int // address of symbol in frame - local bool // if true address is relative to local frame, otherwise global - node *parser.Node // symbol definition in AST -} - -type Compiler struct { - Code [][]int64 // produced code, to fill VM with - Data []any // produced data, will be at the bottom of VM stack - Entry int // offset in Code to start execution from (skip function defintions) - - symbols map[string]*symbol -} - -func NewCompiler() *Compiler { return &Compiler{symbols: map[string]*symbol{}, Entry: -1} } - -type 
nodedata struct { - ipstart, ipend, fsp int // CFG and symbol node annotations -} - -type extNode struct { - *Compiler - *parser.Node - anc *parser.Node // node ancestor - rank int // node rank in ancestor's children -} - -func (c *Compiler) CodeGen(node *parser.Node) (err error) { - notes := map[*parser.Node]*nodedata{} // AST node annotations for CFG, symbols, ... - scope := "" - frameNode := []*parser.Node{node} - fnote := notes[node] - - node.Walk2(nil, 0, func(n, a *parser.Node, k int) (ok bool) { - // Node pre-order processing callback. - notes[n] = &nodedata{} - nd := notes[n] - - switch n.Kind { - case parser.DeclFunc: - fname := n.Child[0].Content() - c.addSym(len(c.Code), scope+fname, false, n) - scope = pushScope(scope, fname) - frameNode = append(frameNode, n) - fnote = notes[n] - for j, child := range n.Child[1].Child { - vname := child.Content() - c.addSym(-j-2, scope+vname, true, child) - fnote.fsp++ - } - - case parser.BlockStmt: - nd.ipstart = len(c.Code) - if a != nil && a.Kind == parser.StmtIf && k == 1 { - c.Emit(n, vm1.JumpFalse, 0) // location to be updated in post IfStmt - } - } - return true - }, func(n, a *parser.Node, k int) (ok bool) { - // Node post-order processing callback. - nd := notes[n] - x := extNode{c, n, a, k} - - switch n.Kind { - case parser.OpAdd: - c.Emit(n, vm1.Add) - - case parser.ExprCall: - err = postCallExpr(x) - - case parser.OpDefine: - // Define operation, global vars only. TODO: on local frame too - l := c.addSym(nil, n.Child[0].Content(), false, n) - c.Emit(n, vm1.Assign, int64(l)) - - case parser.DeclFunc: - fun := frameNode[len(frameNode)-1] - if len(fun.Child) == 3 { // no return values - if c.Code[len(c.Code)-1][1] != vm1.Return { - c.Emit(n, vm1.Return, 0, int64(len(fun.Child[1].Child))) - } - } - scope = popScope(scope) - fnote = notes[fun] - - case parser.Ident: - ident := n.Content() - if len(n.Child) > 0 || a.Kind == parser.DeclFunc { - break - } - if s, _, ok := c.getSym(ident, scope); ok { - if s.local { - c.Emit(n, vm1.Fdup, int64(s.index)) - } else if a != nil && a.Kind == parser.OpAssign { - c.Emit(n, vm1.Push, int64(s.index)) - } else if _, ok := c.Data[s.index].(int); !ok { - c.Emit(n, vm1.Dup, int64(s.index)) - } - } - - case parser.StmtIf: - ifBodyStart := notes[n.Child[1]].ipstart - ifBodyEnd := notes[n.Child[1]].ipend - c.Code[ifBodyStart][2] = int64(ifBodyEnd - ifBodyStart) - // TODO: handle 'else' - - case parser.LiteralNumber: - // A literal number can be a float or an integer, or a big number - switch v := n.Value().(type) { - case int64: - c.Emit(n, vm1.Push, v) - case error: - err = v - default: - err = fmt.Errorf("type not supported: %T\n", v) - } - - case parser.StmtReturn: - fun := frameNode[len(frameNode)-1] - nret := 0 - if len(fun.Child) > 3 { - if ret := fun.Child[2]; ret.Kind == parser.BlockParen { - nret = len(ret.Child) - } else { - nret = 1 - } - } - c.Emit(n, vm1.Return, int64(nret), int64(len(fun.Child[1].Child))) - - case parser.BlockStmt: - nd.ipend = len(c.Code) - - case parser.LiteralString: - p := len(c.Data) - c.Data = append(c.Data, n.Block()) - c.Emit(n, vm1.Dup, int64(p)) - - case parser.OpInferior: - c.Emit(n, vm1.Lower) - - case parser.OpSubtract: - c.Emit(n, vm1.Sub) - } - - if err != nil { - return false - } - - // TODO: Fix this temporary hack to compute an entry point - if c.Entry < 0 && len(scope) == 0 && n.Kind != parser.DeclFunc { - c.Entry = len(c.Code) - 1 - if c.Entry >= 0 && len(c.Code) > c.Entry && c.Code[c.Entry][1] == vm1.Return { - c.Entry++ - } - } - return true - }) - - if 
s, _, ok := c.getSym("main", ""); ok { - if i, ok := c.codeIndex(s); ok { - // Internal call is always relative to instruction pointer. - c.Emit(nil, vm1.Call, int64(i-len(c.Code))) - c.Entry = len(c.Code) - 1 - } - log.Println(vm1.Disassemble(c.Code)) - } - - return -} - -func (c *Compiler) AddSym(v any, name string) int { return c.addSym(v, name, false, nil) } - -func (c *Compiler) addSym(v any, name string, local bool, n *parser.Node) int { - l := len(c.Data) - if local { - l = v.(int) - } else { - c.Data = append(c.Data, v) - } - c.symbols[name] = &symbol{index: l, local: local, node: n} - return l -} - -func (c *Compiler) Emit(n *parser.Node, op ...int64) int { - var pos int64 - if n != nil { - pos = int64(n.Pos()) - } - op = append([]int64{pos}, op...) - l := len(c.Code) - c.Code = append(c.Code, op) - return l -} - -func (c *Compiler) codeIndex(s *symbol) (i int, ok bool) { - i, ok = c.Data[s.index].(int) - return -} - -func pushScope(scope, name string) string { - return strings.TrimPrefix(scope+"/"+name+"/", "/") -} - -func popScope(scope string) string { - scope = strings.TrimSuffix(scope, "/") - j := strings.LastIndex(scope, "/") - if j == -1 { - return "" - } - return scope[:j] -} - -// getSym searches for an existing symbol starting from the deepest scope. -func (c *Compiler) getSym(name, scope string) (sym *symbol, sc string, ok bool) { - for { - if sym, ok = c.symbols[scope+name]; ok { - return sym, scope, ok - } - scope = strings.TrimSuffix(scope, "/") - i := strings.LastIndex(scope, "/") - if i == -1 { - scope = "" - break - } - if scope = scope[:i]; scope == "" { - break - } - } - sym, ok = c.symbols[name] - return sym, scope, ok -} diff --git a/codegen/compiler_test.go b/codegen/compiler_test.go deleted file mode 100644 index 5c7e9d3..0000000 --- a/codegen/compiler_test.go +++ /dev/null @@ -1,58 +0,0 @@ -package codegen - -import ( - "fmt" - "log" - "testing" - - "github.com/gnolang/parscan/lang/golang" - "github.com/gnolang/parscan/parser" - "github.com/gnolang/parscan/vm1" -) - -func TestCodeGen(t *testing.T) { - log.SetFlags(log.Lshortfile) - for _, test := range codeGenTests { - test := test - t.Run("", func(t *testing.T) { - c := NewCompiler() - c.AddSym(fmt.Println, "println") - n := &parser.Node{} - var err error - if n.Child, err = golang.GoParser.Parse(test.src, n); err != nil { - t.Error(err) - } - errStr := "" - if err = c.CodeGen(n); err != nil { - errStr = err.Error() - } - if errStr != test.err { - t.Errorf("got error %#v, want error %#v", errStr, test.err) - } - t.Log("data:", c.Data) - t.Log("code:", vm1.Disassemble(c.Code)) - if s := vm1.Disassemble(c.Code); s != test.asm { - t.Errorf("got %#v, want %#v", s, test.asm) - } - }) - } -} - -var codeGenTests = []struct { - src, asm, sym, err string -}{{ // #00 - src: "1+2", - asm: "Push 1\nPush 2\nAdd\n", -}, { // #01 - src: `println("Hello")`, - asm: "Dup 0\nDup 1\nCallX 1\nPop 2\n", -}, { // #02 - src: `a := 2; println(a)`, - asm: "Push 2\nAssign 1\nDup 0\nDup 1\nCallX 1\nPop 2\n", -}, { // #03 - src: `a := 2; if a < 3 {println(a)}; println("bye")`, - asm: "Push 2\nAssign 1\nDup 1\nPush 3\nLower\nJumpFalse 5\nDup 0\nDup 1\nCallX 1\nPop 2\nDup 0\nDup 2\nCallX 1\nPop 2\n", -}, { // #04 - src: "func add(a int, b int) int { return a + b }", - asm: "Fdup -2\nFdup -3\nAdd\nReturn 1 2\n", -}} diff --git a/codegen/expression.go b/codegen/expression.go deleted file mode 100644 index f3eb38a..0000000 --- a/codegen/expression.go +++ /dev/null @@ -1,41 +0,0 @@ -package codegen - -import ( - "fmt" - "reflect" - - 
"github.com/gnolang/parscan/parser" - "github.com/gnolang/parscan/vm1" -) - -func postCallExpr(x extNode) error { - switch x.Child[0].Kind { - case parser.Ident: - var numOut int - s, _, ok := x.getSym(x.Child[0].Content(), "") - if !ok { - return fmt.Errorf("invalid symbol %s", x.Child[0].Content()) - } - if i, ok := x.codeIndex(s); ok { - // Internal call is always relative to instruction pointer. - x.Emit(x.Node, vm1.Call, int64(i-len(x.Code))) - } else { - // External call, using absolute addr in symtable. - x.Emit(x.Node, vm1.CallX, int64(len(x.Child[1].Child))) - numOut = reflect.TypeOf(x.Data[s.index]).NumOut() - } - if !usedRet(x.anc) { - x.Emit(x.Node, vm1.Pop, int64(numOut)) - } - } - return nil -} - -func usedRet(n *parser.Node) bool { - switch n.Kind { - case parser.Undefined, parser.BlockStmt: - return false - default: - return true - } -} diff --git a/codegen/interpreter.go b/codegen/interpreter.go deleted file mode 100644 index 18cc5f8..0000000 --- a/codegen/interpreter.go +++ /dev/null @@ -1,53 +0,0 @@ -package codegen - -import ( - "os" - - "github.com/gnolang/parscan/parser" - "github.com/gnolang/parscan/vm1" -) - -const debug = true - -type Interpreter struct { - *parser.Parser - *Compiler - *vm1.Machine -} - -func NewInterpreter(p *parser.Parser) *Interpreter { - return &Interpreter{p, NewCompiler(), &vm1.Machine{}} -} - -func (i *Interpreter) Eval(src string) (res any, err error) { - n := &parser.Node{} - if n.Child, err = i.Parse(src, n); err != nil { - return res, err - } - if debug { - n.Dot(os.Getenv("DOT"), "") - } - codeOffset := len(i.Code) - dataOffset := 0 - if codeOffset > 0 { - // All data must be copied to the VM the first time only (re-entrance). - dataOffset = len(i.Data) - } - i.PopExit() // Remove last exit from previous run (re-entrance). - if err = i.CodeGen(n); err != nil { - return res, err - } - i.Push(i.Data[dataOffset:]...) - i.PushCode(i.Code[codeOffset:]...) 
- i.PushCode([]int64{0, vm1.Exit}) - i.SetIP(max(codeOffset, i.Entry)) - err = i.Run() - return i.Top(), err -} - -func max(a, b int) int { - if a > b { - return a - } - return b -} diff --git a/codegen/interpreter_test.go b/codegen/interpreter_test.go deleted file mode 100644 index 2563677..0000000 --- a/codegen/interpreter_test.go +++ /dev/null @@ -1,48 +0,0 @@ -package codegen - -import ( - "fmt" - "testing" - - "github.com/gnolang/parscan/lang/golang" -) - -func TestEval(t *testing.T) { - for _, test := range evalTests { - test := test - t.Run("", func(t *testing.T) { - interp := NewInterpreter(golang.GoParser) - errStr := "" - r, e := interp.Eval(test.src) - if e != nil { - errStr = e.Error() - } - if errStr != test.err { - t.Errorf("got error %#v, want error %#v", errStr, test.err) - } - res := fmt.Sprintf("%v", r) - if res != test.res { - t.Errorf("got %#v, want %#v", res, test.res) - } - }) - } -} - -var evalTests = []struct { - name, src, res, err string -}{{ // #00 - src: "1 + 2", - res: "3", -}, { // #01 - src: "a := 2; a = a + 3", - res: "5", -}, { // #02 - src: "func f(a int) int { return a + 1 }; f(5)", - res: "6", -}, { // #03 - src: "func f(a int) (b int) { b = a + 1; return b }; f(5)", - res: "6", -}, { // #04 - src: "func f(a int) (b int) { b = a + 1; return }; f(5)", - res: "6", -}} diff --git a/lang/golang/go.go b/lang/golang/go.go index 47ca6db..7f66594 100644 --- a/lang/golang/go.go +++ b/lang/golang/go.go @@ -1,37 +1,35 @@ package golang -import ( - "github.com/gnolang/parscan/parser" - "github.com/gnolang/parscan/scanner" -) +import "github.com/gnolang/parscan/lang" -var GoScanner = &scanner.Scanner{ - CharProp: [scanner.ASCIILen]uint{ - '\t': scanner.CharSep, - '\n': scanner.CharLineSep, - ' ': scanner.CharSep, - '!': scanner.CharOp, - '"': scanner.CharStr, - '%': scanner.CharOp, - '&': scanner.CharOp, - '\'': scanner.CharStr, - '(': scanner.CharBlock, - '*': scanner.CharOp, - '+': scanner.CharOp, - ',': scanner.CharGroupSep, - '-': scanner.CharOp, - '.': scanner.CharOp, - '/': scanner.CharOp, - ':': scanner.CharOp, - ';': scanner.CharGroupSep, - '<': scanner.CharOp, - '=': scanner.CharOp, - '>': scanner.CharOp, - '[': scanner.CharBlock, - '^': scanner.CharOp, - '{': scanner.CharBlock, - '|': scanner.CharOp, - '~': scanner.CharOp, +var GoSpec = &lang.Spec{ + CharProp: [lang.ASCIILen]uint{ + '\t': lang.CharSep, + '\n': lang.CharLineSep, + ' ': lang.CharSep, + '!': lang.CharOp, + '"': lang.CharStr, + '%': lang.CharOp, + '&': lang.CharOp, + '\'': lang.CharStr, + '(': lang.CharBlock, + '*': lang.CharOp, + '+': lang.CharOp, + ',': lang.CharGroupSep, + '-': lang.CharOp, + '`': lang.CharStr, + '.': lang.CharOp, + '/': lang.CharOp, + ':': lang.CharOp, + ';': lang.CharGroupSep, + '<': lang.CharOp, + '=': lang.CharOp, + '>': lang.CharOp, + '[': lang.CharBlock, + '^': lang.CharOp, + '{': lang.CharBlock, + '|': lang.CharOp, + '~': lang.CharOp, }, End: map[string]string{ "(": ")", @@ -44,61 +42,76 @@ var GoScanner = &scanner.Scanner{ "//": "\n", }, BlockProp: map[string]uint{ - "(": scanner.CharBlock, - "{": scanner.CharBlock, - "[": scanner.CharBlock, - `"`: scanner.CharStr | scanner.StrEsc | scanner.StrNonl, - "`": scanner.CharStr, - "'": scanner.CharStr | scanner.StrEsc, - "/*": scanner.CharStr, - "//": scanner.CharStr | scanner.ExcludeEnd | scanner.EosValidEnd, + "(": lang.CharBlock, + "{": lang.CharBlock, + "[": lang.CharBlock, + `"`: lang.CharStr | lang.StrEsc | lang.StrNonl, + "`": lang.CharStr, + "'": lang.CharStr | lang.StrEsc, + "/*": lang.CharStr, + "//": 
lang.CharStr | lang.ExcludeEnd | lang.EosValidEnd, }, - SkipSemi: map[string]bool{ - "++": true, - "--": true, - "case": true, - "chan": true, - "const": true, - "default": true, - "defer": true, - "else": true, - "for": true, - "func": true, - "go": true, - "goto": true, - "if": true, - "import": true, - "interface": true, - "map": true, - "package": true, - "range": true, - "select": true, - "struct": true, - "switch": true, - "type": true, - "var": true, - }, -} + TokenProps: map[string]lang.TokenProp{ + // Block tokens (can be nested) + "{..}": {TokenId: lang.BraceBlock}, + "[..]": {TokenId: lang.BracketBlock}, + "(..)": {TokenId: lang.ParenBlock}, + + // String tokens (not nested) + "//..": {TokenId: lang.Comment}, + "/*..": {TokenId: lang.Comment}, + `".."`: {TokenId: lang.String}, + "`..`": {TokenId: lang.String}, -var GoParser = &parser.Parser{ - Scanner: GoScanner, - Spec: map[string]parser.NodeSpec{ - ".": {Kind: parser.OpDot, Flags: parser.Call, Order: 3}, - "*": {Kind: parser.OpMultiply, Order: 4}, - "+": {Kind: parser.OpAdd, Order: 5}, - "-": {Kind: parser.OpSubtract, Order: 5}, - "<": {Kind: parser.OpInferior, Order: 6}, - ":=": {Kind: parser.OpDefine, Order: 7}, - "=": {Kind: parser.OpAssign, Order: 7}, - "if": {Kind: parser.StmtIf, Flags: parser.Stmt | parser.ExprSep}, - "func": {Kind: parser.DeclFunc, Flags: parser.Decl | parser.Call}, - "return": {Kind: parser.StmtReturn, Flags: parser.Stmt}, - "{..}": {Kind: parser.BlockStmt, Flags: parser.ExprSep}, - "(..)": {Kind: parser.BlockParen, Flags: parser.Call}, - `".."`: {Kind: parser.LiteralString}, - "//..": {Kind: parser.Comment}, - "/*..": {Kind: parser.Comment}, + // Separators + ",": {TokenId: lang.Comma}, + ";": {TokenId: lang.Semicolon}, + ".": {TokenId: lang.Period}, + ":": {TokenId: lang.Colon}, + + // Operators + "&": {TokenId: lang.And}, + "*": {TokenId: lang.Mul}, + "+": {TokenId: lang.Add}, + "-": {TokenId: lang.Sub}, + "=": {TokenId: lang.Assign}, + "<": {TokenId: lang.Less}, + ">": {TokenId: lang.Greater}, + "^": {TokenId: lang.Xor}, + "~": {TokenId: lang.Tilde}, + ":=": {TokenId: lang.Define}, + "==": {TokenId: lang.Equal}, + "<=": {TokenId: lang.LessEqual}, + ">=": {TokenId: lang.GreaterEqual}, + "->": {TokenId: lang.Arrow}, + "++": {TokenId: lang.Inc, SkipSemi: true}, + "--": {TokenId: lang.Dec, SkipSemi: true}, + + // Reserved keywords + "break": {TokenId: lang.Break}, + "case": {TokenId: lang.Case, SkipSemi: true}, + "chan": {TokenId: lang.Chan, SkipSemi: true}, + "const": {TokenId: lang.Const, SkipSemi: true}, + "continue": {TokenId: lang.Continue}, + "default": {TokenId: lang.Default, SkipSemi: true}, + "defer": {TokenId: lang.Defer, SkipSemi: true}, + "else": {TokenId: lang.Else, SkipSemi: true}, + "fallthrough": {TokenId: lang.Fallthrough}, + "for": {TokenId: lang.For, SkipSemi: true}, + "func": {TokenId: lang.Func, SkipSemi: true}, + "go": {TokenId: lang.Go, SkipSemi: true}, + "goto": {TokenId: lang.Goto, SkipSemi: true}, + "if": {TokenId: lang.If, SkipSemi: true}, + "import": {TokenId: lang.Import, SkipSemi: true}, + "interface": {TokenId: lang.Interface, SkipSemi: true}, + "map": {TokenId: lang.Map, SkipSemi: true}, + "package": {TokenId: lang.Package, SkipSemi: true}, + "range": {TokenId: lang.Range, SkipSemi: true}, + "return": {TokenId: lang.Return}, + "select": {TokenId: lang.Select, SkipSemi: true}, + "struct": {TokenId: lang.Struct, SkipSemi: true}, + "switch": {TokenId: lang.Switch, SkipSemi: true}, + "type": {TokenId: lang.Type, SkipSemi: true}, + "var": {TokenId: lang.Var, SkipSemi: true}, 
}, } - -func init() { GoParser.Init() } diff --git a/lang/spec.go b/lang/spec.go new file mode 100644 index 0000000..c5b50a4 --- /dev/null +++ b/lang/spec.go @@ -0,0 +1,43 @@ +package lang + +const ( + CharIllegal = 1 << iota + CharOp + CharNum + CharAlpha + CharSep + CharLineSep + CharGroupSep + CharStr + CharBlock + StrEsc + StrNonl + ExcludeEnd // exclude end delimiter from content + EosValidEnd // end of input string terminates block or string token +) + +const ASCIILen = 1 << 7 // 128 + +type TokenProp struct { + TokenId + SkipSemi bool // automatic semicolon insertion after newline +} + +type Spec struct { + CharProp [ASCIILen]uint // special Character properties + End map[string]string // end delimiters, indexed by start + BlockProp map[string]uint // block properties + TokenProps map[string]TokenProp // token properties + DotNum bool // true if a number can start with '.' + IdAscii bool // true if an identifier can be in ASCII only + Num_ bool // true if a number can contain _ character +} + +// HasInit stores if a statement may contain a simple init statement +var HasInit = map[TokenId]bool{ + Case: true, + For: true, + If: true, + Select: true, + Switch: true, +} diff --git a/lang/token.go b/lang/token.go new file mode 100644 index 0000000..3a980a4 --- /dev/null +++ b/lang/token.go @@ -0,0 +1,110 @@ +package lang + +type TokenId int + +const ( + Illegal = iota + Comment + Ident + Int + Float + Imag + Char + String + + // Operators + Add + Sub + Mul + Quo + Rem + And + Or + Xor + Shl // << + Shr // >> + AndNot // + + AddAssign + SubAssign + MulAssign + QuoAssign + RemAssign + AndAssign + OrAssign + XorAssign + ShlAssign + ShrAssign + AndNotAssign + + Land + Lor + Arrow + Inc + Dec + Equal + Less + Greater + Assign + Not + Plus // unitary + + Minus // unitary - + Address // unitary & + Deref // unitary * + NotEqual + LessEqual + GreaterEqual + Define + Ellipsis + Period + Tilde + + // Separators + Comma + Semicolon + Colon + + // Block tokens + ParenBlock // (..) + BracketBlock // [..] 
+ BraceBlock // {..} + + // Reserved keywords + Break + Case + Chan + Const + Continue + Default + Defer + Else + Fallthrough + For + Func + Go + Goto + If + Import + Interface + Map + Package + Range + Return + Select + Struct + Switch + Type + Var + + // Internal tokens (no corresponding keyword) + Call + CallX + Label + JumpFalse + Enter // entering in function context + Exit // exiting from function context +) + +func (t TokenId) IsKeyword() bool { return t >= Break && t <= Var } +func (t TokenId) IsOperator() bool { return t >= Add && t <= Tilde } +func (t TokenId) IsBlock() bool { return t >= ParenBlock && t <= BraceBlock } diff --git a/main.go b/main.go index 4ae57da..6994038 100644 --- a/main.go +++ b/main.go @@ -9,10 +9,9 @@ import ( "log" "os" - "github.com/gnolang/parscan/codegen" "github.com/gnolang/parscan/lang/golang" + "github.com/gnolang/parscan/parser" "github.com/gnolang/parscan/scanner" - "github.com/gnolang/parscan/vm0" ) type Interpreter interface { @@ -61,32 +60,20 @@ func repl(interp Interpreter, in io.Reader) (err error) { } func run(arg []string) (err error) { - var i int - rflag := flag.NewFlagSet("run", flag.ContinueOnError) - rflag.IntVar(&i, "i", 1, "set interpreter version for execution, possible values: 0, 1") rflag.Usage = func() { fmt.Println("Usage: parscan run [options] [path] [args]") - fmt.Println("Options:") - rflag.PrintDefaults() + //fmt.Println("Options:") + //rflag.PrintDefaults() } if err = rflag.Parse(arg); err != nil { return err } args := rflag.Args() - var interp Interpreter - switch i { - case 0: - interp = vm0.New(golang.GoParser) - case 1: - interp = codegen.NewInterpreter(golang.GoParser) - interp.(*codegen.Interpreter).AddSym(fmt.Println, "println") - default: - return fmt.Errorf("invalid interpreter version: %v", i) - } + interp := parser.NewInterpreter(scanner.NewScanner(golang.GoSpec)) + interp.AddSym("println", fmt.Println) - log.Println("args:", args) in := os.Stdin if len(args) > 0 { if in, err = os.Open(arg[0]); err != nil { diff --git a/parser/README.md b/parser/README.md index b8bc0d4..d46608c 100644 --- a/parser/README.md +++ b/parser/README.md @@ -1,71 +1,112 @@ # Parser -A parser takes an array of tokens (produced by the scanner) in input and -returns a node representing a syntax tree. A node is an object -containing a kind, the corresponding token and the ordered references to -descendent nodes. +See if we can build an almost single pass compiler for Go doing syntax +directed translation, without any complex data structure (no syntax +tree), only lists of tokens. -```mermaid -graph LR -s[ ] --> |source| a(scanner) ---> |tokens| b(parser) ---> |AST| c[ ] -subgraph parser - b -end -style s height:0px; -style c height:0px; -``` +The goal is to have the shortest and simplest path from source to +bytecode. -A goal is to make the parser generic enough so it can generate syntax -trees for most of existing programming languages (no claim of generality -yet), provided a small set of generating rules per language, and a small -set of validating rules (yet to be defined) to detect invalid -constructs. +## Design -The input tokens are particular in the sense that they include classical -lexical items such as words, separators, numbers, but also strings and -nested blocks, which are resolved at scanning stage rather than parsing -stage. See the scanner for more details. +The input of parser is a list of tokens produced by the scanner. +Multiple tokens are processed at once. 
The minimal set to get +meaningful results (not an error or nil) is a complete statemement. -The language specification includes the following: +The output of parser is also a list of tokens, to be consummed by +the compiler to produce bytecode. The output tokens set is identical +to the bytecode instructions set except that: -- a scanner specification, to produce the set of possible tokens. -- a map of node specification per token name. The node specification - defines some parameters influing how the tree is generated. +- code locations may be provided as as labels instead of numerical + values, +- memory locations for constants and variables may be provided as + symbol names instead of numerical values. -## Development status +## Status -A successful test must be provided to check the status. +Go language support: -- [x] binary operator expressions -- [x] unary operator (prefix) expressions -- [ ] unary operator (suffix) expressions -- [x] operator precedence rules -- [x] parenthesis in expressions -- [x] call expressions -- [ ] nested calls -- [x] index expressions -- [x] single assignments -- [ ] multi assignments -- [x] simple `if` statement (no `else`) -- [ ] full `if` statement (including `else`, `else if`) -- [x] init expressions in `if` statements -- [x] statement blocks -- [x] comments +- [x] named functions +- [ ] anonymous functions (closures) +- [ ] methods +- [x] internal function calls +- [x] external function calls (calling runtime symbols in interpreter) +- [ ] export to runtime +- [ ] builtin calls (new, make, copy, delete, len, cap, ...) +- [ ] out of order declarations +- [ ] arbirtrary precision constants +- [x] basic types +- [ ] complete numeric types +- [x] function types +- [ ] variadic functions +- [ ] pointers +- [ ] structures +- [ ] embedded structures +- [ ] recursive structures +- [ ] interfaces +- [ ] arrays, slices +- [ ] maps +- [ ] deterministic maps +- [ ] channel types +- [ ] channel operations +- [x] var defined by assign := +- [x] var assign = +- [ ] var declaration +- [ ] type declaration +- [x] func declaration +- [ ] const declaration +- [ ] iota expression +- [ ] defer statement +- [ ] recover statement +- [ ] go statement +- [x] if statement (including else and else if) - [ ] for statement - [ ] switch statement +- [ ] break statement +- [ ] continue statement +- [ ] fallthrough statement +- [ ] goto statement - [ ] select statement -- [x] return statement -- [x] function declaration -- [ ] method declaration -- [ ] anonymous functions (closures) -- [ ] type declaration -- [ ] var, const, type single declaration -- [ ] var, const, type multi declaration -- [ ] type parametric expressions -- [x] literal numbers (see scanner) -- [x] literal strings -- [ ] composite literal -- [ ] import statements -- [ ] go.mod syntax +- [x] binary operators +- [ ] unary operators +- [ ] logical operators && and || +- [ ] assign operators +- [ ] operator precedence +- [x] parenthesis expressions +- [x] call expressions +- [ ] index expressions +- [ ] selector expressions +- [ ] type conversions +- [ ] type assertions +- [ ] parametric types (generic) +- [ ] type parametric functions (generic) +- [ ] type constraints (generic) +- [ ] type checking +- [ ] comment pragmas +- [ ] packages import +- [ ] modules + +Other items: + +- [x] REPL +- [x] multiline statements in REPL +- [ ] completion, history in REPL +- [x] eval strings +- [x] eval files (including stdin, ...) 
+- [x] debug traces for scanner, parser, compiler, bytecode vm +- [x] simple interpreter tests to exercise from source to execution +- [ ] compile time error detection and diagnosis +- [ ] stack dump +- [ ] symbol tables, data tables, code binded to source lines +- [ ] interactive debugger: breaks, continue, instrospection, ... +- [ ] machine level debugger +- [ ] source level debugger +- [ ] replay debugger, backward instruction execution +- [ ] vm monitor: live memory / code display during run +- [ ] stdlib wrappers a la yaegi +- [ ] system and environment sandboxing +- [ ] build constraints (arch, sys, etc) +- [ ] test command (running go test / benchmark / example files) +- [ ] skipping / including test files +- [ ] test coverage +- [ ] fuzzy tests for scanner, vm, ... diff --git a/parser/compiler.go b/parser/compiler.go new file mode 100644 index 0000000..5b0ed81 --- /dev/null +++ b/parser/compiler.go @@ -0,0 +1,250 @@ +package parser + +import ( + "fmt" + "log" + "reflect" + "strconv" + + "github.com/gnolang/parscan/lang" + "github.com/gnolang/parscan/scanner" + "github.com/gnolang/parscan/vm" +) + +type Compiler struct { + *Parser + Code [][]int64 // produced code, to fill VM with + Data []any // produced data, will be at the bottom of VM stack + Entry int // offset in Code to start execution from (skip function defintions) + + strings map[string]int // locations of strings in Data +} + +func NewCompiler(scanner *scanner.Scanner) *Compiler { + return &Compiler{ + Parser: &Parser{Scanner: scanner, symbols: initUniverse(), labelCount: map[string]int{}}, + Entry: -1, + strings: map[string]int{}, + } +} + +func (c *Compiler) Emit(op ...int64) int { + op = append([]int64{}, op...) + l := len(c.Code) + c.Code = append(c.Code, op) + return l +} + +func (c *Compiler) AddSym(name string, value any) int { + p := len(c.Data) + c.Data = append(c.Data, value) + c.Parser.AddSym(p, name, value) + return p +} + +func (c *Compiler) Codegen(tokens Tokens) (err error) { + fixList := Tokens{} + function := []string{""} + + log.Println("Codegen tokens:", tokens) + + for i, t := range tokens { + scope := function[len(function)-1] + switch t.Id { + case lang.Int: + n, err := strconv.Atoi(t.Str) + if err != nil { + return err + } + c.Emit(int64(t.Pos), vm.Push, int64(n)) + + case lang.String: + s := t.Block() + i, ok := c.strings[s] + if !ok { + i = len(c.Data) + c.Data = append(c.Data, s) + c.strings[s] = i + } + c.Emit(int64(t.Pos), vm.Dup, int64(i)) + + case lang.Add: + c.Emit(int64(t.Pos), vm.Add) + + case lang.Sub: + c.Emit(int64(t.Pos), vm.Sub) + + case lang.Less: + c.Emit(int64(t.Pos), vm.Lower) + + case lang.Call: + c.Emit(int64(t.Pos), vm.Call) + + case lang.CallX: + c.Emit(int64(t.Pos), vm.CallX, int64(t.Beg)) + + case lang.Define: + // TODO: support assignment to local, composite objects + st := tokens[i-1] + l := len(c.Data) + c.Data = append(c.Data, nil) + c.addSym(l, st.Str, nil, symVar, nil, false) + c.Emit(int64(st.Pos), vm.Assign, int64(l)) + + case lang.Enter: + // TODO: merge with label ? 
+ function = append(function, t.Str[6:]) + + case lang.Exit: + function = function[:len(function)-1] + + case lang.Assign: + st := tokens[i-1] + s, _, ok := c.getSym(st.Str, scope) + if !ok { + return fmt.Errorf("symbol not found: %s", st.Str) + } + if s.local { + c.Emit(int64(st.Pos), vm.Fassign, int64(s.index)) + } else { + c.Emit(int64(st.Pos), vm.Assign, int64(s.index)) + } + + case lang.Equal: + c.Emit(int64(t.Pos), vm.Equal) + + case lang.Ident: + if i < len(tokens)-1 { + switch t1 := tokens[i+1]; t1.Id { + case lang.Define, lang.Assign, lang.Colon: + continue + } + } + s, _, ok := c.getSym(t.Str, scope) + if !ok { + return fmt.Errorf("symbol not found: %s", t.Str) + } + if s.local { + c.Emit(int64(t.Pos), vm.Fdup, int64(s.index)) + } else { + c.Emit(int64(t.Pos), vm.Dup, int64(s.index)) + } + + case lang.Label: + // If the label is a function, the symbol already exists + s, _, ok := c.getSym(t.Str, scope) + lc := len(c.Code) + if ok { + s.value = lc + if s.kind == symFunc { + // label is a function entry point, register its code address in data. + s.index = len(c.Data) + c.Data = append(c.Data, lc) + } else { + c.Data[s.index] = lc + } + } else { + ld := len(c.Data) + c.Data = append(c.Data, lc) + c.addSym(ld, t.Str, lc, symLabel, nil, false) + } + + case lang.JumpFalse: + label := t.Str[10:] + i := 0 + if s, _, ok := c.getSym(label, scope); !ok { + // t.Beg contains the position in code which needs to be fixed. + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = s.value.(int) + } + c.Emit(int64(t.Pos), vm.JumpFalse, int64(i)) + + case lang.Goto: + label := t.Str[5:] + i := 0 + if s, _, ok := c.getSym(label, scope); !ok { + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = s.value.(int) + } + c.Emit(int64(t.Pos), vm.Jump, int64(i)) + + case lang.Return: + c.Emit(int64(t.Pos), vm.Return, int64(t.Beg), int64(t.End)) + + default: + return fmt.Errorf("Codegen: unsupported token %v", t) + } + } + + // Finally we fix unresolved labels for jump destinations. 
+ for _, t := range fixList { + var label string + switch t.Id { + case lang.Goto: + label = t.Str[5:] + case lang.JumpFalse: + label = t.Str[10:] + } + s, _, ok := c.getSym(label, "") + if !ok { + return fmt.Errorf("label not found: %v", label) + } + c.Code[t.Beg][2] = int64(s.value.(int) - t.Beg) + + } + return err +} + +func (c *Compiler) PrintCode() { + labels := map[int]string{} + for name, sym := range c.symbols { + if sym.kind == symLabel || sym.kind == symFunc { + labels[sym.value.(int)] = name + } + } + fmt.Println("# Code:") + for i, l := range c.Code { + if label, ok := labels[i]; ok { + fmt.Println(label + ":") + } + extra := "" + switch l[1] { + case vm.Jump, vm.JumpFalse, vm.JumpTrue, vm.Calli: + if d, ok := labels[i+(int)(l[2])]; ok { + extra = "// " + d + } + } + fmt.Printf("%4d %-14v %v\n", i, vm.CodeString(l), extra) + } +} + +type entry struct { + name string + *symbol +} + +func (c *Compiler) PrintData() { + dict := map[int]entry{} + for name, sym := range c.symbols { + if !sym.used || sym.local || sym.kind == symLabel { + continue + } + dict[sym.index] = entry{name, sym} + } + fmt.Println("# Data:") + for i, d := range c.Data { + fmt.Printf("%4d %T %v %v\n", i, d, d, dict[i]) + } +} + +func (c *Compiler) NumIn(i int) (int, bool) { + t := reflect.TypeOf(c.Data[i]) + if t.Kind() == reflect.Func { + return t.NumIn(), t.IsVariadic() + } + return -1, false +} diff --git a/parser/dot.go b/parser/dot.go deleted file mode 100644 index 11d5014..0000000 --- a/parser/dot.go +++ /dev/null @@ -1,89 +0,0 @@ -package parser - -import ( - "bytes" - "fmt" - "io" - "log" - "os/exec" - "strings" -) - -func (*Parser) Adot(nodes []*Node, c string) { - if c == "" { - return - } - n := &Node{Child: nodes} - n.Dot(c, "") -} - -func (n *Node) Dot(c, s string) { - dw, cmd := dotWriter(c) - n.astDot(dw, s) - if cmd == nil { - return - } - if err := cmd.Wait(); err != nil { - log.Fatal(err) - } -} - -func (n *Node) Sdot(s string) string { - var buf bytes.Buffer - n.astDot(&buf, s) - return buf.String() -} - -// TODO: rewrite it using Walk2 -func (n *Node) astDot(out io.Writer, label string) { - fmt.Fprintf(out, "digraph ast { ") - if label != "" { - fmt.Fprintf(out, "labelloc=\"t\"; label=\"%s\";", label) - } - anc := map[*Node]*Node{} - index := map[*Node]int{} - count := 0 - n.Walk(func(nod *Node) bool { - index[nod] = count - count++ - - for _, c := range nod.Child { - anc[c] = nod - } - name := "" - if nod.Token != nil { - name = strings.ReplaceAll(nod.Name(), `"`, `\"`) - } - fmt.Fprintf(out, "%d [label=\"%s\"]; ", index[nod], name) - if anc[nod] != nil { - fmt.Fprintf(out, "%d -> %d; ", index[anc[nod]], index[nod]) - } - return true - }, nil) - fmt.Fprintf(out, "}") - if c, ok := out.(io.Closer); ok { - c.Close() - } -} - -type nopCloser struct { - io.Writer -} - -func (nopCloser) Close() error { return nil } - -func dotWriter(dotCmd string) (io.WriteCloser, *exec.Cmd) { - if dotCmd == "" { - return nopCloser{io.Discard}, nil - } - fields := strings.Fields(dotCmd) - cmd := exec.Command(fields[0], fields[1:]...) 
- dotin, err := cmd.StdinPipe() - if err != nil { - log.Fatal(err) - } - if err = cmd.Start(); err != nil { - log.Fatal(err) - } - return dotin, cmd -} diff --git a/parser/interpreter.go b/parser/interpreter.go new file mode 100644 index 0000000..72fc667 --- /dev/null +++ b/parser/interpreter.go @@ -0,0 +1,52 @@ +package parser + +import ( + "github.com/gnolang/parscan/scanner" + "github.com/gnolang/parscan/vm" +) + +const debug = true + +type Interpreter struct { + *Compiler + *vm.Machine +} + +func NewInterpreter(s *scanner.Scanner) *Interpreter { + return &Interpreter{NewCompiler(s), &vm.Machine{}} +} + +func (i *Interpreter) Eval(src string) (res any, err error) { + codeOffset := len(i.Code) + dataOffset := 0 + if codeOffset > 0 { + // All data must be copied to the VM the first time only (re-entrance). + dataOffset = len(i.Data) + } + i.PopExit() // Remove last exit from previous run (re-entrance). + + t, err := i.Parse(src) + if err != nil { + return res, err + } + if err = i.Codegen(t); err != nil { + return res, err + } + i.Push(i.Data[dataOffset:]...) + i.PushCode(i.Code[codeOffset:]...) + i.PushCode([]int64{0, vm.Exit}) + i.SetIP(max(codeOffset, i.Entry)) + if debug { + i.PrintData() + i.PrintCode() + } + err = i.Run() + return i.Top(), err +} + +func max(a, b int) int { + if a > b { + return a + } + return b +} diff --git a/parser/interpreter_test.go b/parser/interpreter_test.go new file mode 100644 index 0000000..ee927a5 --- /dev/null +++ b/parser/interpreter_test.go @@ -0,0 +1,66 @@ +package parser_test + +import ( + "fmt" + "log" + "testing" + + "github.com/gnolang/parscan/lang/golang" + "github.com/gnolang/parscan/parser" + "github.com/gnolang/parscan/scanner" +) + +var GoScanner *scanner.Scanner + +func init() { + log.SetFlags(log.Lshortfile) + GoScanner = scanner.NewScanner(golang.GoSpec) +} + +func TestEval(t *testing.T) { + for _, test := range goTests { + test := test + t.Run("", func(t *testing.T) { + interp := parser.NewInterpreter(GoScanner) + errStr := "" + r, e := interp.Eval(test.src) + t.Log(r, e) + if e != nil { + errStr = e.Error() + } + if errStr != test.err { + t.Errorf("got error %#v, want error %#v", errStr, test.err) + } + if res := fmt.Sprintf("%v", r); test.err == "" && res != test.res { + t.Errorf("got %#v, want %#v", res, test.res) + } + }) + } +} + +var goTests = []struct { + src, res, err string +}{ + 0: {src: "", res: ""}, + 1: {src: "1+2", res: "3"}, + 2: {src: "1+", err: "block not terminated"}, + 3: {src: "a := 1 + 2; b := 0; a + 1", res: "4"}, + 4: {src: "1+(2+3)", res: "6"}, + 5: {src: "(1+2)+3", res: "6"}, + 6: {src: "(6+(1+2)+3)+5", res: "17"}, + 7: {src: "(6+(1+2+3)+5", err: "1:1: block not terminated"}, + 8: {src: "a := 2; a = 3; a", res: "3"}, + 9: {src: "a := 0; if a == 0 { a = 2 } else { a = 1 }; a", res: "2"}, + 10: {src: "a := 0; if a == 1 { a = 2 } else { a = 1 }; a", res: "1"}, + 11: {src: "a := 0; if a == 1 { a = 2 } else if a == 0 { a = 3 } else { a = 1 }; a", res: "3"}, + 12: {src: "a := 0; if a == 1 { a = 2 } else if a == 2 { a = 3 } else { a = 1 }; a", res: "1"}, + 13: {src: "func f() int {return 2}; a := f(); a", res: "2"}, + 14: {src: "func f() int {return 2}; f()", res: "2"}, + 15: {src: "func f(a int) int {return a+2}; f(3)", res: "5"}, + 16: {src: "func f(a int) int {if a < 4 {a = 5}; return a }; f(3)", res: "5"}, + 17: {src: "func f(a int) int {return a+2}; 7 - f(3)", res: "2"}, + 18: {src: "func f(a int) int {return a+2}; f(5) - f(3)", res: "2"}, + 19: {src: "func f(a int) int {return a+2}; f(3) - 2", res: "3"}, + 20: 
{src: "func f(a int, b int, c int) int {return a+b-c} ; f(7, 1, 3)", res: "5"}, + 21: {src: "func f(a, b, c int) int {return a+b-c} ; f(7, 1, 3)", res: "5"}, +} diff --git a/parser/kind.go b/parser/kind.go deleted file mode 100644 index d004471..0000000 --- a/parser/kind.go +++ /dev/null @@ -1,56 +0,0 @@ -package parser - -import "fmt" - -// kind defines the AST node kind. Its name is the concatenation -// of a category (Block, Decl, Expr, Op, Stmt) and an instance name. -type Kind int - -const ( - Undefined = Kind(iota) - BlockParen - BlockStmt - Comment - DeclFunc - ExprCall - Ident - LiteralNumber - LiteralString - OpAdd - OpAssign - OpDefine - OpDot - OpInferior - OpMultiply - OpSubtract - StmtIf - StmtReturn -) - -var kindString = [...]string{ - Undefined: "Undefined", - BlockParen: "BlockParen", - BlockStmt: "BlockStmt", - Comment: "Comment", - DeclFunc: "DeclFunc", - ExprCall: "ExprCall", - Ident: "Ident", - LiteralString: "LiteralString", - LiteralNumber: "LiteralNumber", - OpAdd: "OpAdd", - OpAssign: "OpAssign", - OpDefine: "OpDefine", - OpDot: "OpDot", - OpInferior: "OpInferior", - OpMultiply: "OpMultiply", - OpSubtract: "OpSubtract", - StmtIf: "StmtIf", - StmtReturn: "StmtReturn", -} - -func (k Kind) String() string { - if int(k) < 0 || int(k) > len(kindString) { - return fmt.Sprintf("unknown kind %d", k) - } - return kindString[k] -} diff --git a/parser/node.go b/parser/node.go deleted file mode 100644 index b6e34cd..0000000 --- a/parser/node.go +++ /dev/null @@ -1,50 +0,0 @@ -package parser - -import "github.com/gnolang/parscan/scanner" - -type Node struct { - Child []*Node // sub-tree nodes - *scanner.Token // token at origin of the node - Kind // Node kind, depends on the language spec -} - -// TODO: remove it in favor of Walk2 -func (n *Node) Walk(in, out func(*Node) bool) (stop bool) { - if in != nil && !in(n) { - return true - } - for _, child := range n.Child { - if child.Walk(in, out) { - return - } - } - if out != nil { - stop = !out(n) - } - return -} - -// Idem to walk, but also propagates the ancestor of visited node and child index. -func (n *Node) Walk2(a *Node, i int, in, out func(*Node, *Node, int) bool) (stop bool) { - if in != nil && !in(n, a, i) { - return true - } - for j, child := range n.Child { - if child.Walk2(n, j, in, out) { - return - } - } - if out != nil { - stop = !out(n, a, i) - } - return -} - -func (n *Node) RemoveChild(i int) { - n.Child = append(n.Child[:i], n.Child[i+1:]...) -} - -func (n *Node) InsertChild(node *Node, i int) { - n.Child = append(n.Child[:i+1], n.Child[i:]...) 
- n.Child[i] = node -} diff --git a/parser/parse.go b/parser/parse.go index ca89467..5f0b643 100644 --- a/parser/parse.go +++ b/parser/parse.go @@ -1,244 +1,373 @@ package parser import ( + "fmt" + "log" + "strconv" + "strings" + + "github.com/gnolang/parscan/lang" "github.com/gnolang/parscan/scanner" ) -const ( - Stmt = 1 << iota - ExprSep - Call - Index - Decl - MultiOp -) +type Parser struct { + *scanner.Scanner -type NodeSpec struct { - Kind // AST node kind - Flags uint // composable properties used for AST generation - Order int // operator precedence order + symbols map[string]*symbol + function *symbol + scope string + fname string + labelCount map[string]int } -type Parser struct { - *scanner.Scanner - Spec map[string]NodeSpec +func (p *Parser) Scan(s string, endSemi bool) (Tokens, error) { + return p.Scanner.Scan(s, endSemi) } -func (p *Parser) Parse(src string, ctx *Node) (nodes []*Node, err error) { - tokens, err := p.Scan(src) - if err != nil { - return +type Tokens []scanner.Token + +func (toks Tokens) String() (s string) { + for _, t := range toks { + s += fmt.Sprintf("%#v ", t.Str) } - return p.ParseTokens(tokens, ctx) + return s } -func (p *Parser) ParseTokens(tokens []*scanner.Token, ctx *Node) (nodes []*Node, err error) { - // TODO: error handling. - var root *Node // current root node - var expr *Node // current expression root node - var prev, c *Node // previous and current nodes - var lce *Node // last complete expression node - unaryOp := map[*Node]bool{} // unaryOp indicates if a node is an unary operator. - prevToken := map[*Node]*scanner.Token{} - - for i, t := range tokens { - prev = c - c = &Node{ - Token: t, - Kind: p.Spec[t.Name()].Kind, +func (toks Tokens) Index(id lang.TokenId) int { + for i, t := range toks { + if t.Id == id { + return i } - if i > 0 { - prevToken[c] = tokens[i-1] + } + return -1 +} + +func (toks Tokens) LastIndex(id lang.TokenId) int { + for i := len(toks) - 1; i >= 0; i-- { + if toks[i].Id == id { + return i } - if c.Kind == Comment { - continue + } + return -1 +} + +func (toks Tokens) Split(id lang.TokenId) (result []Tokens) { + for { + i := toks.Index(id) + if i < 0 { + return append(result, toks) } - if t.IsOperator() && (i == 0 || tokens[i-1].IsOperator()) { - unaryOp[c] = true + result = append(result, toks[:i]) + toks = toks[i+1:] + } +} + +func (p *Parser) Parse(src string) (out Tokens, err error) { + log.Printf("Parse src: %#v\n", src) + in, err := p.Scan(src, true) + if err != nil { + return out, err + } + log.Println("Parse in:", in) + for len(in) > 0 { + endstmt := in.Index(lang.Semicolon) + if endstmt == -1 { + return out, scanner.ErrBlock } - if c.Kind == Undefined { - switch t.Kind() { - case scanner.Number: - c.Kind = LiteralNumber - case scanner.Identifier: - c.Kind = Ident + + // Skip over simple init statements for some tokens (if, for, ...) + if lang.HasInit[in[0].Id] { + log.Println("may have init") + for in[endstmt-1].Id != lang.BraceBlock { + e2 := in[endstmt+1:].Index(lang.Semicolon) + if e2 == -1 { + return out, scanner.ErrBlock + } + endstmt += 1 + e2 } } + o, err := p.ParseStmt(in[:endstmt]) + if err != nil { + return out, err + } + out = append(out, o...) 
+ in = in[endstmt+1:] + } + return out, err +} - if root == nil { - if p.isSep(c) { - continue - } - lce = nil - root = c - if p.isExpr(c) { - expr = c - } +func (p *Parser) ParseStmt(in Tokens) (out Tokens, err error) { + log.Println("ParseStmt in:", in) + if len(in) == 0 { + return nil, nil + } + switch t := in[0]; t.Id { + case lang.Func: + return p.ParseFunc(in) + case lang.If: + return p.ParseIf(in) + case lang.Return: + return p.ParseReturn(in) + default: + return p.ParseExpr(in) + } +} + +func (p *Parser) ParseFunc(in Tokens) (out Tokens, err error) { + // TODO: handle anonymous functions (no function name) + // TODO: handle receiver (methods) + // TODO: handle parametric types (generics) + // TODO: handle variadic parameters + fname := in[1].Str + ofname := p.fname + p.fname = fname + ofunc := p.function + s, _, ok := p.getSym(fname, p.scope) + if !ok { + s = &symbol{} + p.symbols[p.scope+fname] = s + } + p.pushScope(fname) + defer func() { + p.fname = ofname // TODO remove if favor of function. + p.function = ofunc + p.popScope() + }() + + out = Tokens{ + {Id: lang.Enter, Str: "enter " + p.scope}, + {Id: lang.Goto, Str: "goto " + fname + "_end"}, // Skip function definition. + {Id: lang.Label, Pos: in[0].Pos, Str: fname}, + } + + bi := in.Index(lang.BraceBlock) + if bi < 0 { + return out, fmt.Errorf("no function body") + } + typ, err := p.ParseType(in[:bi]) + if err != nil { + return out, err + } + s.kind = symFunc + s.Type = typ + p.function = s + + log.Println("body:", in[len(in)-1].Block()) + toks, err := p.Parse(in[len(in)-1].Block()) + if err != nil { + return out, err + } + out = append(out, toks...) + out = append(out, + scanner.Token{Id: lang.Label, Str: fname + "_end"}, + scanner.Token{Id: lang.Exit}, + ) + log.Println("symbols", p.symbols) + return out, err +} + +func (p *Parser) ParseIf(in Tokens) (out Tokens, err error) { + prefix := p.fname + "_if" + strconv.Itoa(p.labelCount[p.scope+p.fname]) + p.labelCount[p.scope+p.fname]++ + sc := 0 + ssc := strconv.Itoa(sc) + // We start from the end of the statement and examine tokens backward to + // get the destination labels already computed when jumps are set. + i := len(in) - 1 + for i > 0 { + if in[i].Id != lang.BraceBlock { + return nil, fmt.Errorf("expected '{', got %v", in[i]) + } + pre, err := p.Parse(in[i].Block()) + if err != nil { + return nil, err + } + //pre := append(Tokens{{Id: lang.Label, Str: prefix + "_b" + ssc}}, blockout...) + if sc > 0 { + pre = append(pre, scanner.Token{Id: lang.Goto, Str: "goto " + prefix + "_e0"}) + } + pre = append(pre, scanner.Token{Id: lang.Label, Str: prefix + "_e" + ssc}) + out = append(pre, out...) + + i-- + ifp := in[:i].LastIndex(lang.If) + + if in[i].Id == lang.Else { // Step over final 'else'. + i-- + sc++ + ssc = strconv.Itoa(sc) continue } - if t.IsBlock() { - if expr != nil { - if p.hasProp(c, ExprSep) && p.isExprSep(root) { - // A bracket block may end a previous expression. - root.Child = append(root.Child, expr) - expr = nil - } else if p.hasProp(c, Call) && !p.hasProp(root, Decl) && p.canCallToken(tokens[i-1]) { - // Handle (possibly nested) call expressions. - if lce == nil || lce != expr { // TODO(marc): not general, fix it. 
- lce = prev - } - lce.Child = []*Node{{Token: lce.Token, Child: lce.Child, Kind: lce.Kind}} - lce.Token = scanner.NewToken("Call", c.Pos()) - lce.Kind = ExprCall - } - } - tcont := t.Content() - s := tcont[t.Start() : len(tcont)-t.End()] - n2, err := p.Parse(s, c) - if err != nil { + pre = Tokens{} + var init, cond Tokens + initcond := in[ifp+1 : i+1] + if ii := initcond.Index(lang.Semicolon); ii < 0 { + cond = initcond + } else { + init = initcond[:ii] + cond = initcond[ii+1:] + } + if len(init) > 0 { + if init, err = p.ParseStmt(init); err != nil { return nil, err } - c.Child = append(c.Child, n2...) - } - - // Process the end of an expression or a statement. - if t.IsSeparator() { - if t.Content() == "," && ctx.Kind != BlockParen { - // ignore comma separator in field lists - } else if expr != nil && p.hasProp(root, Stmt) { - root.Child = append(root.Child, expr) - if p.hasProp(expr, ExprSep) { - nodes = append(nodes, root) - root = nil - } - expr = nil - } else { - if expr != nil { - root = expr - } - nodes = append(nodes, root) - expr = nil - root = nil - } - continue + pre = append(pre, init...) } + if cond, err = p.ParseExpr(cond); err != nil { + return nil, err + } + pre = append(pre, cond...) + pre = append(pre, scanner.Token{Id: lang.JumpFalse, Str: "JumpFalse " + prefix + "_e" + ssc}) + out = append(pre, out...) + i = ifp + if i > 1 && in[i].Id == lang.If && in[i-1].Id == lang.Else { // Step over 'else if'. + i -= 2 + } + sc++ + ssc = strconv.Itoa(sc) + } + log.Println("prefix:", prefix) + log.Println("if tokens:", out) + return out, err +} - // We assume from now that current node is part of an expression subtree. - if expr == nil { - if p.isStatement(root) { - expr = c - continue +func (p *Parser) ParseReturn(in Tokens) (out Tokens, err error) { + if len(in) > 1 { + if out, err = p.ParseExpr(in[1:]); err != nil { + return out, err + } + } + + // TODO: the function symbol should be already present in the parser context. + // otherwise no way to handle anonymous func. + s := p.function + in[0].Beg = s.Type.NumOut() + in[0].End = s.Type.NumIn() + log.Println("ParseReturn:", p.fname, in[0]) + out = append(out, in[0]) + return out, err +} + +func (p *Parser) ParseExpr(in Tokens) (out Tokens, err error) { + log.Println("ParseExpr in:", in) + var ops Tokens + var vl int + // + // Process tokens from last to first, the goal is to reorder the tokens in + // a stack machine processing order, so it can be directly interpreted. + // + for i := len(in) - 1; i >= 0; i-- { + t := in[i] + // temporary assumptions: binary operators, returning 1 value + switch t.Id { + case lang.Ident, lang.Int, lang.String: + out = append(out, t) + vl++ + case lang.Define, lang.Add, lang.Sub, lang.Assign, lang.Equal, lang.Less: + // TODO: handle operator precedence to swap operators / operands if necessary + if vl < 2 { + ops = append(ops, t) + break } - expr = root - } - - // Update the expression subtree according to binary operator precedence rules. - // - operators are binary infix by default. - // - if an operator follows another, then it's unary prefix. - // - if an expression starts by an operator, then it's unary prefix. - // - non operator nodes have a default precedence of 0. - // TODO: handle postfix unary (i.e. ++) and ternary (i.e. ?:) - // - ep := p.Spec[expr.Content()].Order - cp := p.Spec[c.Content()].Order - a := expr - if unaryOp[c] { - cp = 0 - } - if cp != 0 { - if cp > ep { - // Perform an operator permutation at expr root as required by precedence. 
- // TODO(marc): maybe it can be generalized in below else branch. - expr, c = c, expr - a = expr // Temporary ancestor: its children may have to be permuted. - } else { - // Findout if an operator permutation is necessary in subtree. - c1 := expr - for { - a = c1 - if unaryOp[c1] { - c1, c = c, c1 - a = c1 - if c == expr { - expr = a - } - break - } - if len(c1.Child) < 2 { - break - } - c1 = c1.Child[1] - if !c1.IsOperator() || unaryOp[c1] || cp > p.Spec[c1.Content()].Order { + case lang.ParenBlock: + // If the previous token is an arithmetic, logic or assign operator then + // this parenthesis block is an enclosed expr, otherwise a call expr. + if i == 0 || in[i-1].Id.IsOperator() { + out = append(out, t) + vl++ + break + } + // The call expression can be a function call, a conversion, + // a type assersion (including for type switch) + // func call: push args and func address then call + out = append(out, t) + vl++ + if t2 := in[i-1]; t2.Id == lang.Ident { + if s, sc, ok := p.getSym(t2.Str, p.scope); ok { + log.Println("callExpr:", t2.Str, p.scope, s, ok, sc) + if s.kind == symValue { + // Store the number of input parameters in the token Beg field. + ops = append(ops, scanner.Token{Str: "callX", Id: lang.CallX, Pos: t.Pos, Beg: p.numItems(t.Block(), lang.Comma)}) break } } - // No permutation occured. Append current to last visited ancestor. - if len(a.Child) > 1 { - a.Child = a.Child[:1] - c.Child = append(c.Child, c1) - } - } - } else if ep != 0 { - for len(a.Child) > 1 { - a = a.Child[1] } + ops = append(ops, scanner.Token{Str: "call", Id: lang.Call, Pos: t.Pos}) } - a.Child = append(a.Child, c) - if p.hasProp(a, Call) { - lce = a - } - } - if root != nil && p.isStatement(root) { - if expr != nil { - root.Child = append(root.Child, expr) + if ol := len(ops); ol > 0 && vl > ol { + op := ops[ol-1] + ops = ops[:ol-1] + out = append(out, op) + vl-- } - } else if expr != nil { - root = expr } - if root != nil { - // /* - if p.hasProp(root, MultiOp) { - for { - if !p.fixMultiOp(root, prevToken) { - break - } + out = append(out, ops...) // TODO: verify that ops should be added in this order. + + log.Println("ParseExpr out:", out, "vl:", vl, "ops:", ops) + // The tokens are now properly ordered, process nested blocks. + for i := len(out) - 1; i >= 0; i-- { + t := out[i] + var toks Tokens + switch t.Id { + case lang.ParenBlock, lang.BracketBlock: + if toks, err = p.ParseExprStr(t.Block()); err != nil { + return out, err } + default: + continue } - // */ - nodes = append(nodes, root) + + // replace block token by its parsed result. + log.Println("toks:", toks) + out2 := append(Tokens{}, out[:i]...) + out2 = append(out2, toks...) + out = append(out2, out[i+1:]...) } - return nodes, err + log.Println("Final out:", out) + return out, err } -func (p *Parser) fixMultiOp(root *Node, prevToken map[*Node]*scanner.Token) bool { - for i, c := range root.Child { - for j, cc := range c.Child { - if pt := prevToken[cc]; pt != nil && pt.Content() == "," { - c.RemoveChild(j) - root.InsertChild(cc, i) - return true - } +func (p *Parser) ParseExprStr(s string) (tokens Tokens, err error) { + if tokens, err = p.Scan(s, false); err != nil { + return + } + var result Tokens + for _, sub := range tokens.Split(lang.Comma) { + toks, err := p.ParseExpr(sub) + if err != nil { + return result, err } + result = append(toks, result...) 
} - return false + return result, err } -func (p *Parser) hasProp(n *Node, prop uint) bool { return p.Spec[n.Name()].Flags&prop != 0 } -func (p *Parser) isStatement(n *Node) bool { return p.Spec[n.Content()].Flags&Stmt != 0 } -func (p *Parser) isExprSep(n *Node) bool { return p.Spec[n.Content()].Flags&ExprSep != 0 } -func (p *Parser) isExpr(n *Node) bool { return !p.isStatement(n) && !p.isExprSep(n) } -func (p *Parser) isSep(n *Node) bool { return n.Token.Kind() == scanner.Separator } -func (p *Parser) IsBlock(n *Node) bool { return n.Token.Kind() == scanner.Block } - -func (p *Parser) precedenceToken(t *scanner.Token) int { - s := t.Content() - if l := t.Start(); l > 0 { - s = s[:l] +func (p *Parser) numItems(s string, sep lang.TokenId) int { + tokens, err := p.Scan(s, false) + if err != nil { + return -1 } - return p.Spec[s].Order + r := 0 + for _, t := range tokens.Split(sep) { + if len(t) == 0 { + continue + } + r++ + } + return r +} + +func (p *Parser) pushScope(name string) { + p.scope = strings.TrimPrefix(p.scope+"/"+name+"/", "/") } -func (p *Parser) canCallToken(t *scanner.Token) bool { - return p.precedenceToken(t) == 0 || p.Spec[t.Name()].Flags&Call != 0 +func (p *Parser) popScope() { + p.scope = strings.TrimSuffix(p.scope, "/") + j := strings.LastIndex(p.scope, "/") + if j == -1 { + p.scope = "" + return + } + p.scope = p.scope[:j] } diff --git a/parser/parse_test.go b/parser/parse_test.go deleted file mode 100644 index 4559acf..0000000 --- a/parser/parse_test.go +++ /dev/null @@ -1,257 +0,0 @@ -package parser - -import ( - "log" - "os" - "testing" - - "github.com/gnolang/parscan/scanner" -) - -var GoScanner = &scanner.Scanner{ - CharProp: [scanner.ASCIILen]uint{ - '\t': scanner.CharSep, - '\n': scanner.CharLineSep, - ' ': scanner.CharSep, - '!': scanner.CharOp, - '"': scanner.CharStr, - '%': scanner.CharOp, - '&': scanner.CharOp, - '\'': scanner.CharStr, - '(': scanner.CharBlock, - '*': scanner.CharOp, - '+': scanner.CharOp, - ',': scanner.CharGroupSep, - '-': scanner.CharOp, - '.': scanner.CharOp, - '/': scanner.CharOp, - ':': scanner.CharOp, - ';': scanner.CharGroupSep, - '<': scanner.CharOp, - '=': scanner.CharOp, - '>': scanner.CharOp, - '[': scanner.CharBlock, - '^': scanner.CharOp, - '{': scanner.CharBlock, - '|': scanner.CharOp, - '~': scanner.CharOp, - }, - End: map[string]string{ - "(": ")", - "{": "}", - "[": "]", - "/*": "*/", - `"`: `"`, - "'": "'", - "`": "`", - "//": "\n", - }, - BlockProp: map[string]uint{ - "(": scanner.CharBlock, - "{": scanner.CharBlock, - "[": scanner.CharBlock, - `"`: scanner.CharStr | scanner.StrEsc | scanner.StrNonl, - "`": scanner.CharStr, - "'": scanner.CharStr | scanner.StrEsc, - "/*": scanner.CharStr, - "//": scanner.CharStr | scanner.ExcludeEnd | scanner.EosValidEnd, - }, - SkipSemi: map[string]bool{ - "++": true, - "--": true, - "case": true, - "chan": true, - "const": true, - "default": true, - "defer": true, - "else": true, - "for": true, - "func": true, - "go": true, - "goto": true, - "if": true, - "import": true, - "interface": true, - "map": true, - "package": true, - "range": true, - "select": true, - "struct": true, - "switch": true, - "type": true, - "var": true, - }, -} - -var GoParser = &Parser{ - Scanner: GoScanner, - Spec: map[string]NodeSpec{ - ".": {Kind: OpDot, Flags: Call, Order: 3}, - "*": {Kind: OpMultiply, Order: 4}, - "+": {Kind: OpAdd, Order: 5}, - "-": {Kind: OpSubtract, Order: 5}, - "<": {Kind: OpInferior, Order: 6}, - ":=": {Kind: OpDefine, Flags: MultiOp, Order: 7}, - "=": {Kind: OpAssign, Flags: MultiOp, 
Order: 7}, - "if": {Kind: StmtIf, Flags: Stmt | ExprSep}, - "func": {Kind: DeclFunc, Flags: Decl | Call}, - "return": {Kind: StmtReturn, Flags: Stmt}, - "{..}": {Kind: BlockStmt, Flags: ExprSep}, - "(..)": {Kind: BlockParen, Flags: Call}, - "//..": {Kind: Comment}, - "/*..": {Kind: Comment}, - }, -} - -func init() { - GoParser.Init() - log.SetFlags(log.Lshortfile) -} - -func TestParse(t *testing.T) { - for _, test := range goTests { - test := test - t.Run("", func(t *testing.T) { - var err error - errStr := "" - n := &Node{} - if n.Child, err = GoParser.Parse(test.src, n); err != nil { - errStr = err.Error() - } - if errStr != test.err { - t.Errorf("got error %#v, want error %#v", errStr, test.err) - } - if dot := n.Sdot(""); dot != test.dot { - t.Errorf("got %#v, want %#v", dot, test.dot) - } - t.Log(test.src) - t.Log(n.Sdot("")) - if dotCmd := os.Getenv("DOT"); dotCmd != "" { - n.Dot(dotCmd, "") - } - }) - } -} - -var goTests = []struct { - src, dot, err string - skip bool -}{{ // #00 - src: "", - dot: `digraph ast { 0 [label=""]; }`, -}, { // #01 - src: "12", - dot: `digraph ast { 0 [label=""]; 1 [label="12"]; 0 -> 1; }`, -}, { // #02 - src: "1 + 2", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="2"]; 1 -> 3; }`, -}, { // #03 - src: "1 + 2 * 3", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="3"]; 3 -> 5; }`, -}, { // #04 - src: "1 * 2 + 3", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="1"]; 2 -> 3; 4 [label="2"]; 2 -> 4; 5 [label="3"]; 1 -> 5; }`, -}, { // #05 - src: "a := 2 + 5", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="5"]; 3 -> 5; }`, -}, { // #06 - src: "a := 1 * 2 + 3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="*"]; 3 -> 4; 5 [label="1"]; 4 -> 5; 6 [label="2"]; 4 -> 6; 7 [label="3"]; 3 -> 7; }`, -}, { // #07 - src: "a := 1 + 2 * 3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="*"]; 3 -> 5; 6 [label="2"]; 5 -> 6; 7 [label="3"]; 5 -> 7; }`, -}, { // #08 - src: "a := 1 + 2 * 3 + 4", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="+"]; 3 -> 5; 6 [label="*"]; 5 -> 6; 7 [label="2"]; 6 -> 7; 8 [label="3"]; 6 -> 8; 9 [label="4"]; 5 -> 9; }`, -}, { // #09 - src: "a := 1 + 2 + 3 * 4", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="+"]; 3 -> 5; 6 [label="2"]; 5 -> 6; 7 [label="*"]; 5 -> 7; 8 [label="3"]; 7 -> 8; 9 [label="4"]; 7 -> 9; }`, -}, { // #10 - src: "a := 1 * 2 + 3 * 4", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="*"]; 3 -> 4; 5 [label="1"]; 4 -> 5; 6 [label="2"]; 4 -> 6; 7 [label="*"]; 3 -> 7; 8 [label="3"]; 7 -> 8; 9 [label="4"]; 7 -> 9; }`, -}, { // #11 - src: "a := (1 + 2) * 3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="(..)"]; 3 -> 4; 5 [label="+"]; 4 -> 5; 6 [label="1"]; 5 -> 6; 7 [label="2"]; 5 -> 7; 8 [label="3"]; 3 -> 8; }`, -}, { // #12 - src: "a := 2 * -3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 
-> 1; 2 [label="a"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="-"]; 3 -> 5; 6 [label="3"]; 5 -> 6; }`, -}, { // #13 - src: "-5 + 4", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="-"]; 1 -> 2; 3 [label="5"]; 2 -> 3; 4 [label="4"]; 1 -> 4; }`, -}, { // #14 - src: "-5 + -4", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="-"]; 1 -> 2; 3 [label="5"]; 2 -> 3; 4 [label="-"]; 1 -> 4; 5 [label="4"]; 4 -> 5; }`, -}, { // #15 - src: "a := -5 + -4", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="-"]; 3 -> 4; 5 [label="5"]; 4 -> 5; 6 [label="-"]; 3 -> 6; 7 [label="4"]; 6 -> 7; }`, -}, { // #16 - src: "*a := 5 * -3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="*"]; 1 -> 4; 5 [label="5"]; 4 -> 5; 6 [label="-"]; 4 -> 6; 7 [label="3"]; 6 -> 7; }`, -}, { // #17 - src: "*a := -5 * 3", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="*"]; 1 -> 4; 5 [label="-"]; 4 -> 5; 6 [label="5"]; 5 -> 6; 7 [label="3"]; 4 -> 7; }`, -}, { // #18 - src: "1+2\n3-4", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="2"]; 1 -> 3; 4 [label="-"]; 0 -> 4; 5 [label="3"]; 4 -> 5; 6 [label="4"]; 4 -> 6; }`, -}, { // #19 - src: "i = i+1", - dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="i"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="i"]; 3 -> 4; 5 [label="1"]; 3 -> 5; }`, -}, { // #20 - src: "a[12] = 5", - dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="[..]"]; 2 -> 3; 4 [label="12"]; 3 -> 4; 5 [label="5"]; 1 -> 5; }`, -}, { // #21 - src: "a[12][0] = 3", - dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="[..]"]; 2 -> 3; 4 [label="12"]; 3 -> 4; 5 [label="[..]"]; 2 -> 5; 6 [label="0"]; 5 -> 6; 7 [label="3"]; 1 -> 7; }`, -}, { // #22 - src: "a.b = 34", - dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="."]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="b"]; 2 -> 4; 5 [label="34"]; 1 -> 5; }`, -}, { // #23 - src: "if i < 2 { return j }", - dot: `digraph ast { 0 [label=""]; 1 [label="if"]; 0 -> 1; 2 [label="<"]; 1 -> 2; 3 [label="i"]; 2 -> 3; 4 [label="2"]; 2 -> 4; 5 [label="{..}"]; 1 -> 5; 6 [label="return"]; 5 -> 6; 7 [label="j"]; 6 -> 7; }`, -}, { // #24 - src: "if i:=1; i < 2 { return j }", - dot: `digraph ast { 0 [label=""]; 1 [label="if"]; 0 -> 1; 2 [label=":="]; 1 -> 2; 3 [label="i"]; 2 -> 3; 4 [label="1"]; 2 -> 4; 5 [label="<"]; 1 -> 5; 6 [label="i"]; 5 -> 6; 7 [label="2"]; 5 -> 7; 8 [label="{..}"]; 1 -> 8; 9 [label="return"]; 8 -> 9; 10 [label="j"]; 9 -> 10; }`, -}, { // #25 - src: "f(i) + f(j)", - dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="Call"]; 1 -> 2; 3 [label="f"]; 2 -> 3; 4 [label="(..)"]; 2 -> 4; 5 [label="i"]; 4 -> 5; 6 [label="Call"]; 1 -> 6; 7 [label="f"]; 6 -> 7; 8 [label="(..)"]; 6 -> 8; 9 [label="j"]; 8 -> 9; }`, -}, { // #26 - src: "a := 1 // This is a comment", - dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="1"]; 1 -> 3; }`, - /* - }, { // #27 - src: "a, b, c = 1, f(2), 3", - //src: "f(i) + f(j)(4)", // not ok - }, { // #26 - src: "if i < 2 {return i}; return f(i-2) + f(i-1)", - }, { // #27 - src: "for i < 2 { println(i) }", - }, { // #28 - src: "func f(i int) (int) { if i < 2 { return i}; return f(i-2) + f(i-1) }", - }, { 
// #29 - src: "a := []int{3, 4}", - }, { // #30 - //src: "a := struct{int}", - src: "a, b = c, d", - }, { // #31 - //src: "a := [2]int{3, 4}", - src: `fmt.Println("Hello")`, - //src: "(1 + 2) * (3 - 4)", - //src: "1 + (1 + 2)", - }, { // #32 - //src: `a(3)(4)`, - //src: `3 + 2 * a(3) + 5`, - //src: `3 + 2 * a(3)(4) + (5)`, - //src: `(a(3))(4)`, - src: `a(3)(4)`, - dot: `digraph ast { 0 [label=""]; 1 [label="Call"]; 0 -> 1; 2 [label="Call"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="(..)"]; 2 -> 4; 5 [label="3"]; 4 -> 5; 6 [label="(..)"]; 1 -> 6; 7 [label="4"]; 6 -> 7; }`, - //src: `println("Hello")`, - //src: `a.b.c + 3`, - }, { // #33 - src: `func f(a int, b int) {return a + b}; f(1+2)`, - }, { // #34 - src: `if a == 1 { - println(2) - } - println("bye")`, - */ -}} diff --git a/parser/symbol.go b/parser/symbol.go new file mode 100644 index 0000000..9975c24 --- /dev/null +++ b/parser/symbol.go @@ -0,0 +1,67 @@ +package parser + +import ( + "reflect" + "strings" +) + +type symKind int + +const ( + symValue symKind = iota // a Go value defined in the runtime + symType // a Go type + symLabel // a label indication a position in the VM code + symConst // a Go constant + symVar // a Go variable, located in the VM memory + symFunc // a Go function, located in the VM code +) + +type symbol struct { + kind symKind + index int // address of symbol in frame + value any + Type reflect.Type + local bool // if true address is relative to local frame, otherwise global + used bool +} + +func (p *Parser) AddSym(i int, name string, v any) { p.addSym(i, name, v, symValue, nil, false) } + +func (p *Parser) addSym(i int, name string, v any, k symKind, t reflect.Type, local bool) { + p.symbols[name] = &symbol{kind: k, index: i, local: local, value: v, Type: t, used: true} +} + +// getSym searches for an existing symbol starting from the deepest scope. +func (p *Parser) getSym(name, scope string) (sym *symbol, sc string, ok bool) { + for { + if sym, ok = p.symbols[scope+name]; ok { + return sym, scope, ok + } + scope = strings.TrimSuffix(scope, "/") + i := strings.LastIndex(scope, "/") + if i == -1 { + scope = "" + break + } + if scope = scope[:i]; scope == "" { + break + } + } + sym, ok = p.symbols[name] + return sym, scope, ok +} + +func initUniverse() map[string]*symbol { + return map[string]*symbol{ + "any": {kind: symType, Type: reflect.TypeOf((*any)(nil)).Elem()}, + "bool": {kind: symType, Type: reflect.TypeOf((*bool)(nil)).Elem()}, + "error": {kind: symType, Type: reflect.TypeOf((*error)(nil)).Elem()}, + "int": {kind: symType, Type: reflect.TypeOf((*int)(nil)).Elem()}, + "string": {kind: symType, Type: reflect.TypeOf((*string)(nil)).Elem()}, + + "nil": {}, + "iota": {value: 0}, + "true": {value: true}, + "false": {value: false}, + } +} diff --git a/parser/type.go b/parser/type.go new file mode 100644 index 0000000..77ccc6e --- /dev/null +++ b/parser/type.go @@ -0,0 +1,117 @@ +package parser + +import ( + "fmt" + "log" + "reflect" + + "github.com/gnolang/parscan/lang" +) + +// ParseType parses a list of tokens defining a type expresssion and returns +// the corresponding runtime type or an error. +func (p *Parser) ParseType(in Tokens) (typ reflect.Type, err error) { + log.Println("ParseType", in) + switch in[0].Id { + case lang.Func: + // Get argument and return token positions depending on function pattern: + // method with receiver, named function or anonymous closure. 
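+		// For example: "func (t T) f(a int) int" (method with receiver),
+		// "func f(a int) int" (named function) or "func(a int) int" (closure).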
+		// TODO: handle variadics
+		var out Tokens
+		var indexArgs int
+		switch l, in1 := len(in), in[1]; {
+		case l >= 4 && in1.Id == lang.ParenBlock && in[2].Id == lang.Ident:
+			indexArgs, out = 3, in[4:]
+		case l >= 3 && in1.Id == lang.Ident:
+			indexArgs, out = 2, in[3:]
+		case l >= 2 && in1.Id == lang.ParenBlock:
+			indexArgs, out = 1, in[2:]
+		default:
+			return nil, fmt.Errorf("invalid func signature")
+		}
+
+		// We can now parse function input and output parameter types.
+		// Input parameters are always enclosed in parentheses.
+		iargs, err := p.Scan(in[indexArgs].Block(), false)
+		if err != nil {
+			return nil, err
+		}
+		arg, err := p.parseParamTypes(iargs, true)
+		if err != nil {
+			return nil, err
+		}
+		// Output parameters may be empty, and may or may not be enclosed in parentheses.
+		if len(out) == 1 && out[0].Id == lang.ParenBlock {
+			if out, err = p.Scan(out[0].Block(), false); err != nil {
+				return nil, err
+			}
+		}
+		ret, err := p.parseParamTypes(out, false)
+		if err != nil {
+			return nil, err
+		}
+		return reflect.FuncOf(arg, ret, false), nil
+
+	case lang.Ident:
+		// TODO: selector expression (pkg.type)
+		s, _, ok := p.getSym(in[0].Str, p.scope)
+		if !ok || s.kind != symType {
+			return nil, fmt.Errorf("invalid type %s", in[0].Str)
+		}
+		return s.Type, nil
+	}
+	return typ, err
+}
+
+// parseParamTypes parses a list of comma-separated typed parameters and returns a list of
+// runtime types. Implicit parameter names and types are supported.
+func (p *Parser) parseParamTypes(in Tokens, arg bool) (types []reflect.Type, err error) {
+	// Parse from right to left, to allow multiple comma-separated parameters of the same type.
+	list := in.Split(lang.Comma)
+	for i := len(list) - 1; i >= 0; i-- {
+		t := list[i]
+		if len(t) == 0 {
+			continue
+		}
+		param := ""
+		if p.hasFirstParam(t) {
+			param = t[0].Str
+			t = t[1:]
+			if len(t) == 0 {
+				if len(types) == 0 {
+					return nil, fmt.Errorf("invalid type for parameter %s", param)
+				}
+				// Type was omitted, apply the previous one from the right.
+				types = append([]reflect.Type{types[0]}, types...)
+				if arg {
+					p.addSym(-i-2, p.scope+param, nil, symVar, types[0], true)
+				} else {
+					p.addSym(i, p.scope+param, nil, symVar, types[0], true)
+				}
+				continue
+			}
+		}
+		typ, err := p.ParseType(t)
+		if err != nil {
+			return nil, err
+		}
+		if param != "" {
+			if arg {
+				p.addSym(-i-2, p.scope+param, nil, symVar, typ, true)
+			} else {
+				p.addSym(i, p.scope+param, nil, symVar, typ, true)
+			}
+		}
+		types = append([]reflect.Type{typ}, types...)
+	}
+	return types, err
+}
+
+// hasFirstParam returns true if the first token of a list is a parameter name.
+func (p *Parser) hasFirstParam(in Tokens) bool {
+	if in[0].Id != lang.Ident {
+		return false
+	}
+	s, _, ok := p.getSym(in[0].Str, p.scope)
+	return !ok || s.kind != symType
+}
diff --git a/scanner/scan.go b/scanner/scan.go
index e0a32ca..99d6768 100644
--- a/scanner/scan.go
+++ b/scanner/scan.go
@@ -4,124 +4,55 @@ import (
 	"errors"
 	"fmt"
 	"regexp"
-	"strconv"
 	"strings"
-)
-
-// Kind is the token type kind.
-type Kind uint -const ( - Undefined Kind = iota - Identifier - Number - Operator - Separator - String - Block - Custom + "github.com/gnolang/parscan/lang" ) -const ( - CharOp = 1 << iota - CharNum - CharAlpha - CharSep - CharLineSep - CharGroupSep - CharStr - CharBlock - StrEsc - StrNonl - ExcludeEnd // exclude end delimiter from content - EosValidEnd // end of input string terminates block or string token +var ( + ErrBlock = errors.New("block not terminated") + ErrIllegal = errors.New("illegal token") ) -var ErrBlock = errors.New("block not terminated") - // Token defines a scanner token. type Token struct { - pos int - kind Kind - content string - start int - end int - value any -} - -type Tokens []*Token - -func (tks Tokens) String() (s string) { - for _, t := range tks { - s += fmt.Sprintf("%#v ", t.content) - } - return + Id lang.TokenId // token identificator + Pos int // position in source + Str string // string in source + Beg int // length of begin delimiter (block, string) + End int // length of end delimiter (block, string) } -func (t *Token) Kind() Kind { return t.kind } -func (t *Token) Content() string { return t.content } -func (t *Token) Start() int { return t.start } -func (t *Token) End() int { return t.end } -func (t *Token) Pos() int { return t.pos } -func (t *Token) Block() string { return t.content[t.start : len(t.content)-t.end] } -func (t *Token) Prefix() string { return t.content[:t.start] } -func (t *Token) Value() any { return t.value } -func (t *Token) IsBlock() bool { return t.kind == Block } -func (t *Token) IsOperator() bool { return t.kind == Operator } -func (t *Token) IsSeparator() bool { return t.kind == Separator } +func (t *Token) Block() string { return t.Str[t.Beg : len(t.Str)-t.End] } +func (t *Token) Prefix() string { return t.Str[:t.Beg] } func (t *Token) Name() string { - name := t.content - if t.start > 1 { - return name[:t.start] + ".." + name := t.Str + if t.Beg > 1 { + return name[:t.Beg] + ".." } - if t.start > 0 { - return name[:t.start] + ".." + name[len(name)-t.end:] + if t.Beg > 0 { + return name[:t.Beg] + ".." + name[len(name)-t.End:] } return name } -func NewToken(content string, pos int) *Token { - return &Token{pos, Custom, content, 0, 0, nil} -} - -const ASCIILen = 1 << 7 // 128 - // Scanner contains the scanner rules for a language. type Scanner struct { - CharProp [ASCIILen]uint // special Character properties - End map[string]string // end delimiters, indexed by start - BlockProp map[string]uint // block properties - SkipSemi map[string]bool // words skipping automatic semicolon insertion after newline - DotNum bool // true if a number can start with '.' 
- IdAscii bool // true if an identifier can be in ASCII only - Num_ bool // true if a number can contain _ character + *lang.Spec sdre *regexp.Regexp // string delimiters regular expression } -func (sc *Scanner) HasProp(r rune, p uint) bool { - if r >= ASCIILen { - return false - } - return sc.CharProp[r]&p != 0 -} +func NewScanner(spec *lang.Spec) *Scanner { + sc := &Scanner{Spec: spec} -func (sc *Scanner) isOp(r rune) bool { return sc.HasProp(r, CharOp) } -func (sc *Scanner) isSep(r rune) bool { return sc.HasProp(r, CharSep) } -func (sc *Scanner) isLineSep(r rune) bool { return sc.HasProp(r, CharLineSep) } -func (sc *Scanner) isGroupSep(r rune) bool { return sc.HasProp(r, CharGroupSep) } -func (sc *Scanner) isStr(r rune) bool { return sc.HasProp(r, CharStr) } -func (sc *Scanner) isBlock(r rune) bool { return sc.HasProp(r, CharBlock) } -func (sc *Scanner) isDir(r rune) bool { - return !sc.HasProp(r, CharOp|CharSep|CharLineSep|CharGroupSep|CharStr|CharBlock) -} + // TODO: Mark unset ASCII char other than alphanum illegal -func (sc *Scanner) Init() { // Build a regular expression to match all string start delimiters at once. re := "(" for s, p := range sc.BlockProp { - if p&CharStr == 0 { + if p&lang.CharStr == 0 { continue } // TODO: sort keys in decreasing length order. @@ -132,46 +63,69 @@ func (sc *Scanner) Init() { } re = strings.TrimSuffix(re, "|") + ")$" sc.sdre = regexp.MustCompile(re) + + return sc +} + +func (sc *Scanner) HasProp(r rune, p uint) bool { + if r >= lang.ASCIILen { + return false + } + return sc.CharProp[r]&p != 0 +} + +func (sc *Scanner) isOp(r rune) bool { return sc.HasProp(r, lang.CharOp) } +func (sc *Scanner) isSep(r rune) bool { return sc.HasProp(r, lang.CharSep) } +func (sc *Scanner) isLineSep(r rune) bool { return sc.HasProp(r, lang.CharLineSep) } +func (sc *Scanner) isGroupSep(r rune) bool { return sc.HasProp(r, lang.CharGroupSep) } +func (sc *Scanner) isStr(r rune) bool { return sc.HasProp(r, lang.CharStr) } +func (sc *Scanner) isBlock(r rune) bool { return sc.HasProp(r, lang.CharBlock) } +func (sc *Scanner) isDir(r rune) bool { + return !sc.HasProp(r, lang.CharOp|lang.CharSep|lang.CharLineSep|lang.CharGroupSep|lang.CharStr|lang.CharBlock) } func isNum(r rune) bool { return '0' <= r && r <= '9' } -func (sc *Scanner) Scan(src string) (tokens Tokens, err error) { +func (sc *Scanner) Scan(src string, semiEOF bool) (tokens []Token, err error) { offset := 0 - s := src + s := strings.TrimSpace(src) for len(s) > 0 { t, err := sc.Next(s) if err != nil { - return nil, fmt.Errorf("%s: %w", loc(src, offset+t.pos), err) + return nil, fmt.Errorf("%s: %w", loc(src, offset+t.Pos), err) } - if t.kind == Undefined { + if t.Id == lang.Illegal && t.Str == "" { break } skip := false - if len(tokens) > 0 && len(sc.SkipSemi) > 0 && t.kind == Separator && t.content == " " { + if len(tokens) > 0 && t.Str == "\n" { // Check for automatic semi-colon insertion after newline. last := tokens[len(tokens)-1] - if last.kind == Identifier && sc.SkipSemi[last.content] || - last.kind == Operator && !sc.SkipSemi[last.content] { + if last.Id.IsKeyword() && sc.TokenProps[last.Str].SkipSemi || + last.Id.IsOperator() && !sc.TokenProps[last.Str].SkipSemi { skip = true } else { - t.content = ";" + t.Id = lang.Semicolon + t.Str = ";" } } - nr := t.pos + len(t.content) + nr := t.Pos + len(t.Str) s = s[nr:] - t.pos += offset + t.Pos += offset offset += nr if !skip { tokens = append(tokens, t) } } - // Insertion of semi-colon at the end of the token stream. 
- if len(tokens) > 0 && len(sc.SkipSemi) > 0 { + // Optional insertion of semi-colon at the end of the token stream. + if semiEOF && len(tokens) > 0 { last := tokens[len(tokens)-1] - if !(last.kind == Identifier && sc.SkipSemi[last.content] || - last.kind == Operator && !sc.SkipSemi[last.content]) { - tokens = append(tokens, &Token{kind: Separator, content: ";"}) + if last.Str == ";" { + return tokens, nil + } + if !(last.Id == lang.Ident && sc.TokenProps[last.Str].SkipSemi || + last.Id.IsOperator() && !sc.TokenProps[last.Str].SkipSemi) { + tokens = append(tokens, Token{Id: lang.Semicolon, Str: ";"}) } } return tokens, nil @@ -188,7 +142,7 @@ func loc(s string, p int) string { } // Next returns the next token in string. -func (sc *Scanner) Next(src string) (tok *Token, err error) { +func (sc *Scanner) Next(src string) (tok Token, err error) { p := 0 // Skip initial separators. @@ -204,56 +158,65 @@ func (sc *Scanner) Next(src string) (tok *Token, err error) { for i, r := range src { switch { case sc.isSep(r): - return &Token{}, nil + return Token{}, nil case sc.isGroupSep(r): // TODO: handle group separators. - return &Token{kind: Separator, pos: p + i, content: string(r)}, nil + return Token{Id: sc.TokenProps[string(r)].TokenId, Pos: p + i, Str: string(r)}, nil case sc.isLineSep(r): - return &Token{kind: Separator, pos: p + i, content: " "}, nil + return Token{Pos: p + i, Str: "\n"}, nil case sc.isStr(r): s, ok := sc.getStr(src[i:], 1) if !ok { err = ErrBlock } - return &Token{kind: String, pos: p + i, content: s, start: 1, end: 1}, err + return Token{Id: lang.String, Pos: p + i, Str: s, Beg: 1, End: 1}, err case sc.isBlock(r): b, ok := sc.getBlock(src[i:], 1) if !ok { err = ErrBlock } - return &Token{kind: Block, pos: p + i, content: b, start: 1, end: 1}, err + tok := Token{Pos: p + i, Str: b, Beg: 1, End: 1} + tok.Id = sc.TokenProps[tok.Name()].TokenId + return tok, err case sc.isOp(r): op, isOp := sc.getOp(src[i:]) if isOp { - return &Token{kind: Operator, pos: p + i, content: op}, nil + id := sc.TokenProps[op].TokenId + if id == lang.Illegal { + err = fmt.Errorf("%w: %s", ErrIllegal, op) + } + return Token{Id: id, Pos: p + i, Str: op}, err } flag := sc.BlockProp[op] - if flag&CharStr != 0 { + if flag&lang.CharStr != 0 { s, ok := sc.getStr(src[i:], len(op)) if !ok { err = ErrBlock } - return &Token{kind: String, pos: p + i, content: s, start: len(op), end: len(op)}, err + return Token{Id: lang.Comment, Pos: p + i, Str: s, Beg: len(op), End: len(op)}, err } case isNum(r): - c, v := sc.getNum(src[i:]) - return &Token{kind: Number, pos: p + i, content: c, value: v}, nil + return Token{Id: lang.Int, Pos: p + i, Str: sc.getNum(src[i:])}, nil default: id, isId := sc.getId(src[i:]) if isId { - return &Token{kind: Identifier, pos: p + i, content: id}, nil + ident := sc.TokenProps[id].TokenId + if ident == lang.Illegal { + ident = lang.Ident + } + return Token{Id: ident, Pos: p + i, Str: id}, nil } flag := sc.BlockProp[id] - if flag&CharBlock != 0 { + if flag&lang.CharBlock != 0 { s, ok := sc.getBlock(src[i:], len(id)) if !ok { err = ErrBlock } - return &Token{kind: Block, pos: p + i, content: s, start: len(id), end: len(id)}, err + return Token{Pos: p + i, Str: s, Beg: len(id), End: len(id)}, err } } } - return &Token{}, nil + return Token{}, nil } func (sc *Scanner) getId(src string) (s string, isId bool) { @@ -287,7 +250,7 @@ func (sc *Scanner) getOp(src string) (s string, isOp bool) { return s, true } -func (sc *Scanner) getNum(src string) (s string, v any) { +func (sc *Scanner) getNum(src 
string) (s string) { // TODO: handle hexa, binary, octal, float and eng notations. for i, r := range src { if !isNum(r) { @@ -295,31 +258,22 @@ func (sc *Scanner) getNum(src string) (s string, v any) { } s = src[:i+1] } - var err error - if strings.ContainsRune(s, '.') { - v, err = strconv.ParseFloat(s, 64) - } else { - v, err = strconv.ParseInt(s, 0, 64) - } - if err != nil { - v = err - } - return s, v + return s } func (sc *Scanner) getStr(src string, nstart int) (s string, ok bool) { start := src[:nstart] end := sc.End[start] prop := sc.BlockProp[start] - canEscape := prop&StrEsc != 0 - nonl := prop&StrNonl != 0 - excludeEnd := prop&ExcludeEnd != 0 + canEscape := prop&lang.StrEsc != 0 + nonl := prop&lang.StrNonl != 0 + excludeEnd := prop&lang.ExcludeEnd != 0 var esc bool for i, r := range src[nstart:] { s = src[:nstart+i+1] if r == '\n' && nonl { - return + return s, ok } if strings.HasSuffix(s, end) && !esc { if excludeEnd { @@ -329,7 +283,7 @@ func (sc *Scanner) getStr(src string, nstart int) (s string, ok bool) { } esc = canEscape && r == '\\' && !esc } - ok = prop&EosValidEnd != 0 + ok = prop&lang.EosValidEnd != 0 return s, ok } @@ -359,22 +313,12 @@ func (sc *Scanner) getBlock(src string, nstart int) (s string, ok bool) { skip = nstart + i + len(str) - l1 } if n == 0 { - if prop&ExcludeEnd != 0 { + if prop&lang.ExcludeEnd != 0 { s = s[:len(s)-len(end)] } return s, true } } - ok = prop&EosValidEnd != 0 + ok = prop&lang.EosValidEnd != 0 return s, ok } - -// Index returns the index of the first instance with content s in tokens, or -1 if not found. -func Index(tokens []*Token, s string) int { - for i, t := range tokens { - if t.content == s { - return i - } - } - return -1 -} diff --git a/scanner/scan_test.go b/scanner/scan_test.go index bd0a13c..496e4e4 100644 --- a/scanner/scan_test.go +++ b/scanner/scan_test.go @@ -1,107 +1,47 @@ -package scanner +package scanner_test import ( + "fmt" "log" "testing" + + "github.com/gnolang/parscan/lang/golang" + "github.com/gnolang/parscan/scanner" ) -var GoScanner = &Scanner{ - CharProp: [ASCIILen]uint{ - '\t': CharSep, - '\n': CharLineSep, - ' ': CharSep, - '!': CharOp, - '"': CharStr | StrEsc | StrNonl, - '%': CharOp, - '&': CharOp, - '\'': CharStr | StrEsc, - '(': CharBlock, - '*': CharOp, - '+': CharOp, - ',': CharGroupSep, - '-': CharOp, - '`': CharStr, - '.': CharOp, - '/': CharOp, - ':': CharOp, - ';': CharGroupSep, - '<': CharOp, - '=': CharOp, - '>': CharOp, - '[': CharBlock, - '^': CharOp, - '{': CharBlock, - '|': CharOp, - '~': CharOp, - }, - End: map[string]string{ - "(": ")", - "{": "}", - "[": "]", - "/*": "*/", - `"`: `"`, - "'": "'", - "`": "`", - "//": "\n", - }, - BlockProp: map[string]uint{ - "(": CharBlock, - "{": CharBlock, - "[": CharBlock, - `"`: CharStr | StrEsc | StrNonl, - "`": CharStr, - "'": CharStr | StrEsc, - "/*": CharStr, - "//": CharStr | ExcludeEnd | EosValidEnd, - }, - SkipSemi: map[string]bool{ - "++": true, - "--": true, - "case": true, - "chan": true, - "const": true, - "default": true, - "defer": true, - "else": true, - "for": true, - "func": true, - "go": true, - "goto": true, - "if": true, - "import": true, - "interface": true, - "map": true, - "package": true, - "range": true, - "select": true, - "struct": true, - "switch": true, - "type": true, - "var": true, - }, +var GoScanner *scanner.Scanner + +func init() { + log.SetFlags(log.Lshortfile) + GoScanner = scanner.NewScanner(golang.GoSpec) } func TestScan(t *testing.T) { - log.SetFlags(log.Lshortfile) - GoScanner.Init() for _, test := range tests { test 
:= test t.Run("", func(t *testing.T) { errStr := "" - tokens, err := GoScanner.Scan(test.src) + tokens, err := GoScanner.Scan(test.src, true) if err != nil { errStr = err.Error() } if errStr != test.err { t.Errorf("got error %v, want error %#v", errStr, test.err) } - if result := tokens.String(); result != test.tok { + if result := tokStr(tokens); result != test.tok { t.Errorf("got %v, want %v", result, test.tok) } }) } } +func tokStr(tokens []scanner.Token) (s string) { + for _, t := range tokens { + s += fmt.Sprintf("%#v ", t.Str) + } + return +} + var tests = []struct { src, tok, err string }{{ // #00 @@ -192,4 +132,7 @@ def"`, }, { // #28 src: "f(3).\nfield", tok: `"f" "(3)" "." "field" ";" `, +}, { // #29 + src: "\n\n\tif i < 1 {return 0}", + tok: `"if" "i" "<" "1" "{return 0}" ";" `, }} diff --git a/testdata/add b/testdata/add deleted file mode 100644 index a403485..0000000 --- a/testdata/add +++ /dev/null @@ -1,2 +0,0 @@ -func add(a int, b int) int { return a + b } -add(4, 3) diff --git a/testdata/fib b/testdata/fib deleted file mode 100644 index 654c5c0..0000000 --- a/testdata/fib +++ /dev/null @@ -1,6 +0,0 @@ -func fib(i int) int { - if i < 2 { return i } - return fib(i-2) + fib(i-1) -} - -println(fib(6)) diff --git a/testdata/p00 b/testdata/p00 deleted file mode 100644 index 19f5084..0000000 --- a/testdata/p00 +++ /dev/null @@ -1 +0,0 @@ -1+2 diff --git a/testdata/p01 b/testdata/p01 deleted file mode 100644 index baafaa9..0000000 --- a/testdata/p01 +++ /dev/null @@ -1,2 +0,0 @@ -s := "Hello" -println(s, 5+2) diff --git a/testdata/p02 b/testdata/p02 deleted file mode 100644 index 1aeb4a9..0000000 --- a/testdata/p02 +++ /dev/null @@ -1,7 +0,0 @@ -a := 1 - -if a < 2 { - println("yep") -} - -println("ok") diff --git a/testdata/p03 b/testdata/p03 deleted file mode 100644 index 39a3cda..0000000 --- a/testdata/p03 +++ /dev/null @@ -1,3 +0,0 @@ -func f(a int, b int) int { return a + b } - -println("f:", f(3, 4), f(5, 6)) diff --git a/testdata/p04 b/testdata/p04 deleted file mode 100644 index f2a85c8..0000000 --- a/testdata/p04 +++ /dev/null @@ -1,4 +0,0 @@ -func f() int { println(a) } - -a := 4 -f() diff --git a/testdata/p05 b/testdata/p05 deleted file mode 100644 index 51c2c9b..0000000 --- a/testdata/p05 +++ /dev/null @@ -1,10 +0,0 @@ -func f(i int) int { - if i < 2 { - if i == 1 { - return - } } - println("i", i) -} - -f(1) -println("bye") diff --git a/testdata/p06 b/testdata/p06 deleted file mode 100644 index 88029cc..0000000 --- a/testdata/p06 +++ /dev/null @@ -1,7 +0,0 @@ -func f(i int) { - if i < 2 { return } - println("i > 1:", i) -} - -f(1) -println("bye") diff --git a/testdata/p07 b/testdata/p07 deleted file mode 100644 index 2fa7ee0..0000000 --- a/testdata/p07 +++ /dev/null @@ -1,5 +0,0 @@ -func f1() { println("in f1") } - -func f2() { println("in f2"); f1() } - -f2() diff --git a/vm/README.md b/vm/README.md new file mode 100644 index 0000000..92b1ac8 --- /dev/null +++ b/vm/README.md @@ -0,0 +1,38 @@ +# vm + +`vm` is a bytecode based stack machine. + +The purpose of `vm` is to provide a simple, fast, embeddable and +portable Go execution environment. + +```mermaid +graph LR +s[ ] --> |source| a(scanner) +--> |tokens| b(parser) +--> |AST| c(codegen) +--> |bytecode| d[vm] +subgraph vm + d +end +style s height:0px; +``` + +The bytecode consists of a dozen of instructions, each taking 0 or 1 +immediate argument (non-immediate arguments are only passed through the +stack). Only a few operators for a few types are implemented. 
I expect
+to have 1 instruction per operator per numerical type, all with the same
+pattern, which would be generated from a template. The estimate is around 20
+operators and 10 numerical types, so around 200 instructions in total.
+
+Structurally, the vm implements logical and arithmetic operators,
+conditional jumps for `if`, `for` and `switch` control flow, and function
+call, return and frame management.
+
+The memory state of the vm is a slice of Go interfaces (`[]any`).
+
+The whole vm is coded in a single function of 80 lines with no
+dependencies. The size will grow as we add missing instructions, but the
+code complexity will remain the same.
+
+The `vm` package is totally standalone and could be used for any purpose
+outside of parscan and/or gno.
diff --git a/vm/vm.go b/vm/vm.go
new file mode 100644
index 0000000..4057fc1
--- /dev/null
+++ b/vm/vm.go
@@ -0,0 +1,229 @@
+package vm
+
+import (
+	"fmt"     // for tracing only
+	"log"     // for tracing only
+	"reflect" // for optional CallX only
+	"strconv" // for tracing only
+)
+
+const debug = true
+
+// Byte-code instruction set.
+const (
+	// instruction effect on stack: values consumed -- values produced
+	Nop       = iota // --
+	Add              // n1 n2 -- sum ; sum = n1+n2
+	Assign           // val -- ; mem[$1] = val
+	Fassign          // val -- ; mem[fp+$1-1] = val (frame relative)
+	Call             // f [a1 .. ai] -- [r1 .. rj] ; r1, ... = prog[f](a1, ...)
+	Calli            // [a1 .. ai] -- [r1 .. rj] ; r1, ... = prog[ip+$1](a1, ...)
+	CallX            // f [a1 .. ai] -- [r1 .. rj] ; r1, ... = mem[f](a1, ...)
+	Dup              // -- val ; val = mem[$1]
+	Fdup             // -- val ; val = mem[fp+$1-1] (frame relative)
+	Equal            // n1 n2 -- cond ; cond = n1 == n2
+	Exit             // -- ;
+	Jump             // -- ; ip += $1
+	JumpTrue         // cond -- ; if cond { ip += $1 }
+	JumpFalse        // cond -- ; if cond { ip += $1 }
+	Lower            // n1 n2 -- cond ; cond = n1 < n2
+	Loweri           // n1 -- cond ; cond = n1 < $1
+	Pop              // v --
+	Push             // -- v
+	Return           // [r1 .. ri] -- ; exit frame: sp = fp, fp = pop
+	Sub              // n1 n2 -- diff ; diff = n1 - n2
+	Subi             // n1 -- diff ; diff = n1 - $1
+)
+
+var strop = [...]string{ // for VM tracing.
+	Nop:       "Nop",
+	Add:       "Add",
+	Assign:    "Assign",
+	Call:      "Call",
+	Calli:     "Calli",
+	CallX:     "CallX",
+	Dup:       "Dup",
+	Equal:     "Equal",
+	Exit:      "Exit",
+	Fassign:   "Fassign",
+	Fdup:      "Fdup",
+	Jump:      "Jump",
+	JumpTrue:  "JumpTrue",
+	JumpFalse: "JumpFalse",
+	Lower:     "Lower",
+	Loweri:    "Loweri",
+	Pop:       "Pop",
+	Push:      "Push",
+	Return:    "Return",
+	Sub:       "Sub",
+	Subi:      "Subi",
+}
+
+// Machine represents a virtual machine.
+type Machine struct {
+	code   [][]int64 // code to execute
+	mem    []any     // memory, as a stack
+	ip, fp int       // instruction and frame pointer
+	ic     uint64    // instruction counter, incremented at each instruction executed
+	// flags uint // to set options such as restrict CallX, etc...
+}
+
+// Run runs a program.
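+//
+// A minimal usage sketch, mirroring test #01 in vm_test.go:
+//
+//	m := &Machine{}
+//	m.Push(fmt.Println, "Hello") // seed memory with external symbols
+//	m.PushCode([]int64{0, Dup, 0}, []int64{0, CallX, 1}, []int64{0, Exit})
+//	if err := m.Run(); err != nil {
+//		log.Fatal(err)
+//	}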
+func (m *Machine) Run() (err error) { + code, mem, ip, fp, sp, ic := m.code, m.mem, m.ip, m.fp, 0, m.ic + + defer func() { m.mem, m.ip, m.fp, m.ic = mem, ip, fp, ic }() + + trace := func() { + if !debug { + return + } + var op2, op3 string + c := code[ip] + if l := len(c); l > 2 { + op2 = strconv.Itoa(int(c[2])) + if l > 3 { + op3 = strconv.Itoa(int(c[3])) + } + } + log.Printf("ip:%-4d sp:%-4d fp:%-4d op:[%-9s %-4s %-4s] mem:%v\n", ip, sp, fp, strop[c[1]], op2, op3, mem) + } + + for { + sp = len(mem) // stack pointer + trace() + ic++ + switch op := code[ip]; op[1] { + case Add: + mem[sp-2] = mem[sp-2].(int) + mem[sp-1].(int) + mem = mem[:sp-1] + case Assign: + mem[op[2]] = mem[sp-1] + mem = mem[:sp-1] + case Fassign: + mem[fp+int(op[2])-1] = mem[sp-1] + mem = mem[:sp-1] + case Call: + nip := mem[sp-1].(int) + mem = append(mem[:sp-1], ip+1, fp) + ip = nip + fp = sp + 1 + continue + case Calli: + mem = append(mem, ip+1, fp) + fp = sp + 2 + ip += int(op[2]) + continue + case CallX: // Should be made optional. + l := int(op[2]) + in := make([]reflect.Value, l) + for i := range in { + in[i] = reflect.ValueOf(mem[sp-2-i]) + } + f := reflect.ValueOf(mem[sp-1]) + mem = mem[:sp-l-1] + for _, v := range f.Call(in) { + mem = append(mem, v.Interface()) + } + case Dup: + mem = append(mem, mem[int(op[2])]) + case Equal: + mem[sp-2] = mem[sp-2].(int) == mem[sp-1].(int) + mem = mem[:sp-1] + case Exit: + return err + case Fdup: + mem = append(mem, mem[int(op[2])+fp-1]) + case Jump: + ip += int(op[2]) + continue + case JumpTrue: + cond := mem[sp-1].(bool) + mem = mem[:sp-1] + if cond { + ip += int(op[2]) + continue + } + case JumpFalse: + cond := mem[sp-1].(bool) + mem = mem[:sp-1] + if !cond { + ip += int(op[2]) + continue + } + case Lower: + mem[sp-2] = mem[sp-1].(int) < mem[sp-2].(int) + mem = mem[:sp-1] + case Loweri: + mem[sp-1] = mem[sp-1].(int) < int(op[2]) + case Pop: + mem = mem[:sp-int(op[2])] + case Push: + mem = append(mem, int(op[2])) + case Return: + ip = mem[fp-2].(int) + ofp := fp + fp = mem[fp-1].(int) + mem = append(mem[:ofp-int(op[2])-int(op[3])-1], mem[sp-int(op[2]):]...) + continue + case Sub: + mem[sp-2] = mem[sp-1].(int) - mem[sp-2].(int) + mem = mem[:sp-1] + case Subi: + mem[sp-1] = mem[sp-1].(int) - int(op[2]) + } + ip++ + } +} + +func (m *Machine) PushCode(code ...[]int64) (p int) { + p = len(m.code) + m.code = append(m.code, code...) + return p +} + +func (m *Machine) SetIP(ip int) { m.ip = ip } +func (m *Machine) Push(v ...any) (l int) { l = len(m.mem); m.mem = append(m.mem, v...); return l } +func (m *Machine) Pop() (v any) { l := len(m.mem) - 1; v = m.mem[l]; m.mem = m.mem[:l]; return v } +func (m *Machine) Top() (v any) { + if l := len(m.mem); l > 0 { + v = m.mem[l-1] + } + return v +} + +func (m *Machine) PopExit() { + if l := len(m.code); l > 0 && m.code[l-1][1] == Exit { + m.code = m.code[:l-1] + } +} + +func CodeString(op []int64) string { + switch len(op) { + case 2: + return strop[op[1]] + case 3: + return fmt.Sprintf("%s %d", strop[op[1]], op[2]) + case 4: + return fmt.Sprintf("%s %d %d", strop[op[1]], op[2], op[3]) + } + return "" +} + +// Disassemble returns the code as a readable string. 
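+//
+// For example, the simple addition program of test #00 disassembles to:
+//
+//	Push 1
+//	Push 2
+//	Add
+//	Exit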
+func Disassemble(code [][]int64) (asm string) { + for _, op := range code { + asm += CodeString(op) + "\n" + /* + switch len(op) { + case 2: + asm += strop[op[1]] + "\n" + case 3: + asm += fmt.Sprintf("%s %d\n", strop[op[1]], op[2]) + case 4: + asm += fmt.Sprintf("%s %d %d\n", strop[op[1]], op[2], op[3]) + } + */ + } + return asm +} diff --git a/vm/vm_test.go b/vm/vm_test.go new file mode 100644 index 0000000..08e8054 --- /dev/null +++ b/vm/vm_test.go @@ -0,0 +1,155 @@ +package vm + +import ( + "fmt" + "log" + "testing" +) + +func init() { + log.SetFlags(log.Lshortfile) +} + +func TestVM(t *testing.T) { + for _, test := range tests { + test := test + t.Run("", func(t *testing.T) { + m := &Machine{} + for _, v := range test.sym { + m.Push(v) + } + m.PushCode(test.code...) + if err := m.Run(); err != nil { + t.Errorf("run error: %v", err) + } + t.Log(m.mem) + r := fmt.Sprintf("%v", m.mem[test.start:test.end]) + if r != test.mem { + t.Errorf("got %v, want %v", r, test.mem) + } + }) + } +} + +func BenchmarkVM(b *testing.B) { + for _, test := range tests { + test := test + b.Run("", func(b *testing.B) { + for i := 0; i < b.N; i++ { + b.StopTimer() + m := &Machine{} + m.PushCode(test.code...) + b.StartTimer() + + if err := m.Run(); err != nil { + b.Errorf("run error: %v", err) + } + } + }) + } +} + +var tests = []struct { + sym []any // initial memory values + code [][]int64 // bytecode to execute + start, end int // + mem string // expected memory content +}{{ // #00 -- A simple addition. + code: [][]int64{ + {0, Push, 1}, + {0, Push, 2}, + {0, Add}, + {0, Exit}, + }, + start: 0, end: 1, mem: "[3]", +}, { // #01 -- Calling a function defined outside the VM. + sym: []any{fmt.Println, "Hello"}, + code: [][]int64{ + {0, Dup, 0}, + {0, CallX, 1}, + {0, Exit}, + }, + start: 1, end: 3, mem: "[6 ]", +}, { // #02 -- Defining and calling a function in VM. + code: [][]int64{ + {0, Jump, 3}, // 0 + {0, Push, 3}, // 1 + {0, Return, 1, 1}, // 2 + {0, Push, 1}, // 3 + {0, Calli, -3}, // 4 + {0, Exit}, // 5 + }, + start: 0, end: 1, mem: "[3]", +}, { // #03 -- Defining and calling a function in VM. + code: [][]int64{ + {0, Jump, 3}, // 0 + {0, Push, 3}, // 1 + {0, Return, 1, 1}, // 2 + {0, Push, 1}, // 3 + {0, Push, 1}, // 4 + {0, Call}, // 5 + {0, Exit}, // 6 + }, + start: 0, end: 1, mem: "[3]", +}, { // #04 -- Defining and calling a function in VM. + code: [][]int64{ + {0, Jump, 5}, // 0 + {0, Push, 3}, // 1 + {0, Fassign, -2}, // 2 + {0, Fdup, -2}, // 3 + {0, Return, 1, 1}, // 4 + {0, Push, 1}, // 5 + {0, Push, 1}, // 6 + {0, Call}, // 7 + {0, Exit}, // 8 + }, + start: 0, end: 1, mem: "[3]", +}, { // #05 -- Fibonacci numbers, hand written. Showcase recursivity. + code: [][]int64{ + {0, Jump, 19}, // 0 + {0, Push, 2}, // 2 [2] + {0, Fdup, -2}, // 1 [2 i] + {0, Lower}, // 3 [true/false] + {0, JumpTrue, 13}, // 4 [], goto 17 + {0, Push, 2}, // 6 [i 2] + {0, Fdup, -2}, // 5 [i] + {0, Sub}, // 7 [(i-2)] + {0, Push, 1}, // 8 + {0, Call}, // 9 [fib(i-2)] + {0, Push, 1}, // 11 [(i-2) i 1] + {0, Fdup, -2}, // 10 [fib(i-2) i] + {0, Sub}, // 12 [(i-2) (i-1)] + {0, Push, 1}, // 13 + {0, Call}, // 14 [fib(i-2) fib(i-1)] + {0, Add}, // 15 [fib(i-2)+fib(i-1)] + {0, Return, 1, 1}, // 16 return i + {0, Fdup, -2}, // 17 [i] + {0, Return, 1, 1}, // 18 return i + {0, Push, 6}, // 19 [1] + {0, Push, 1}, // 20 + {0, Call}, // 21 [fib(*1)] + {0, Exit}, // 22 + }, + start: 0, end: 1, mem: "[8]", +}, { // #06 -- Fibonacci with some immediate instructions. 
+ code: [][]int64{ + {0, Jump, 14}, // 0 + {0, Fdup, -2}, // 1 [i] + {0, Loweri, 2}, // 2 [true/false] + {0, JumpTrue, 9}, // 3 [], goto 12 + {0, Fdup, -2}, // 4 [i] + {0, Subi, 2}, // 5 [(i-2)] + {0, Calli, -5}, // 6 [fib(i-2)] + {0, Fdup, -2}, // 7 [fib(i-2) i] + {0, Subi, 1}, // 8 [(i-2) (i-1)] + {0, Calli, -8}, // 9 [fib(i-2) fib(i-1)], call 1 + {0, Add}, // 10 [fib(i-2)+fib(i-1)] + {0, Return, 1, 1}, // 11 return i + {0, Fdup, -2}, // 12 [i] + {0, Return, 1, 1}, // 13 return i + {0, Push, 6}, // 14 [1] + {0, Calli, -14}, // 15 [fib(*1)], call 1 + {0, Exit}, // 16 + }, + start: 0, end: 1, mem: "[8]", +}} diff --git a/vm0/README.md b/vm0/README.md deleted file mode 100644 index fc89429..0000000 --- a/vm0/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# vm0 - -vm0 is a virtual machine executing directly the syntax tree. - -```mermaid -graph LR -s[ ] --> |source| a(scanner) ---> |tokens| b(parser) ---> |AST| c(vm) -subgraph vm0 - c -end -style s height:0px; -``` - -The execution is performed by walking the AST and evaluating each -visited node. - - -## Motivation - -- have a reference execution model for each defined language -- usable for compilation time evaluation -- to modelize similar VMs (i.e. gnovm) -- to validate and compare with other VMs (once it is itself validated) -- could serve as a basis for AST based symbolic execution (to be - investigated) diff --git a/vm0/func.go b/vm0/func.go deleted file mode 100644 index ee503bc..0000000 --- a/vm0/func.go +++ /dev/null @@ -1,75 +0,0 @@ -package vm0 - -import ( - "reflect" - "strings" - - "github.com/gnolang/parscan/parser" -) - -var types = map[string]reflect.Type{ - "int": reflect.TypeOf(0), - "string": reflect.TypeOf(""), -} - -func (i *Interp) callFunc(n *parser.Node) { - fp := i.fp - l := len(i.stack) - nargs := i.stack[l-1].(int) - args := make([]reflect.Value, nargs) - for j := range args { - args[nargs-j-1] = reflect.ValueOf(i.stack[l-2-j]) - } - f := reflect.ValueOf(i.stack[l-2-nargs]) - out := f.Call(args) - i.fp = fp - i.stack = i.stack[:l-2-nargs] - for _, v := range out { - i.push(v.Interface()) - } -} - -func (i *Interp) declareFunc(r *parser.Node, scope string) { - fname := r.Child[0].Content() - - // Add symbols for input and output function arguments. - inArgs := r.Child[1].Child - fscope := strings.TrimPrefix(scope+"/"+fname+"/", "/") - in := make([]reflect.Type, len(inArgs)) - for j, c := range inArgs { - i.sym[fscope+c.Content()] = j - in[j] = types[c.Child[0].Content()] - } - var out []reflect.Type - if len(r.Child) > 3 { // function has return values - if i.IsBlock(r.Child[2]) { - outArgs := r.Child[2].Child - out = make([]reflect.Type, len(outArgs)) - for j, c := range outArgs { - out[j] = types[c.Content()] - } - } else { - out = []reflect.Type{types[r.Child[2].Content()]} - } - } - funT := reflect.FuncOf(in, out, false) - - // Generate a wrapper function which will run function body AST. - f := reflect.MakeFunc(funT, func(args []reflect.Value) (res []reflect.Value) { - i.fp = len(i.stack) // fp will be restored by caller (callFunc). - for _, arg := range args { - i.push(arg.Interface()) - } - if err := i.Run(r.Child[len(r.Child)-1], fscope); err != nil { - panic(err) - } - b := len(i.stack) - len(out) - for j := range out { - res = append(res, reflect.ValueOf(i.stack[b+j])) - } - return res - }) - - // Add a symbol for newly created func. 
- i.sym[scope+fname] = i.push(f.Interface()) - i.fp -} diff --git a/vm0/vm.go b/vm0/vm.go deleted file mode 100644 index 62344ed..0000000 --- a/vm0/vm.go +++ /dev/null @@ -1,157 +0,0 @@ -package vm0 - -import ( - "fmt" - "os" - "strings" - - "github.com/gnolang/parscan/parser" -) - -const debug = true - -type Interp struct { - *parser.Parser - stack []any // stack memory space - fp int // frame pointer: index of current frame in stack - sym map[string]int // symbol table, maps scoped identifiers to offsets relative to fp -} - -func New(p *parser.Parser) (i *Interp) { - i = &Interp{Parser: p, stack: []any{}, sym: map[string]int{}} - i.sym["println"] = i.push(fmt.Println) - return i -} - -func (i *Interp) Eval(src string) (res any, err error) { - n := &parser.Node{} - if n.Child, err = i.Parse(src, n); err != nil { - return - } - if debug { - n.Dot(os.Getenv("DOT"), "") - } - for _, nod := range n.Child { - if err = i.Run(nod, ""); err != nil { - break - } - } - return -} - -// Run implements a stack based virtual machine which directly walks the AST. -func (i *Interp) Run(node *parser.Node, scope string) (err error) { - stop := false - - node.Walk2(nil, 0, func(n, a *parser.Node, k int) (ok bool) { - // Node pre-order processing. - switch n.Kind { - case parser.BlockStmt: - if a != nil && a.Kind == parser.StmtIf { - // Control-flow in 'if' sub-tree - if k == 1 { - // 'if' first body branch, evaluated when condition is true. - if len(a.Child) > 2 { - return i.peek().(bool) // keep condition on stack for else branch - } - return i.pop().(bool) - } - // 'else' body branch, evaluated when condition is false. - return !i.pop().(bool) - } - case parser.DeclFunc: - i.declareFunc(n, scope) - return false - } - return true - }, func(n, a *parser.Node, k int) (ok bool) { - // Node post-order processing. - if stop { - return false - } - l := len(i.stack) - switch n.Kind { - case parser.LiteralNumber: - switch v := n.Value().(type) { - case int64: - i.push(int(v)) - case error: - err = v - return false - default: - err = fmt.Errorf("type not supported: %T\n", v) - return false - } - case parser.LiteralString: - i.push(n.Block()) - case parser.OpInferior: - i.stack[l-2] = i.stack[l-2].(int) < i.stack[l-1].(int) - i.stack = i.stack[:l-1] - case parser.OpAdd: - i.stack[l-2] = i.stack[l-2].(int) + i.stack[l-1].(int) - i.stack = i.stack[:l-1] - case parser.OpSubtract: - i.stack[l-2] = i.stack[l-2].(int) - i.stack[l-1].(int) - i.stack = i.stack[:l-1] - case parser.OpMultiply: - i.stack[l-2] = i.stack[l-2].(int) * i.stack[l-1].(int) - i.stack = i.stack[:l-1] - case parser.OpAssign, parser.OpDefine: - i.stack[i.stack[l-2].(int)] = i.stack[l-1] - i.stack = i.stack[:l-2] - case parser.StmtReturn: - stop = true - return false - case parser.ExprCall: - i.push(len(n.Child[1].Child)) // number of arguments to call - i.callFunc(n) - case parser.Ident: - name := n.Content() - v, sc, ok := i.getSym(name, scope) - fp := i.fp - if sc == "" { - fp = 0 - } - if ok { - if a.Content() == ":=" { - i.push(fp + v) // reference for assign (absolute index) - break - } - i.push(i.stack[fp+v]) // value - break - } - if a.Content() != ":=" { - fmt.Println("error: undefined:", name, "scope:", scope) - } - v = i.push(any(nil)) - i.fp // v is the address of new value, relative to frame pointer - i.sym[scope+name] = v // bind scoped name to address in symbol table - i.push(v) - } - return true - }) - return nil -} - -// getSym searches for an existing symbol starting from the deepest scope. 
-func (i *Interp) getSym(name, scope string) (index int, sc string, ok bool) { - for { - if index, ok = i.sym[scope+name]; ok { - return index, scope, ok - } - scope = strings.TrimSuffix(scope, "/") - j := strings.LastIndex(scope, "/") - if j == -1 { - scope = "" - break - } - if scope = scope[:j]; scope == "" { - break - } - } - index, ok = i.sym[name] - return index, scope, ok -} - -func (i *Interp) peek() any { return i.stack[len(i.stack)-1] } -func (i *Interp) push(v any) (l int) { l = len(i.stack); i.stack = append(i.stack, v); return } -func (i *Interp) pop() (v any) { l := len(i.stack) - 1; v = i.stack[l]; i.stack = i.stack[:l]; return } diff --git a/vm0/vm_test.go b/vm0/vm_test.go deleted file mode 100644 index a4a24e6..0000000 --- a/vm0/vm_test.go +++ /dev/null @@ -1,26 +0,0 @@ -package vm0 - -import ( - "os" - "testing" - - "github.com/gnolang/parscan/lang/golang" -) - -func TestEval(t *testing.T) { - i := New(golang.GoParser) - t.Logf("%#v\n", i.Parser) - //i.Eval("println(2*5)") - //n, _ := i.Parse("println(2*5)") - //n, _ := i.Parse(`a := 2 + 5`) - src := `a := 2` - nodes, err := i.Parse(src, nil) - if err != nil { - t.Errorf("error %v", err) - } - i.Adot(nodes, os.Getenv("DOT")) - for _, n := range nodes { - err := i.Run(n, "") - t.Log(err) - } -} diff --git a/vm1/README.md b/vm1/README.md deleted file mode 100644 index a06dd34..0000000 --- a/vm1/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# vm1 - -`vm1` is a bytecode based stack machine. - -The purpose of `vm1` is to provide a simple, fast, embeddable and -portable Go execution environment. - -```mermaid -graph LR -s[ ] --> |source| a(scanner) ---> |tokens| b(parser) ---> |AST| c(codegen) ---> |bytecode| d[vm] -subgraph vm1 - d -end -style s height:0px; -``` - -The bytecode consists of a dozen of instructions, each taking 0 or 1 -immediate argument (non-immediate arguments are only passed through the -stack). Only a few operators for a few types are implemented. I expect -to have 1 instruction per operator per numerical type, all with the same -pattern, which would be generated from a template. Estimate is around 20 -operators and 10 numerical types, so around 200 instructions in final. - -Structurally, the vm implements logical and arithmetic operators, -condional jumps for `if`, `for` and `switch` control flow, and function -call, return and frame management. - -the memory state of the vm is a slice of Go interfaces (`[]any`). - -The whole vm is coded in a single function of 80 lines with no -dependencies. The size will grow as we add missing instructions, but the -code complexity will remain the same. - -the vm1 package is totally standalone and could be used for any purpose -outside of parscan and/or gno. diff --git a/vm1/vm.go b/vm1/vm.go deleted file mode 100644 index 01bba83..0000000 --- a/vm1/vm.go +++ /dev/null @@ -1,196 +0,0 @@ -package vm1 - -import ( - "fmt" // for tracing only - "log" // for tracing only - "reflect" // for optional CallX only - "strconv" // for tracing only -) - -const debug = true - -// Byte-code instruction set. -const ( - // instruction effect on stack: values consumed -- values produced - Nop = iota // -- - Add // n1 n2 -- sum ; sum = n1+n2 - Assign // val -- ; mem[$1] = val - Call // f [a1 .. ai] -- [r1 .. rj] ; r1, ... = prog[f](a1, ...) - CallX // f [a1 .. ai] -- [r1 .. rj] ; r1, ... = mem[f](a1, ...) 
- Dup // addr -- value ; value = mem[addr] - Fdup // addr -- value ; value = mem[addr] - Exit // -- ; - Jump // -- ; ip += $1 - JumpTrue // cond -- ; if cond { ip += $1 } - JumpFalse // cond -- ; if cond { ip += $1 } - Lower // n1 n2 -- cond ; cond = n1 < n2 - Loweri // n1 -- cond ; cond = n1 < $1 - Pop // v -- - Push // -- v - Return // [r1 .. ri] -- ; exit frame: sp = fp, fp = pop - Sub // n1 n2 -- diff ; diff = n1 - n2 - Subi // n1 -- diff ; diff = n1 - $1 -) - -var strop = [...]string{ // for VM tracing. - Nop: "Nop", - Add: "Add", - Assign: "Assign", - Call: "Call", - CallX: "CallX", - Dup: "Dup", - Fdup: "Fdup", - Exit: "Exit", - Jump: "Jump", - JumpTrue: "JumpTrue", - JumpFalse: "JumpFalse", - Lower: "Lower", - Loweri: "Loweri", - Pop: "Pop", - Push: "Push", - Return: "Return", - Sub: "Sub", - Subi: "Subi", -} - -// Machine represents a virtual machine. -type Machine struct { - code [][]int64 // code to execute - mem []any // memory, as a stack - ip, fp int // instruction and frame pointer - ic uint64 // instruction counter, incremented at each instruction executed - // flags uint // to set options such as restrict CallX, etc... -} - -// Run runs a program. -func (m *Machine) Run() (err error) { - code, mem, ip, fp, sp, ic := m.code, m.mem, m.ip, m.fp, 0, m.ic - - defer func() { m.mem, m.ip, m.fp, m.ic = mem, ip, fp, ic }() - - trace := func() { - if !debug { - return - } - var op2, op3 string - c := code[ip] - if l := len(c); l > 2 { - op2 = strconv.Itoa(int(c[2])) - if l > 3 { - op3 = strconv.Itoa(int(c[3])) - } - } - log.Printf("ip:%-4d sp:%-4d fp:%-4d op:[%-9s %-4s %-4s] mem:%v\n", ip, sp, fp, strop[c[1]], op2, op3, mem) - } - - for { - sp = len(mem) // stack pointer - trace() - ic++ - switch op := code[ip]; op[1] { - case Add: - mem[sp-2] = mem[sp-2].(int) + mem[sp-1].(int) - mem = mem[:sp-1] - case Assign: - mem[op[2]] = mem[sp-1] - mem = mem[:sp-1] - case Call: - mem = append(mem, ip+1, fp) - fp = sp + 2 - ip += int(op[2]) - continue - case CallX: // Should be made optional. - l := int(op[2]) - in := make([]reflect.Value, l) - for i := range in { - in[l-1-i] = reflect.ValueOf(mem[sp-1-i]) - } - f := reflect.ValueOf(mem[sp-l-1]) - mem = mem[:sp-l-1] - for _, v := range f.Call(in) { - mem = append(mem, v.Interface()) - } - case Dup: - mem = append(mem, mem[int(op[2])]) - case Exit: - return - case Fdup: - mem = append(mem, mem[int(op[2])+fp-1]) - case Jump: - ip += int(op[2]) - continue - case JumpTrue: - cond := mem[sp-1].(bool) - mem = mem[:sp-1] - if cond { - ip += int(op[2]) - continue - } - case JumpFalse: - cond := mem[sp-1].(bool) - mem = mem[:sp-1] - if !cond { - ip += int(op[2]) - continue - } - case Lower: - mem[sp-2] = mem[sp-2].(int) < mem[sp-1].(int) - mem = mem[:sp-1] - case Loweri: - mem[sp-1] = mem[sp-1].(int) < int(op[2]) - case Pop: - mem = mem[:sp-int(op[2])] - case Push: - mem = append(mem, int(op[2])) - case Return: - ip = mem[fp-2].(int) - ofp := fp - fp = mem[fp-1].(int) - mem = append(mem[:ofp-int(op[2])-int(op[3])-1], mem[sp-int(op[2]):]...) - continue - case Sub: - mem[sp-2] = mem[sp-2].(int) - mem[sp-1].(int) - mem = mem[:sp-1] - case Subi: - mem[sp-1] = mem[sp-1].(int) - int(op[2]) - } - ip++ - } -} - -func (m *Machine) PushCode(code ...[]int64) (p int) { - p = len(m.code) - m.code = append(m.code, code...) 
- return p -} - -func (m *Machine) SetIP(ip int) { m.ip = ip } -func (m *Machine) Push(v ...any) (l int) { l = len(m.mem); m.mem = append(m.mem, v...); return } -func (m *Machine) Pop() (v any) { l := len(m.mem) - 1; v = m.mem[l]; m.mem = m.mem[:l]; return } -func (m *Machine) Top() (v any) { - if l := len(m.mem); l > 0 { - v = m.mem[l-1] - } - return -} - -func (m *Machine) PopExit() { - if l := len(m.code); l > 0 && m.code[l-1][1] == Exit { - m.code = m.code[:l-1] - } -} - -// Disassemble returns the code as a readable string. -func Disassemble(code [][]int64) (asm string) { - for _, op := range code { - switch len(op) { - case 2: - asm += strop[op[1]] + "\n" - case 3: - asm += fmt.Sprintf("%s %d\n", strop[op[1]], op[2]) - case 4: - asm += fmt.Sprintf("%s %d %d\n", strop[op[1]], op[2], op[3]) - } - } - return asm -} diff --git a/vm1/vm_test.go b/vm1/vm_test.go deleted file mode 100644 index 272a321..0000000 --- a/vm1/vm_test.go +++ /dev/null @@ -1,122 +0,0 @@ -package vm1 - -import ( - "fmt" - "testing" -) - -func TestVM(t *testing.T) { - for _, test := range tests { - test := test - t.Run("", func(t *testing.T) { - m := &Machine{} - for _, v := range test.sym { - m.Push(v) - } - m.PushCode(test.code...) - if err := m.Run(); err != nil { - t.Errorf("run error: %v", err) - } - t.Log(m.mem) - r := fmt.Sprintf("%v", m.mem[test.start:test.end]) - if r != test.mem { - t.Errorf("got %v, want %v", r, test.mem) - } - }) - } -} - -func BenchmarkVM(b *testing.B) { - for _, test := range tests { - test := test - b.Run("", func(b *testing.B) { - for i := 0; i < b.N; i++ { - b.StopTimer() - m := &Machine{} - m.PushCode(test.code...) - b.StartTimer() - - if err := m.Run(); err != nil { - b.Errorf("run error: %v", err) - } - } - }) - } -} - -var tests = []struct { - sym []any // initial memory values - code [][]int64 // bytecode to execute - start, end int // - mem string // expected memory content -}{{ // #00 -- A simple addition. - code: [][]int64{ - {0, Push, 1}, - {0, Push, 2}, - {0, Add}, - {0, Exit}, - }, - start: 0, end: 1, mem: "[3]", -}, { // #01 -- Calling a function defined outside the VM. - sym: []any{fmt.Println, "Hello"}, - code: [][]int64{ - {0, CallX, 1}, - {0, Exit}, - }, - start: 0, end: 2, mem: "[6 ]", -}, { // #02 -- Defining and calling a function in VM. - code: [][]int64{ - {0, Jump, 3}, // 0 - {0, Push, 3}, // 1 - {0, Return, 1, 1}, // 2 - {0, Push, 1}, // 3 - {0, Call, -3}, // 4 - {0, Exit}, // 5 - }, - start: 0, end: 1, mem: "[3]", -}, { // #03 -- Fibonacci numbers, hand written. Showcase recursivity. - code: [][]int64{ - {0, Jump, 17}, // 0 - {0, Fdup, -2}, // 1 [i] - {0, Push, 2}, // 2 [i 2] - {0, Lower}, // 3 [true/false] - {0, JumpTrue, 11}, // 4 [], goto 16 - {0, Fdup, -2}, // 5 [i] - {0, Push, 2}, // 6 [i 2] - {0, Sub}, // 7 [(i-2)] - {0, Call, -7}, // 8 [fib(i-2)] - {0, Fdup, -2}, // 9 [fib(i-2) i] - {0, Push, 1}, // 10 [(i-2) i 1] - {0, Sub}, // 11 [(i-2) (i-1)] - {0, Call, -11}, // 12 [fib(i-2) fib(i-1)] - {0, Add}, // 13 [fib(i-2)+fib(i-1)] - {0, Return, 1, 1}, // 14 return i - {0, Fdup, -2}, // 15 [i] - {0, Return, 1, 1}, // 16 return i - {0, Push, 6}, // 17 [1] - {0, Call, -17}, // 18 [fib(*1)] - {0, Exit}, // 19 - }, - start: 0, end: 1, mem: "[8]", -}, { // #04 -- Fibonacci with some immediate instructions. 
- code: [][]int64{ - {0, Jump, 14}, // 0 - {0, Fdup, -2}, // 1 [i] - {0, Loweri, 2}, // 2 [true/false] - {0, JumpTrue, 9}, // 3 [], goto 13 - {0, Fdup, -2}, // 4 [i] - {0, Subi, 2}, // 5 [(i-2)] - {0, Call, -5}, // 6 [fib(i-2)] - {0, Fdup, -2}, // 7 [fib(i-2) i] - {0, Subi, 1}, // 8 [(i-2) (i-1)] - {0, Call, -8}, // 9 [fib(i-2) fib(i-1)], call 1 - {0, Add}, // 10 [fib(i-2)+fib(i-1)] - {0, Return, 1, 1}, // 11 return i - {0, Fdup, -2}, // 12 [i] - {0, Return, 1, 1}, // 13 return i - {0, Push, 6}, // 14 [1] - {0, Call, -14}, // 15 [fib(*1)], call 1 - {0, Exit}, // 16 - }, - start: 0, end: 1, mem: "[8]", -}} -- cgit v1.2.3