From 80c277773a1e73267832641574654361b85e6028 Mon Sep 17 00:00:00 2001 From: Marc Vertes Date: Mon, 10 Jul 2023 15:54:13 +0200 Subject: first commit --- parser/dot.go | 74 +++++++++++++++++ parser/node.go | 45 ++++++++++ parser/parse.go | 210 +++++++++++++++++++++++++++++++++++++++++++++++ parser/parse_test.go | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 557 insertions(+) create mode 100644 parser/dot.go create mode 100644 parser/node.go create mode 100644 parser/parse.go create mode 100644 parser/parse_test.go (limited to 'parser') diff --git a/parser/dot.go b/parser/dot.go new file mode 100644 index 0000000..cef1acb --- /dev/null +++ b/parser/dot.go @@ -0,0 +1,74 @@ +package parser + +import ( + "bytes" + "fmt" + "io" + "log" + "os/exec" + "strings" +) + +func (*Parser) Adot(nodes []*Node, c string) { + if c == "" { + return + } + n := &Node{Child: nodes} + n.Dot(c, "") +} + +func (n *Node) Dot(c, s string) { n.astDot(dotWriter(c), s) } + +func (n *Node) Sdot(s string) string { + var buf bytes.Buffer + n.astDot(&buf, s) + return buf.String() +} + +// TODO: rewrite it using Walk2 +func (n *Node) astDot(out io.Writer, label string) { + fmt.Fprintf(out, "digraph ast { ") + if label != "" { + fmt.Fprintf(out, "labelloc=\"t\"; label=\"%s\";", label) + } + anc := map[*Node]*Node{} + index := map[*Node]int{} + count := 0 + n.Walk(func(nod *Node) bool { + index[nod] = count + count++ + + for _, c := range nod.Child { + anc[c] = nod + } + name := strings.ReplaceAll(nod.Name(), `"`, `\"`) + fmt.Fprintf(out, "%d [label=\"%s\"]; ", index[nod], name) + if anc[nod] != nil { + fmt.Fprintf(out, "%d -> %d; ", index[anc[nod]], index[nod]) + } + return true + }, nil) + fmt.Fprintf(out, "}") +} + +type nopCloser struct { + io.Writer +} + +func (nopCloser) Close() error { return nil } + +func dotWriter(dotCmd string) io.WriteCloser { + if dotCmd == "" { + return nopCloser{io.Discard} + } + fields := strings.Fields(dotCmd) + cmd := exec.Command(fields[0], fields[1:]...) + dotin, err := cmd.StdinPipe() + if err != nil { + log.Fatal(err) + } + if err = cmd.Start(); err != nil { + log.Fatal(err) + } + return dotin +} diff --git a/parser/node.go b/parser/node.go new file mode 100644 index 0000000..6ebc2a7 --- /dev/null +++ b/parser/node.go @@ -0,0 +1,45 @@ +package parser + +import "github.com/gnolang/parscan/scanner" + +type Kind int + +type Node struct { + Child []*Node // sub-tree nodes + scanner.Token // token at origin of the node + Kind // Node kind, depends on the language spec + + unary bool // TODO(marc): get rid of it. +} + +// TODO: remove it in favor of Walk2 +func (n *Node) Walk(in, out func(*Node) bool) (stop bool) { + if in != nil && !in(n) { + return true + } + for _, child := range n.Child { + if child.Walk(in, out) { + return + } + } + if out != nil { + stop = !out(n) + } + return +} + +// Idem to walk, but also propagates the ancestor of visited node and child index. +func (n *Node) Walk2(a *Node, i int, in, out func(*Node, *Node, int) bool) (stop bool) { + if in != nil && !in(n, a, i) { + return true + } + for j, child := range n.Child { + if child.Walk2(n, j, in, out) { + return + } + } + if out != nil { + stop = !out(n, a, i) + } + return +} diff --git a/parser/parse.go b/parser/parse.go new file mode 100644 index 0000000..cfbff27 --- /dev/null +++ b/parser/parse.go @@ -0,0 +1,210 @@ +package parser + +import ( + "github.com/gnolang/parscan/scanner" +) + +const ( + Stmt = 1 << iota + ExprSep + Call + Index + Decl +) + +type NodeSpec struct { + Kind // AST node kind + Flags uint // composable properties used for AST generation + Order int // operator precedence order +} + +type Parser struct { + *scanner.Scanner + Spec map[string]NodeSpec +} + +func (p *Parser) Parse(src string) (n []*Node, err error) { + tokens, err := p.Scan(src) + if err != nil { + return + } + return p.ParseTokens(tokens) +} + +func (p *Parser) ParseTokens(token []scanner.Token) (roots []*Node, err error) { + // TODO: error handling. + var root *Node // current root node + var expr *Node // current expression root node + var prev, c *Node + var lce *Node + for i, t := range token { + prev = c + c = &Node{ + Token: t, + Kind: p.Spec[t.Name()].Kind, + unary: t.Kind() == scanner.Operator && (i == 0 || token[i-1].Kind() == scanner.Operator), + } + if c.Kind == 0 { + switch t.Kind() { + case scanner.Number: + c.Kind = p.Spec["#num"].Kind + case scanner.Identifier: + c.Kind = p.Spec["#id"].Kind + } + } + + if root == nil { + if p.isSep(c) { + continue + } + lce = nil + root = c + if p.isExpr(c) { + expr = c + } + continue + } + + if t.Kind() == scanner.Block { + if expr != nil { + if p.hasBlockProp(c, ExprSep) && p.isExprSep(root) { + // A bracket block may end a previous expression. + root.Child = append(root.Child, expr) + expr = nil + } else if p.hasBlockProp(c, Call) && !p.hasProp(root, Decl) && p.canCallToken(token[i-1]) { + // Handle (possibly nested) call expressions. + if lce == nil || lce != expr { // TODO(marc): not general, fix it. + lce = prev + } + lce.Child = []*Node{{Token: lce.Token, Child: lce.Child, Kind: lce.Kind}} + lce.Token = scanner.NewToken("Call", c.Pos()) + lce.Kind = p.Spec["#call"].Kind + } + } + tcont := t.Content() + s := tcont[t.Start() : len(tcont)-t.End()] + n2, err := p.Parse(s) + if err != nil { + return nil, err + } + c.Child = append(c.Child, n2...) + } + + // Process the end of an expression or a statement. + if t.Kind() == scanner.Separator { + if expr != nil && p.hasProp(root, Stmt) { + root.Child = append(root.Child, expr) + if p.hasBlockProp(expr, ExprSep) { + roots = append(roots, root) + root = nil + } + expr = nil + } else { + if expr != nil { + root = expr + } + roots = append(roots, root) + expr = nil + root = nil + } + continue + } + + // We assume from now that current node is part of an expression subtree. + if expr == nil { + if p.isStatement(root) { + expr = c + continue + } + expr = root + } + + // Update the expression subtree according to binary operator precedence rules. + // - operators are binary infix by default. + // - if an operator follows another, then it's unary prefix. + // - if an expression starts by an operator, then it's unary prefix. + // - non operator nodes have a default precedence of 0. + // TODO: handle postfix unary (i.e. ++) and ternary (i.e. ?:) + // + ep := p.Spec[expr.Content()].Order + cp := p.Spec[c.Content()].Order + a := expr + if c.unary { + cp = 0 + } + if cp != 0 { + if cp > ep { + // Perform an operator permutation at expr root as required by precedence. + // TODO(marc): maybe it can be generalized in below else branch. + expr, c = c, expr + a = expr // Temporary ancestor: its children may have to be permuted. + } else { + // Findout if an operator permutation is necessary in subtree. + c1 := expr + for { + a = c1 + if c1.unary { + c1, c = c, c1 + a = c1 + if c == expr { + expr = a + } + break + } + if len(c1.Child) < 2 { + break + } + c1 = c1.Child[1] + if c1.Token.Kind() != scanner.Operator || c1.unary || cp > p.Spec[c1.Content()].Order { + break + } + } + // No permutation occured. Append current to last visited ancestor. + if len(a.Child) > 1 { + a.Child = a.Child[:1] + c.Child = append(c.Child, c1) + } + } + } else if ep != 0 { + for len(a.Child) > 1 { + a = a.Child[1] + } + } + a.Child = append(a.Child, c) + if p.hasProp(a, Call) { + lce = a + } + } + if root != nil && p.isStatement(root) { + if expr != nil { + root.Child = append(root.Child, expr) + } + } else if expr != nil { + root = expr + } + if root != nil { + roots = append(roots, root) + } + return roots, err +} + +func (p *Parser) hasProp(n *Node, prop uint) bool { return p.Spec[n.Name()].Flags&prop != 0 } +func (p *Parser) isStatement(n *Node) bool { return p.Spec[n.Content()].Flags&Stmt != 0 } +func (p *Parser) isExprSep(n *Node) bool { return p.Spec[n.Content()].Flags&ExprSep != 0 } +func (p *Parser) isExpr(n *Node) bool { return !p.isStatement(n) && !p.isExprSep(n) } +func (p *Parser) isSep(n *Node) bool { return n.Token.Kind() == scanner.Separator } +func (p *Parser) IsBlock(n *Node) bool { return n.Token.Kind() == scanner.Block } +func (p *Parser) hasBlockProp(n *Node, prop uint) bool { + return p.Spec[n.Content()[:n.Start()]].Flags&prop != 0 +} +func (p *Parser) precedenceToken(t scanner.Token) int { + s := t.Content() + if l := t.Start(); l > 0 { + s = s[:l] + } + return p.Spec[s].Order +} + +func (p *Parser) canCallToken(t scanner.Token) bool { + return p.precedenceToken(t) == 0 || p.Spec[t.Name()].Flags&Call != 0 +} diff --git a/parser/parse_test.go b/parser/parse_test.go new file mode 100644 index 0000000..dc363cb --- /dev/null +++ b/parser/parse_test.go @@ -0,0 +1,228 @@ +package parser + +import ( + "os" + "testing" + + "github.com/gnolang/parscan/scanner" +) + +var GoScanner = &scanner.Scanner{ + CharProp: [scanner.ASCIILen]uint{ + '\t': scanner.CharSep, + '\n': scanner.CharLineSep, + ' ': scanner.CharSep, + '!': scanner.CharOp, + '"': scanner.CharStr, + '%': scanner.CharOp, + '&': scanner.CharOp, + '\'': scanner.CharStr, + '(': scanner.CharBlock, + '*': scanner.CharOp, + '+': scanner.CharOp, + ',': scanner.CharGroupSep, + '-': scanner.CharOp, + '.': scanner.CharOp, + '/': scanner.CharOp, + ':': scanner.CharOp, + ';': scanner.CharGroupSep, + '<': scanner.CharOp, + '=': scanner.CharOp, + '>': scanner.CharOp, + '[': scanner.CharBlock, + '^': scanner.CharOp, + '{': scanner.CharBlock, + '|': scanner.CharOp, + '~': scanner.CharOp, + }, + End: map[string]string{ + "(": ")", + "{": "}", + "[": "]", + "/*": "*/", + `"`: `"`, + "'": "'", + "`": "`", + "//": "\n", + }, +} + +const ( + Undefined = Kind(iota) + FuncDecl + CallExpr + IfStmt + StmtBloc + ReturnStmt + ParBloc + MulOp + AddOp + SubOp + DotOp + AssignOp + DefOp + InfOp +) + +var GoParser = &Parser{ + Scanner: GoScanner, + Spec: map[string]NodeSpec{ + ".": {DotOp, Call, 3}, + "*": {MulOp, 0, 4}, + "+": {AddOp, 0, 5}, + "-": {SubOp, 0, 5}, + "<": {InfOp, 0, 6}, + ":=": {DefOp, 0, 7}, + "=": {AssignOp, 0, 7}, + "#Call": {CallExpr, 0, 0}, + "if": {IfStmt, Stmt | ExprSep, 0}, + "func": {FuncDecl, Decl | Call, 0}, + "return": {ReturnStmt, Stmt, 0}, + "{..}": {StmtBloc, ExprSep, 0}, + "{": {StmtBloc, ExprSep, 0}, + "(": {ParBloc, Call, 0}, + "(..)": {ParBloc, Call, 0}, + }, +} + +func TestParse(t *testing.T) { + for _, test := range goTests { + t.Run("", func(t *testing.T) { + var err error + errStr := "" + n := &Node{} + if n.Child, err = GoParser.Parse(test.src); err != nil { + errStr = err.Error() + } + if errStr != test.err { + t.Errorf("got error %#v, want error %#v", errStr, test.err) + } + if dot := n.Sdot(""); dot != test.dot { + t.Errorf("got %#v, want %#v", dot, test.dot) + } + t.Log(test.src) + t.Log(n.Sdot("")) + if dotCmd := os.Getenv("DOT"); dotCmd != "" { + n.Dot(dotCmd, "") + } + }) + } +} + +var goTests = []struct { + src, dot, err string + skip bool +}{{ // #00 + src: "", + dot: `digraph ast { 0 [label=""]; }`, +}, { // #01 + src: "12", + dot: `digraph ast { 0 [label=""]; 1 [label="12"]; 0 -> 1; }`, +}, { // #02 + src: "1 + 2", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="2"]; 1 -> 3; }`, +}, { // #03 + src: "1 + 2 * 3", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="3"]; 3 -> 5; }`, +}, { // #04 + src: "1 * 2 + 3", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="1"]; 2 -> 3; 4 [label="2"]; 2 -> 4; 5 [label="3"]; 1 -> 5; }`, +}, { // #05 + src: "a := 2 + 5", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="5"]; 3 -> 5; }`, +}, { // #06 + src: "a := 1 * 2 + 3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="*"]; 3 -> 4; 5 [label="1"]; 4 -> 5; 6 [label="2"]; 4 -> 6; 7 [label="3"]; 3 -> 7; }`, +}, { // #07 + src: "a := 1 + 2 * 3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="*"]; 3 -> 5; 6 [label="2"]; 5 -> 6; 7 [label="3"]; 5 -> 7; }`, +}, { // #08 + src: "a := 1 + 2 * 3 + 4", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="+"]; 3 -> 5; 6 [label="*"]; 5 -> 6; 7 [label="2"]; 6 -> 7; 8 [label="3"]; 6 -> 8; 9 [label="4"]; 5 -> 9; }`, +}, { // #09 + src: "a := 1 + 2 + 3 * 4", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="1"]; 3 -> 4; 5 [label="+"]; 3 -> 5; 6 [label="2"]; 5 -> 6; 7 [label="*"]; 5 -> 7; 8 [label="3"]; 7 -> 8; 9 [label="4"]; 7 -> 9; }`, +}, { // #10 + src: "a := 1 * 2 + 3 * 4", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="*"]; 3 -> 4; 5 [label="1"]; 4 -> 5; 6 [label="2"]; 4 -> 6; 7 [label="*"]; 3 -> 7; 8 [label="3"]; 7 -> 8; 9 [label="4"]; 7 -> 9; }`, +}, { // #11 + src: "a := (1 + 2) * 3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="(..)"]; 3 -> 4; 5 [label="+"]; 4 -> 5; 6 [label="1"]; 5 -> 6; 7 [label="2"]; 5 -> 7; 8 [label="3"]; 3 -> 8; }`, +}, { // #12 + src: "a := 2 * -3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="*"]; 1 -> 3; 4 [label="2"]; 3 -> 4; 5 [label="-"]; 3 -> 5; 6 [label="3"]; 5 -> 6; }`, +}, { // #13 + src: "-5 + 4", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="-"]; 1 -> 2; 3 [label="5"]; 2 -> 3; 4 [label="4"]; 1 -> 4; }`, +}, { // #14 + src: "-5 + -4", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="-"]; 1 -> 2; 3 [label="5"]; 2 -> 3; 4 [label="-"]; 1 -> 4; 5 [label="4"]; 4 -> 5; }`, +}, { // #15 + src: "a := -5 + -4", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="-"]; 3 -> 4; 5 [label="5"]; 4 -> 5; 6 [label="-"]; 3 -> 6; 7 [label="4"]; 6 -> 7; }`, +}, { // #16 + src: "*a := 5 * -3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="*"]; 1 -> 4; 5 [label="5"]; 4 -> 5; 6 [label="-"]; 4 -> 6; 7 [label="3"]; 6 -> 7; }`, +}, { // #17 + src: "*a := -5 * 3", + dot: `digraph ast { 0 [label=""]; 1 [label=":="]; 0 -> 1; 2 [label="*"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="*"]; 1 -> 4; 5 [label="-"]; 4 -> 5; 6 [label="5"]; 5 -> 6; 7 [label="3"]; 4 -> 7; }`, +}, { // #18 + src: "1+2\n3-4", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="1"]; 1 -> 2; 3 [label="2"]; 1 -> 3; 4 [label="-"]; 0 -> 4; 5 [label="3"]; 4 -> 5; 6 [label="4"]; 4 -> 6; }`, +}, { // #19 + src: "i = i+1", + dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="i"]; 1 -> 2; 3 [label="+"]; 1 -> 3; 4 [label="i"]; 3 -> 4; 5 [label="1"]; 3 -> 5; }`, +}, { // #20 + src: "a[12] = 5", + dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="[..]"]; 2 -> 3; 4 [label="12"]; 3 -> 4; 5 [label="5"]; 1 -> 5; }`, +}, { // #21 + src: "a[12][0] = 3", + dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="a"]; 1 -> 2; 3 [label="[..]"]; 2 -> 3; 4 [label="12"]; 3 -> 4; 5 [label="[..]"]; 2 -> 5; 6 [label="0"]; 5 -> 6; 7 [label="3"]; 1 -> 7; }`, +}, { // #22 + src: "a.b = 34", + dot: `digraph ast { 0 [label=""]; 1 [label="="]; 0 -> 1; 2 [label="."]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="b"]; 2 -> 4; 5 [label="34"]; 1 -> 5; }`, +}, { // #23 + src: "if i < 2 { return j }", + dot: `digraph ast { 0 [label=""]; 1 [label="if"]; 0 -> 1; 2 [label="<"]; 1 -> 2; 3 [label="i"]; 2 -> 3; 4 [label="2"]; 2 -> 4; 5 [label="{..}"]; 1 -> 5; 6 [label="return"]; 5 -> 6; 7 [label="j"]; 6 -> 7; }`, +}, { // #24 + src: "if i:=1; i < 2 { return j }", + dot: `digraph ast { 0 [label=""]; 1 [label="if"]; 0 -> 1; 2 [label=":="]; 1 -> 2; 3 [label="i"]; 2 -> 3; 4 [label="1"]; 2 -> 4; 5 [label="<"]; 1 -> 5; 6 [label="i"]; 5 -> 6; 7 [label="2"]; 5 -> 7; 8 [label="{..}"]; 1 -> 8; 9 [label="return"]; 8 -> 9; 10 [label="j"]; 9 -> 10; }`, +}, { // #25 + src: "f(i) + f(j)", + dot: `digraph ast { 0 [label=""]; 1 [label="+"]; 0 -> 1; 2 [label="Call"]; 1 -> 2; 3 [label="f"]; 2 -> 3; 4 [label="(..)"]; 2 -> 4; 5 [label="i"]; 4 -> 5; 6 [label="Call"]; 1 -> 6; 7 [label="f"]; 6 -> 7; 8 [label="(..)"]; 6 -> 8; 9 [label="j"]; 8 -> 9; }`, + //src: "f(i) + f(j)(4)", // not ok + /* + }, { // #26 + src: "if i < 2 {return i}; return f(i-2) + f(i-1)", + }, { // #27 + src: "for i < 2 { println(i) }", + }, { // #28 + src: "func f(i int) (int) { if i < 2 { return i}; return f(i-2) + f(i-1) }", + }, { // #29 + src: "a := []int{3, 4}", + }, { // #30 + //src: "a := struct{int}", + src: "a, b = c, d", + }, { // #31 + //src: "a := [2]int{3, 4}", + src: `fmt.Println("Hello")`, + //src: "(1 + 2) * (3 - 4)", + //src: "1 + (1 + 2)", + }, { // #32 + //src: `a(3)(4)`, + //src: `3 + 2 * a(3) + 5`, + //src: `3 + 2 * a(3)(4) + (5)`, + //src: `(a(3))(4)`, + src: `a(3)(4)`, + dot: `digraph ast { 0 [label=""]; 1 [label="Call"]; 0 -> 1; 2 [label="Call"]; 1 -> 2; 3 [label="a"]; 2 -> 3; 4 [label="(..)"]; 2 -> 4; 5 [label="3"]; 4 -> 5; 6 [label="(..)"]; 1 -> 6; 7 [label="4"]; 6 -> 7; }`, + //src: `println("Hello")`, + //src: `a.b.c + 3`, + }, { // #33 + src: `func f(a int, b int) {return a + b}; f(1+2)`, + }, { // #34 + src: `if a == 1 { + println(2) + } + println("bye")`, + */ +}} -- cgit v1.2.3