From c83cdc374f8d50189c2da3460a6db94a4527c1fc Mon Sep 17 00:00:00 2001 From: Marc Vertes Date: Thu, 24 Aug 2023 09:31:14 +0200 Subject: scanner: handle long string delimiters (#7) * scanner: handle long string delimiters Strings can now be delimited by arbitrary sequences of characters. It is also possible to exclude the end delimiter from the content, as required for example by `//` comments in C or Go. * scanner: fix handling strings within blocks --- scanner/readme.md | 11 ++-- scanner/scan.go | 168 ++++++++++++++++++++++++++++++++++----------------- scanner/scan_test.go | 124 ++++++++++++++++++++++++++----------- 3 files changed, 210 insertions(+), 93 deletions(-) (limited to 'scanner') diff --git a/scanner/readme.md b/scanner/readme.md index b8b31fb..c131a9f 100644 --- a/scanner/readme.md +++ b/scanner/readme.md @@ -31,12 +31,13 @@ A successful test must be provided to check the status. - [x] unescaped strings (including multiline) - [x] escaped string (including multiline) - [x] separators (in UTF-8 range) -- [ ] single line string (\n not allowed) +- [x] single line string (\n not allowed) - [x] identifiers (in UTF-8 range) - [x] operators, concatenated or not - [x] single character block/string delimiters - [x] arbitrarly nested blocks and strings -- [ ] multiple characters block/string delimiters -- [ ] blocks delimited by identifiers/operators/separators -- [ ] blocks with delimiter inclusion/exclusion rules -- [ ] blocks delimited by indentation level +- [x] multiple characters block/string delimiters +- [x] blocks delimited by operator characters +- [ ] blocks delimited by identifiers +- [x] blocks with delimiter inclusion/exclusion rules +- [ ] blocks delimited by indentation level (python, yaml, ...) 
diff --git a/scanner/scan.go b/scanner/scan.go index 066fc2a..2a4b125 100644 --- a/scanner/scan.go +++ b/scanner/scan.go @@ -3,6 +3,7 @@ package scanner import ( "errors" "fmt" + "regexp" "strconv" "strings" ) @@ -32,6 +33,8 @@ const ( CharBlock StrEsc StrNonl + ExcludeEnd // exclude end delimiter from content + EosValidEnd // end of input string terminates block or string token ) var ErrBlock = errors.New("block not terminated") @@ -74,11 +77,14 @@ const ASCIILen = 1 << 7 // 128 // Scanner contains the scanner rules for a language. type Scanner struct { - CharProp [ASCIILen]uint // Special Character properties. - End map[string]string // End delimiters. - DotNum bool // True if a number can start with '.'. - IdAscii bool // True if an identifier can be in ASCII only. - Num_ bool // True if a number can contain _ character. + CharProp [ASCIILen]uint // special Character properties + End map[string]string // end delimiters, indexed by start + BlockProp map[string]uint // block properties + DotNum bool // true if a number can start with '.' + IdAscii bool // true if an identifier can be in ASCII only + Num_ bool // true if a number can contain _ character + + sdre *regexp.Regexp // string delimiters regular expression } func (sc *Scanner) HasProp(r rune, p uint) bool { @@ -98,6 +104,23 @@ func (sc *Scanner) IsId(r rune) bool { return !sc.HasProp(r, CharOp|CharSep|CharLineSep|CharGroupSep|CharStr|CharBlock) } +func (sc *Scanner) Init() { + // Build a regular expression to match all string delimiters. + re := "(" + for s, p := range sc.BlockProp { + if p&CharStr == 0 { + continue + } + // TODO: sort keys in decreasing length order. 
+ for _, b := range []byte(s) { + re += fmt.Sprintf("\\x%02x", b) + } + re += "|" + } + re = strings.TrimSuffix(re, "|") + ")$" + sc.sdre = regexp.MustCompile(re) +} + func IsNum(r rune) bool { return '0' <= r && r <= '9' } func (sc *Scanner) Scan(src string) (tokens []Token, err error) { @@ -106,7 +129,6 @@ func (sc *Scanner) Scan(src string) (tokens []Token, err error) { for len(s) > 0 { t, err := sc.Next(s) if err != nil { - //return nil, fmt.Errorf("%s: %w '%s'", loc(src, offset+t.pos), err, t.delim) return nil, fmt.Errorf("%s: %w", loc(src, offset+t.pos), err) } if t.kind == Undefined { @@ -155,56 +177,89 @@ func (sc *Scanner) Next(src string) (tok Token, err error) { case sc.IsLineSep(r): return Token{kind: Separator, pos: p + i, content: " "}, nil case sc.IsStr(r): - s, ok := sc.GetStr(src[i:]) + s, ok := sc.getStr(src[i:], 1) if !ok { err = ErrBlock } return Token{kind: String, pos: p + i, content: s, start: 1, end: 1}, err case sc.IsBlock(r): - b, ok := sc.GetBlock(src[i:]) + b, ok := sc.getBlock(src[i:], 1) if !ok { err = ErrBlock } return Token{kind: Block, pos: p + i, content: b, start: 1, end: 1}, err case sc.IsOp(r): - return Token{kind: Operator, pos: p + i, content: sc.GetOp(src[i:])}, nil + op, isOp := sc.getOp(src[i:]) + if isOp { + return Token{kind: Operator, pos: p + i, content: op}, nil + } + flag := sc.BlockProp[op] + if flag&CharStr != 0 { + s, ok := sc.getStr(src[i:], len(op)) + if !ok { + err = ErrBlock + } + return Token{kind: String, pos: p + i, content: s, start: len(op), end: len(op)}, err + } case IsNum(r): - c, v := sc.GetNum(src[i:]) + c, v := sc.getNum(src[i:]) return Token{kind: Number, pos: p + i, content: c, value: v}, nil default: - return Token{kind: Identifier, pos: p + i, content: sc.GetId(src[i:])}, nil + id, isId := sc.getId(src[i:]) + if isId { + return Token{kind: Identifier, pos: p + i, content: id}, nil + } + flag := sc.BlockProp[id] + if flag&CharBlock != 0 { + s, ok := sc.getBlock(src[i:], len(id)) + if !ok { + err = 
ErrBlock + } + return Token{kind: Block, pos: p + i, content: s, start: len(id), end: len(id)}, err + } } } return Token{}, nil } -func (sc *Scanner) GetId(src string) (s string) { - for _, r := range src { +func (sc *Scanner) getId(src string) (s string, isId bool) { + s = sc.nextId(src) + if _, match := sc.BlockProp[s]; match { + return s, false + } + return s, true +} + +func (sc *Scanner) nextId(src string) (s string) { + for i, r := range src { if !sc.IsId(r) { break } - s += string(r) + s = src[:i+1] } return s } -func (sc *Scanner) GetOp(src string) (s string) { - for _, r := range src { +func (sc *Scanner) getOp(src string) (s string, isOp bool) { + for i, r := range src { if !sc.IsOp(r) { break } - s += string(r) + s = src[:i+1] + if _, match := sc.BlockProp[s]; match { + return s, false + } } - return s + return s, true } -func (sc *Scanner) GetNum(src string) (s string, v any) { +func (sc *Scanner) getNum(src string) (s string, v any) { // TODO: handle hexa, binary, octal, float and eng notations. - for _, r := range src { + for i, r := range src { if !IsNum(r) { break } - s += string(r) + s = src[:i+1] } var err error if strings.ContainsRune(s, '.') { @@ -218,64 +273,69 @@ func (sc *Scanner) GetNum(src string) (s string, v any) { return s, v } -func (sc *Scanner) GetGroupSep(src string) (s string) { +func (sc *Scanner) getGroupSep(src string) (s string) { for _, r := range src { return string(r) } return s } -func (sc *Scanner) GetStr(src string) (s string, ok bool) { - // TODO: handle long delimiters. 
- var delim rune - var esc, canEscape, nonl bool - for i, r := range src { - s += string(r) - if i == 0 { - delim = r - canEscape = sc.HasProp(r, StrEsc) - nonl = sc.HasProp(r, StrNonl) - continue - } +func (sc *Scanner) getStr(src string, nstart int) (s string, ok bool) { + start := src[:nstart] + end := sc.End[start] + prop := sc.BlockProp[start] + canEscape := prop&StrEsc != 0 + nonl := prop&StrNonl != 0 + excludeEnd := prop&ExcludeEnd != 0 + var esc bool + + for i, r := range src[nstart:] { + s = src[:nstart+i+1] if r == '\n' && nonl { return } - if r == delim && !(esc && canEscape) { + if strings.HasSuffix(s, end) && !esc { + if excludeEnd { + s = s[:len(s)-len(end)] + } return s, true } - esc = r == '\\' && !esc + esc = canEscape && r == '\\' && !esc } return } -func (sc *Scanner) GetBlock(src string) (s string, ok bool) { - // TODO: handle long and word delimiters. - var start, end rune +func (sc *Scanner) getBlock(src string, nstart int) (s string, ok bool) { + start := src[:nstart] + end := sc.End[start] + prop := sc.BlockProp[start] + skip := 0 n := 1 - for i, r := range src { - s += string(r) - if i == 0 { - start = r - end = rune(sc.End[string(r)][0]) // FIXME: not robust. 
- continue - } + + for i := range src[nstart:] { + s = src[:nstart+i+1] if i < skip { continue - } else if r == start { - n++ - } else if r == end { + } + if strings.HasSuffix(s, end) { n-- - } else if sc.IsStr(r) { - str, ok := sc.GetStr(src[i:]) + } else if strings.HasSuffix(s, start) { + n++ + } else if m := sc.sdre.FindStringSubmatch(s); len(m) > 1 { + str, ok := sc.getStr(src[i:], len(m[1])) if !ok { return s, false } - skip = i + len(str) + skip = i + len(str) - 1 } if n == 0 { - break + if prop&ExcludeEnd != 0 { + s = s[:len(s)-len(end)] + } + return s, true } } - return s, n == 0 + ok = prop&EosValidEnd != 0 + return s, ok } diff --git a/scanner/scan_test.go b/scanner/scan_test.go index 6be60a4..9eb079e 100644 --- a/scanner/scan_test.go +++ b/scanner/scan_test.go @@ -2,6 +2,7 @@ package scanner import ( "fmt" + "log" "testing" ) @@ -44,37 +45,21 @@ var GoScanner = &Scanner{ "`": "`", "//": "\n", }, + BlockProp: map[string]uint{ + "(": CharBlock, + "{": CharBlock, + "[": CharBlock, + `"`: CharStr | StrEsc | StrNonl, + "`": CharStr, + "'": CharStr | StrEsc, + "/*": CharStr, + "//": CharStr | ExcludeEnd | EosValidEnd, + }, } func TestScan(t *testing.T) { - tests := []struct{ src, result, errStr string }{ - // Simple tokens: separators, identifiers, numbers, operators. - {"", "[]", ""}, - {" abc + 5", "[{3 1 abc 0 0 } {7 3 + 0 0 } {9 2 5 0 0 5}]", ""}, - {"abc0+5 ", "[{0 1 abc0 0 0 } {4 3 + 0 0 } {5 2 5 0 0 5}]", ""}, - {"a+5\na=x-4", "[{0 1 a 0 0 } {1 3 + 0 0 } {2 2 5 0 0 5} {3 4 0 0 } {4 1 a 0 0 } {5 3 = 0 0 } {6 1 x 0 0 } {7 3 - 0 0 } {8 2 4 0 0 4}]", ""}, - - // Strings. 
- {`return "hello world" + 4`, `[{0 1 return 0 0 } {7 5 "hello world" 1 1 } {21 3 + 0 0 } {23 2 4 0 0 4}]`, ""}, - {`print(4 * (3+7))`, "[{0 1 print 0 0 } {5 6 (4 * (3+7)) 1 1 }]", ""}, - {`"foo`, "[]", "1:1: block not terminated"}, - {`abc -def "foo truc`, "[]", "2:6: block not terminated"}, - {`"ab\"`, "[]", "1:1: block not terminated"}, - {`"ab\\"`, `[{0 5 "ab\\" 1 1 }]`, ""}, - {`"ab\\\"`, "[]", "1:1: block not terminated"}, - {`"ab\\\\"`, `[{0 5 "ab\\\\" 1 1 }]`, ""}, - {`"abc -def"`, "[]", "1:1: block not terminated"}, - {"`hello\nworld`", "[{0 5 `hello\nworld` 1 1 }]", ""}, - - // Nested blocks. - // {`f("a)bc")+1, 3)`, "[{0 1 f } {1 6 (\"a)bc\", 3) (}]", ""}, - {"2* (3+4", "[]", "1:4: block not terminated"}, - {`("fo)o")+1`, "[{0 6 (\"fo)o\") 1 1 } {8 3 + 0 0 } {9 2 1 0 0 1}]", ""}, - {`"foo""bar"`, "[{0 5 \"foo\" 1 1 } {5 5 \"bar\" 1 1 }]", ""}, - } - + log.SetFlags(log.Lshortfile) + GoScanner.Init() for _, test := range tests { test := test t.Run("", func(t *testing.T) { @@ -83,14 +68,85 @@ def"`, "[]", "1:1: block not terminated"}, if err != nil { errStr = err.Error() } - if errStr != test.errStr { - t.Errorf("got error %#v, want error %#v", errStr, test.errStr) + if errStr != test.err { + t.Errorf("got error %#v, want error %#v", errStr, test.err) } - result := fmt.Sprintf("%v", token) - t.Logf("%#v\n%v %v\n", test.src, result, errStr) - if result != test.result { - t.Errorf("got %#v, want %#v", result, test.result) + t.Logf("%#v\n%v %v\n", test.src, token, errStr) + if result := tokStr(token); result != test.tok { + t.Errorf("got %v, want %v", result, test.tok) } }) } } + +func tokStr(tokens []Token) (s string) { + for _, t := range tokens { + s += fmt.Sprintf("%#v ", t.content) + } + return s +} + +var tests = []struct { + src, tok, err string +}{{ // #00 + src: "", +}, { // #01 + src: " abc + 5", + tok: `"abc" "+" "5" `, +}, { // #02 + src: "abc0+5 ", + tok: `"abc0" "+" "5" `, +}, { // #03 + src: "a+5\na=x-4", + tok: `"a" "+" "5" " " "a" "=" "x" "-" 
"4" `, +}, { // #04 + src: `return "hello world" + 4`, + tok: `"return" "\"hello world\"" "+" "4" `, +}, { // #05 + src: `print(4 * (3+7))`, + tok: `"print" "(4 * (3+7))" `, +}, { // #06 + src: `"foo`, + err: "1:1: block not terminated", +}, { // #07 + src: `abc +def "foo truc`, + err: "2:6: block not terminated", +}, { // #08 + src: `"ab\"`, + err: "1:1: block not terminated", +}, { // #09 + src: `"ab\\"`, + tok: `"\"ab\\\\\"" `, +}, { // #10 + src: `"ab\\\"`, + err: "1:1: block not terminated", +}, { // #11 + src: `"ab\\\\"`, + tok: `"\"ab\\\\\\\\\"" `, +}, { // #12 + src: `"abc +def"`, + err: "1:1: block not terminated", +}, { // #13 + src: "`hello\nworld`", + tok: "\"`hello\\nworld`\" ", +}, { // #14 + src: "2* (3+4", + err: "1:4: block not terminated", +}, { // #15 + src: `("fo)o")+1`, + tok: `"(\"fo)o\")" "+" "1" `, +}, { // #16 + src: `"foo""bar"`, + tok: `"\"foo\"" "\"bar\"" `, +}, { // #17 + src: "/* a comment */ a = 2", + tok: `"/* a comment */" "a" "=" "2" `, +}, { // #18 + src: "return // quit\nbegin", + tok: `"return" "// quit" " " "begin" `, +}, { // #19 + src: "println(3 /* argh ) */)", + tok: `"println" "(3 /* argh ) */)" `, +}} -- cgit v1.2.3