diff options
Diffstat (limited to 'compiler')
| -rw-r--r-- | compiler/compiler.go | 521 |
1 files changed, 521 insertions, 0 deletions
diff --git a/compiler/compiler.go b/compiler/compiler.go new file mode 100644 index 0000000..d63dd14 --- /dev/null +++ b/compiler/compiler.go @@ -0,0 +1,521 @@ +package compiler + +import ( + "fmt" + "log" + "os" + "path" + "reflect" + "runtime" + "strconv" + + "github.com/mvertes/parscan/lang" + "github.com/mvertes/parscan/parser" + "github.com/mvertes/parscan/scanner" + "github.com/mvertes/parscan/vm" +) + +// Compiler represents the state of a compiler. +type Compiler struct { + *parser.Parser + vm.Code // produced code, to fill VM with + Data []vm.Value // produced data, will be at the bottom of VM stack + Entry int // offset in Code to start execution from (skip function defintions) + + strings map[string]int // locations of strings in Data +} + +// NewCompiler returns a new compiler state for a given scanner. +func NewCompiler(scanner *scanner.Scanner) *Compiler { + return &Compiler{ + Parser: parser.NewParser(scanner, true), + Entry: -1, + strings: map[string]int{}, + } +} + +// AddSym adds a new named value to the compiler symbol table, and returns its index in memory. +func (c *Compiler) AddSym(name string, value vm.Value) int { + p := len(c.Data) + c.Data = append(c.Data, value) + c.AddSymbol(p, name, value, parser.SymValue, nil, false) + return p +} + +// Codegen generates vm code from parsed tokens. +func (c *Compiler) Codegen(tokens parser.Tokens) (err error) { + log.Println("Codegen tokens:", tokens) + fixList := parser.Tokens{} // list of tokens to fix after all necessary information is gathered + stack := []*parser.Symbol{} // for symbolic evaluation, type checking, etc + + emit := func(t scanner.Token, op vm.Op, arg ...int) { + _, file, line, _ := runtime.Caller(1) + fmt.Fprintf(os.Stderr, "%s:%d: %v emit %v %v\n", path.Base(file), line, t, op, arg) + c.Code = append(c.Code, vm.Instruction{Pos: vm.Pos(t.Pos), Op: op, Arg: arg}) + } + push := func(s *parser.Symbol) { stack = append(stack, s) } + pop := func() *parser.Symbol { l := len(stack) - 1; s := stack[l]; stack = stack[:l]; return s } + + for i, t := range tokens { + switch t.Tok { + case lang.Int: + n, err := strconv.Atoi(t.Str) + if err != nil { + return err + } + push(&parser.Symbol{Kind: parser.SymConst, Value: vm.ValueOf(n), Type: vm.TypeOf(0)}) + emit(t, vm.Push, n) + + case lang.String: + s := t.Block() + v := vm.Value{Data: reflect.ValueOf(s), Type: vm.TypeOf(s)} + i, ok := c.strings[s] + if !ok { + i = len(c.Data) + c.Data = append(c.Data, v) + c.strings[s] = i + } + push(&parser.Symbol{Kind: parser.SymConst, Value: v}) + emit(t, vm.Dup, i) + + case lang.Add: + push(&parser.Symbol{Type: arithmeticOpType(pop(), pop())}) + emit(t, vm.Add) + + case lang.Mul: + push(&parser.Symbol{Type: arithmeticOpType(pop(), pop())}) + emit(t, vm.Mul) + + case lang.Sub: + push(&parser.Symbol{Type: arithmeticOpType(pop(), pop())}) + emit(t, vm.Sub) + + case lang.Minus: + emit(t, vm.Push, 0) + emit(t, vm.Sub) + + case lang.Not: + emit(t, vm.Not) + + case lang.Plus: + // Unary '+' is idempotent. Nothing to do. + + case lang.Addr: + push(&parser.Symbol{Type: vm.PointerTo(pop().Type)}) + emit(t, vm.Addr) + + case lang.Deref: + push(&parser.Symbol{Type: pop().Type.Elem()}) + emit(t, vm.Deref) + + case lang.Index: + push(&parser.Symbol{Type: pop().Type.Elem()}) + emit(t, vm.Index) + + case lang.Greater: + push(&parser.Symbol{Type: booleanOpType(pop(), pop())}) + emit(t, vm.Greater) + + case lang.Less: + push(&parser.Symbol{Type: booleanOpType(pop(), pop())}) + emit(t, vm.Lower) + + case lang.Call: + s := pop() + if s.Kind != parser.SymValue { + typ := s.Type + // TODO: pop input types (careful with variadic function). + for i := 0; i < typ.Rtype.NumOut(); i++ { + push(&parser.Symbol{Type: typ.Out(i)}) + } + emit(t, vm.Call) + break + } + push(s) + fallthrough // A symValue must be called through callX. + + case lang.CallX: + rtyp := pop().Value.Data.Type() + // TODO: pop input types (careful with variadic function). + for i := 0; i < rtyp.NumOut(); i++ { + push(&parser.Symbol{Type: &vm.Type{Rtype: rtyp.Out(i)}}) + } + emit(t, vm.CallX, t.Beg) + + case lang.Composite: + log.Println("COMPOSITE") + /* + d := pop() + switch d.typ.Rtype.Kind() { + case reflect.Struct: + // nf := d.typ.Rtype.NumField() + // emit(t.Pos, vm.New, d.index, c.typeSym(d.typ).index) + emit(t, vm.Field, 0) + emit(t, vm.Vassign) + emit(t, vm.Fdup, 2) + emit(t, vm.Field, 1) + emit(t, vm.Vassign) + emit(t, vm.Pop, 1) + // emit(t, vm.Fdup, 2) + // Assume an element list with no keys, one per struct field in order + } + */ + + case lang.Grow: + emit(t, vm.Grow, t.Beg) + + case lang.Define: + // TODO: support assignment to local, composite objects. + st := tokens[i-1] + l := len(c.Data) + d := pop() + typ := d.Type + if typ == nil { + typ = d.Value.Type + } + v := vm.NewValue(typ) + c.AddSymbol(l, st.Str, v, parser.SymVar, typ, false) + c.Data = append(c.Data, v) + emit(t, vm.Assign, l) + + case lang.Assign: + st := tokens[i-1] + if st.Tok == lang.Period || st.Tok == lang.Index { + emit(t, vm.Vassign) + break + } + s, ok := c.Symbols[st.Str] + if !ok { + return fmt.Errorf("symbol not found: %s", st.Str) + } + d := pop() + typ := d.Type + if typ == nil { + typ = d.Value.Type + } + if s.Type == nil { + s.Type = typ + s.Value = vm.NewValue(typ) + } + if s.Local { + if !s.Used { + emit(st, vm.New, s.Index, c.typeSym(s.Type).Index) + s.Used = true + } + emit(st, vm.Fassign, s.Index) + break + } + if s.Index == parser.UnsetAddr { + s.Index = len(c.Data) + c.Data = append(c.Data, s.Value) + } + emit(st, vm.Assign, s.Index) + + case lang.Equal: + push(&parser.Symbol{Type: booleanOpType(pop(), pop())}) + emit(t, vm.Equal) + + case lang.EqualSet: + push(&parser.Symbol{Type: booleanOpType(pop(), pop())}) + emit(t, vm.EqualSet) + + case lang.Ident: + if i < len(tokens)-1 { + switch t1 := tokens[i+1]; t1.Tok { + case lang.Define, lang.Assign, lang.Colon: + continue + } + } + s, ok := c.Symbols[t.Str] + if !ok { + return fmt.Errorf("symbol not found: %s", t.Str) + } + push(s) + if s.Kind == parser.SymPkg { + break + } + if s.Local { + emit(t, vm.Fdup, s.Index) + } else { + if s.Index == parser.UnsetAddr { + s.Index = len(c.Data) + c.Data = append(c.Data, s.Value) + } + emit(t, vm.Dup, s.Index) + } + + case lang.Label: + lc := len(c.Code) + s, ok := c.Symbols[t.Str] + if ok { + s.Value = vm.ValueOf(lc) + if s.Kind == parser.SymFunc { + // label is a function entry point, register its code address in data. + s.Index = len(c.Data) + c.Data = append(c.Data, s.Value) + } else { + c.Data[s.Index] = s.Value + } + } else { + c.Symbols[t.Str] = &parser.Symbol{Kind: parser.SymLabel, Value: vm.ValueOf(lc)} + } + + case lang.JumpFalse: + var i int + if s, ok := c.Symbols[t.Str]; !ok { + // t.Beg contains the position in code which needs to be fixed. + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = int(s.Value.Data.Int()) - len(c.Code) + } + emit(t, vm.JumpFalse, i) + + case lang.JumpSetFalse: + var i int + if s, ok := c.Symbols[t.Str]; !ok { + // t.Beg contains the position in code which needs to be fixed. + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = int(s.Value.Data.Int()) - len(c.Code) + } + emit(t, vm.JumpSetFalse, i) + + case lang.JumpSetTrue: + var i int + if s, ok := c.Symbols[t.Str]; !ok { + // t.Beg contains the position in code which needs to be fixed. + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = int(s.Value.Data.Int()) - len(c.Code) + } + emit(t, vm.JumpSetTrue, i) + + case lang.Goto: + var i int + if s, ok := c.Symbols[t.Str]; !ok { + t.Beg = len(c.Code) + fixList = append(fixList, t) + } else { + i = int(s.Value.Data.Int()) - len(c.Code) + } + emit(t, vm.Jump, i) + + case lang.Period: + s := pop() + switch s.Kind { + case parser.SymPkg: + p, ok := parser.Packages[s.PkgPath] + if !ok { + return fmt.Errorf("package not found: %s", s.PkgPath) + } + v, ok := p[t.Str[1:]] + if !ok { + return fmt.Errorf("symbol not found in package %s: %s", s.PkgPath, t.Str[1:]) + } + name := s.PkgPath + t.Str + var l int + sym, _, ok := c.GetSym(name, "") + if ok { + l = sym.Index + } else { + l = len(c.Data) + c.Data = append(c.Data, v) + c.AddSymbol(l, name, v, parser.SymValue, v.Type, false) + sym = c.Symbols[name] + } + push(sym) + emit(t, vm.Dup, l) + default: + if f, ok := s.Type.Rtype.FieldByName(t.Str[1:]); ok { + emit(t, vm.Field, f.Index...) + break + } + return fmt.Errorf("field or method not found: %s", t.Str[1:]) + } + + case lang.Return: + emit(t, vm.Return, t.Beg, t.End) + + default: + return fmt.Errorf("Codegen: unsupported token %v", t) + } + } + + // Finally we fix unresolved labels for jump destinations. + for _, t := range fixList { + s, ok := c.Symbols[t.Str] + if !ok { + return fmt.Errorf("label not found: %q", t.Str) + } + c.Code[t.Beg].Arg[0] = int(s.Value.Data.Int()) - t.Beg + } + return err +} +func arithmeticOpType(s1, _ *parser.Symbol) *vm.Type { return parser.SymbolType(s1) } +func booleanOpType(_, _ *parser.Symbol) *vm.Type { return vm.TypeOf(true) } + +// PrintCode pretty prints the generated code. +func (c *Compiler) PrintCode() { + labels := map[int][]string{} // labels indexed by code location + data := map[int]string{} // data indexed by frame location + + for name, sym := range c.Symbols { + if sym.Kind == parser.SymLabel || sym.Kind == parser.SymFunc { + i := int(sym.Value.Data.Int()) + labels[i] = append(labels[i], name) + } + if sym.Used { + data[sym.Index] = name + } + } + + fmt.Fprintln(os.Stderr, "# Code:") + for i, l := range c.Code { + for _, label := range labels[i] { + fmt.Fprintln(os.Stderr, label+":") + } + extra := "" + switch l.Op { + case vm.Jump, vm.JumpFalse, vm.JumpTrue, vm.JumpSetFalse, vm.JumpSetTrue, vm.Calli: + if d, ok := labels[i+l.Arg[0]]; ok { + extra = "// " + d[0] + } + case vm.Dup, vm.Assign, vm.Fdup, vm.Fassign: + if d, ok := data[l.Arg[0]]; ok { + extra = "// " + d + } + } + fmt.Fprintf(os.Stderr, "%4d %-14v %v\n", i, l, extra) + } + + for _, label := range labels[len(c.Code)] { + fmt.Fprintln(os.Stderr, label+":") + } + fmt.Fprintln(os.Stderr, "# End code") +} + +type entry struct { + name string + *parser.Symbol +} + +func (e entry) String() string { + if e.Symbol != nil { + return fmt.Sprintf("name: %s,local: %t, i: %d, k: %d, t: %s, v: %v", + e.name, + e.Local, + e.Index, + e.Kind, + e.Type, + e.Value, + ) + } + return e.name +} + +// PrintData pretty prints the generated global data symbols in compiler. +func (c *Compiler) PrintData() { + dict := c.symbolsByIndex() + + fmt.Fprintln(os.Stderr, "# Data:") + for i, d := range c.Data { + fmt.Fprintf(os.Stderr, "%4d %T %v %v\n", i, d.Data.Interface(), d.Data, dict[i]) + } +} + +func (c *Compiler) symbolsByIndex() map[int]entry { + dict := map[int]entry{} + for name, sym := range c.Symbols { + if sym.Index == parser.UnsetAddr { + continue + } + dict[sym.Index] = entry{name, sym} + } + return dict +} + +// Dump represents the state of a data dump. +type Dump struct { + Values []*DumpValue +} + +// DumpValue is a value of a dump state. +type DumpValue struct { + Index int + Name string + Kind int + Type string + Value any +} + +// Dump creates a snapshot of the execution state of global variables. +// This method is specifically implemented in the Compiler to minimize the coupling between +// the dump format and other components. By situating the dump logic in the Compiler, +// it relies solely on the program being executed and the indexing algorithm used for ordering variables +// (currently, this is an integer that corresponds to the order of variables in the program). +// This design choice allows the Virtual Machine (VM) to evolve its memory management strategies +// without compromising backward compatibility with dumps generated by previous versions. +func (c *Compiler) Dump() *Dump { + dict := c.symbolsByIndex() + dv := make([]*DumpValue, len(c.Data)) + for i, d := range c.Data { + e := dict[i] + dv[i] = &DumpValue{ + Index: e.Index, + Name: e.name, + Kind: int(e.Kind), + Type: e.Type.Name, + Value: d.Data.Interface(), + } + } + return &Dump{Values: dv} +} + +// ApplyDump sets previously saved dump, restoring the state of global variables. +func (c *Compiler) ApplyDump(d *Dump) error { + dict := c.symbolsByIndex() + for _, dv := range d.Values { + // do all the checks to be sure we are applying the correct values + e, ok := dict[dv.Index] + if !ok { + return fmt.Errorf("entry not found on index %d", dv.Index) + } + + if dv.Name != e.name || + dv.Type != e.Type.Name || + dv.Kind != int(e.Kind) { + return fmt.Errorf("entry with index %d does not match with provided entry. "+ + "dumpValue: %s, %s, %d. memoryValue: %s, %s, %d", + dv.Index, + dv.Name, dv.Type, dv.Kind, + e.name, e.Type, e.Kind) + } + + if dv.Index >= len(c.Data) { + return fmt.Errorf("index (%d) bigger than memory (%d)", dv.Index, len(c.Data)) + } + + if !c.Data[dv.Index].Data.CanSet() { + return fmt.Errorf("value %v cannot be set", dv.Value) + } + + c.Data[dv.Index].Data.Set(reflect.ValueOf(dv.Value)) + } + return nil +} + +func (c *Compiler) typeSym(t *vm.Type) *parser.Symbol { + tsym, ok := c.Symbols[t.Rtype.String()] + if !ok { + tsym = &parser.Symbol{Index: parser.UnsetAddr, Kind: parser.SymType, Type: t} + c.Symbols[t.Rtype.String()] = tsym + } + if tsym.Index == parser.UnsetAddr { + tsym.Index = len(c.Data) + c.Data = append(c.Data, vm.NewValue(t)) + } + return tsym +} |
