cgo: implement the constant parser as a real parser

Previously, the constant parser was just a combination of heuristics that tried to fit a constant into an *ast.BasicLit. For more complex expressions, this is not enough. This change also introduces proper syntax errors with source locations when parsing a constant fails. For example, the following will now print a real error message with a source location: #define FOO 5)
author: Ayke van Laethem <[email protected]> 2019-11-04 16:30:57 +0100
committer: Ron Evans <[email protected]> 2019-11-05 14:18:38 +0100
commit: cadb75a4aa1cdf9a447842013e5616c2b2e4b894 (patch)
tree: 1f66cfc66ae98f77716b587cd15ef7d2f6e72f52 /cgo
parent: 5987233b99c7b78f28a358ddf98f74853c27635b (diff)
download: tinygo-cadb75a4aa1cdf9a447842013e5616c2b2e4b894.tar.gz
tinygo-cadb75a4aa1cdf9a447842013e5616c2b2e4b894.zip
4 files changed, 203 insertions, 51 deletions
diff --git a/cgo/cgo.go b/cgo/cgo.go
index d42034c1a..9867b875e 100644
--- a/cgo/cgo.go
+++ b/cgo/cgo.go
@@ -42,7 +42,7 @@ type cgoPackage struct {
 // constantInfo stores some information about a CGo constant found by libclang
 // and declared in the Go AST.
 type constantInfo struct {
-	expr *ast.BasicLit
+	expr ast.Expr
 	pos  token.Pos
 }
 
diff --git a/cgo/const.go b/cgo/const.go
index 831515e7c..97483a5cf 100644
--- a/cgo/const.go
+++ b/cgo/const.go
@@ -4,56 +4,191 @@ package cgo
 // parse common #define statements to Go constant expressions.
 
 import (
+	"fmt"
 	"go/ast"
+	"go/scanner"
 	"go/token"
 	"strings"
 )
 
 // parseConst parses the given string as a C constant.
-func parseConst(pos token.Pos, value string) *ast.BasicLit {
-	for len(value) != 0 && value[0] == '(' && value[len(value)-1] == ')' {
-		value = strings.TrimSpace(value[1 : len(value)-1])
+func parseConst(pos token.Pos, fset *token.FileSet, value string) (ast.Expr, *scanner.Error) {
+	t := newTokenizer(pos, fset, value)
+	expr, err := parseConstExpr(t)
+	if t.token != token.EOF {
+		return nil, &scanner.Error{
+			Pos: t.fset.Position(t.pos),
+			Msg: "unexpected token " + t.token.String(),
+		}
 	}
-	if len(value) == 0 {
-		// Pretend it doesn't exist at all.
-		return nil
+	return expr, err
+}
+
+// parseConstExpr parses a stream of C tokens to a Go expression.
+func parseConstExpr(t *tokenizer) (ast.Expr, *scanner.Error) {
+	switch t.token {
+	case token.LPAREN:
+		lparen := t.pos
+		t.Next()
+		x, err := parseConstExpr(t)
+		if err != nil {
+			return nil, err
+		}
+		if t.token != token.RPAREN {
+			return nil, unexpectedToken(t, token.RPAREN)
+		}
+		expr := &ast.ParenExpr{
+			Lparen: lparen,
+			X:      x,
+			Rparen: t.pos,
+		}
+		t.Next()
+		return expr, nil
+	case token.INT, token.FLOAT, token.STRING, token.CHAR:
+		expr := &ast.BasicLit{
+			ValuePos: t.pos,
+			Kind:     t.token,
+			Value:    t.value,
+		}
+		t.Next()
+		return expr, nil
+	case token.EOF:
+		return nil, &scanner.Error{
+			Pos: t.fset.Position(t.pos),
+			Msg: "empty constant",
+		}
+	default:
+		return nil, &scanner.Error{
+			Pos: t.fset.Position(t.pos),
+			Msg: fmt.Sprintf("unexpected token %s", t.token),
+		}
 	}
-	// For information about integer literals:
-	// https://en.cppreference.com/w/cpp/language/integer_literal
-	if value[0] == '"' {
-		// string constant
-		return &ast.BasicLit{ValuePos: pos, Kind: token.STRING, Value: value}
+}
+
+// unexpectedToken returns an error of the form "unexpected token FOO, expected
+// BAR".
+func unexpectedToken(t *tokenizer, expected token.Token) *scanner.Error {
+	return &scanner.Error{
+		Pos: t.fset.Position(t.pos),
+		Msg: fmt.Sprintf("unexpected token %s, expected %s", t.token, expected),
 	}
-	if value[0] == '\'' {
-		// char constant
-		return &ast.BasicLit{ValuePos: pos, Kind: token.CHAR, Value: value}
+}
+
+// tokenizer reads C source code and converts it to Go tokens.
+type tokenizer struct {
+	pos   token.Pos
+	fset  *token.FileSet
+	token token.Token
+	value string
+	buf   string
+}
+
+// newTokenizer initializes a new tokenizer, positioned at the first token in
+// the string.
+func newTokenizer(start token.Pos, fset *token.FileSet, buf string) *tokenizer {
+	t := &tokenizer{
+		pos:   start,
+		fset:  fset,
+		buf:   buf,
+		token: token.ILLEGAL,
 	}
-	// assume it's a number (int or float)
-	value = strings.Replace(value, "'", "", -1) // remove ' chars
-	value = strings.TrimRight(value, "lu")      // remove llu suffixes etc.
-	// find the first non-number
-	nonnum := byte(0)
-	for i := 0; i < len(value); i++ {
-		if value[i] < '0' || value[i] > '9' {
-			nonnum = value[i]
-			break
+	t.Next() // Parse the first token.
+	return t
+}
+
+// Next consumes the next token in the stream. There is no return value, read
+// the next token from the pos, token and value properties.
+func (t *tokenizer) Next() {
+	t.pos += token.Pos(len(t.value))
+	for {
+		if len(t.buf) == 0 {
+			t.token = token.EOF
+			return
+		}
+		c := t.buf[0]
+		switch {
+		case c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v':
+			// Skip whitespace.
+			// Based on this source, not sure whether it represents C whitespace:
+			// https://en.cppreference.com/w/cpp/string/byte/isspace
+			t.pos++
+			t.buf = t.buf[1:]
+		case c == '(' || c == ')':
+			// Single-character tokens.
+			switch c {
+			case '(':
+				t.token = token.LPAREN
+			case ')':
+				t.token = token.RPAREN
+			}
+			t.value = t.buf[:1]
+			t.buf = t.buf[1:]
+			return
+		case c >= '0' && c <= '9':
+			// Numeric constant (int, float, etc.).
+			// Find the last non-numeric character.
+			tokenLen := len(t.buf)
+			hasDot := false
+			for i, c := range t.buf {
+				if c == '.' {
+					hasDot = true
+				}
+				if (c >= '0' && c <= '9') || c == '.' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') {
+					tokenLen = i + 1
+				}
+			}
+			t.value = t.buf[:tokenLen]
+			t.buf = t.buf[tokenLen:]
+			if hasDot {
+				// Integer constants are more complicated than this but this is
+				// a close approximation.
+				// https://en.cppreference.com/w/cpp/language/integer_literal
+				t.token = token.FLOAT
+				t.value = strings.TrimRight(t.value, "f")
+			} else {
+				t.token = token.INT
+				t.value = strings.TrimRight(t.value, "uUlL")
+			}
+			return
+		case c == '"':
+			// String constant. Find the first '"' character that is not
+			// preceded by a backslash.
+			escape := false
+			tokenLen := len(t.buf)
+			for i, c := range t.buf {
+				if i != 0 && c == '"' && !escape {
+					tokenLen = i + 1
+					break
+				}
+				if !escape {
+					escape = c == '\\'
+				}
+			}
+			t.token = token.STRING
+			t.value = t.buf[:tokenLen]
+			t.buf = t.buf[tokenLen:]
+			return
+		case c == '\'':
+			// Char (rune) constant. Find the first '\'' character that is not
+			// preceded by a backslash.
+			escape := false
+			tokenLen := len(t.buf)
+			for i, c := range t.buf {
+				if i != 0 && c == '\'' && !escape {
+					tokenLen = i + 1
+					break
+				}
+				if !escape {
+					escape = c == '\\'
+				}
+			}
+			t.token = token.CHAR
+			t.value = t.buf[:tokenLen]
+			t.buf = t.buf[tokenLen:]
+			return
+		default:
+			t.token = token.ILLEGAL
+			return
 		}
 	}
-	// determine number type based on the first non-number
-	switch nonnum {
-	case 0:
-		// no non-number found, must be an integer
-		return &ast.BasicLit{ValuePos: pos, Kind: token.INT, Value: value}
-	case 'x', 'X':
-		// hex integer constant
-		// TODO: may also be a floating point number per C++17.
-		return &ast.BasicLit{ValuePos: pos, Kind: token.INT, Value: value}
-	case '.', 'e':
-		// float constant
-		value = strings.TrimRight(value, "fFlL")
-		return &ast.BasicLit{ValuePos: pos, Kind: token.FLOAT, Value: value}
-	default:
-		// unknown type, ignore
-	}
-	return nil
 }
diff --git a/cgo/const_test.go b/cgo/const_test.go
index 27ed8b772..235a7117b 100644
--- a/cgo/const_test.go
+++ b/cgo/const_test.go
@@ -4,6 +4,7 @@ import (
 	"bytes"
 	"go/format"
 	"go/token"
+	"strings"
 	"testing"
 )
 
@@ -14,20 +15,33 @@ func TestParseConst(t *testing.T) {
 		Go string
 	}{
 		{`5`, `5`},
-		{`(5)`, `5`},
-		{`(((5)))`, `5`},
+		{`(5)`, `(5)`},
+		{`(((5)))`, `(5)`},
+		{`)`, `error: 1:1: unexpected token )`},
+		{`5)`, `error: 1:2: unexpected token )`},
+		{"  \t)", `error: 1:4: unexpected token )`},
 		{`5.8f`, `5.8`},
-		{`foo`, `<invalid>`}, // identifiers unimplemented
-		{``, `<invalid>`},    // empty constants not allowed in Go
+		{`foo`, `error: 1:1: unexpected token ILLEGAL`}, // identifiers unimplemented
+		{``, `error: 1:1: empty constant`},              // empty constants not allowed in Go
 		{`"foo"`, `"foo"`},
+		{`"a\\n"`, `"a\\n"`},
+		{`"a\n"`, `"a\n"`},
+		{`"a\""`, `"a\""`},
 		{`'a'`, `'a'`},
-		{`0b10`, `<invalid>`}, // binary number literals unimplemented
+		{`0b10`, `0b10`},
+		{`0x1234_5678`, `0x1234_5678`},
 	} {
 		fset := token.NewFileSet()
-		startPos := fset.AddFile("test.c", -1, 1000).Pos(0)
-		expr := parseConst(startPos, tc.C)
+		startPos := fset.AddFile("", -1, 1000).Pos(0)
+		expr, err := parseConst(startPos, fset, tc.C)
 		s := "<invalid>"
-		if expr != nil {
+		if err != nil {
+			if !strings.HasPrefix(tc.Go, "error: ") {
+				t.Errorf("expected value %#v for C constant %#v but got error %#v", tc.Go, tc.C, err.Error())
+				continue
+			}
+			s = "error: " + err.Error()
+		} else if expr != nil {
 			// Serialize the Go constant to a string, for more readable test
 			// cases.
 			buf := &bytes.Buffer{}
diff --git a/cgo/libclang.go b/cgo/libclang.go
index efefe370b..a4817a4f1 100644
--- a/cgo/libclang.go
+++ b/cgo/libclang.go
@@ -245,9 +245,12 @@ func tinygo_clang_globals_visitor(c, parent C.GoCXCursor, client_data C.CXClient
 			p.addError(pos, fmt.Sprintf("internal error: expected macro value to start with %#v, got %#v", name, source))
 			break
 		}
-		value := strings.TrimSpace(source[len(name):])
+		value := source[len(name):]
 		// Try to convert this #define into a Go constant expression.
-		expr := parseConst(pos, value)
+		expr, err := parseConst(pos+token.Pos(len(name)), p.fset, value)
+		if err != nil {
+			p.errors = append(p.errors, err)
+		}
 		if expr != nil {
 			// Parsing was successful.
 			p.constants[name] = constantInfo{expr, pos}
author	Ayke van Laethem <[email protected]>	2019-11-04 16:30:57 +0100
committer	Ron Evans <[email protected]>	2019-11-05 14:18:38 +0100
commit	cadb75a4aa1cdf9a447842013e5616c2b2e4b894 (patch)
tree	1f66cfc66ae98f77716b587cd15ef7d2f6e72f52 /cgo
parent	5987233b99c7b78f28a358ddf98f74853c27635b (diff)
download	tinygo-cadb75a4aa1cdf9a447842013e5616c2b2e4b894.tar.gz tinygo-cadb75a4aa1cdf9a447842013e5616c2b2e4b894.zip