package cgo // This file implements a parser of a subset of the C language, just enough to // parse common #define statements to Go constant expressions. import ( "fmt" "go/ast" "go/scanner" "go/token" "strings" ) var ( prefixParseFns map[token.Token]func(*tokenizer) (ast.Expr, *scanner.Error) precedences = map[token.Token]int{ token.OR: precedenceOr, token.XOR: precedenceXor, token.AND: precedenceAnd, token.SHL: precedenceShift, token.SHR: precedenceShift, token.ADD: precedenceAdd, token.SUB: precedenceAdd, token.MUL: precedenceMul, token.QUO: precedenceMul, token.REM: precedenceMul, } ) // See: https://en.cppreference.com/w/c/language/operator_precedence const ( precedenceLowest = iota + 1 precedenceOr precedenceXor precedenceAnd precedenceShift precedenceAdd precedenceMul precedencePrefix ) func init() { // This must be done in an init function to avoid an initialization order // failure. prefixParseFns = map[token.Token]func(*tokenizer) (ast.Expr, *scanner.Error){ token.IDENT: parseIdent, token.INT: parseBasicLit, token.FLOAT: parseBasicLit, token.STRING: parseBasicLit, token.CHAR: parseBasicLit, token.LPAREN: parseParenExpr, token.SUB: parseUnaryExpr, } } // parseConst parses the given string as a C constant. func parseConst(pos token.Pos, fset *token.FileSet, value string, params []ast.Expr, callerPos token.Pos, f *cgoFile) (ast.Expr, *scanner.Error) { t := newTokenizer(pos, fset, value, f) // If params is non-nil (could be a zero length slice), this const is // actually a function-call like expression from another macro. // This means we have to parse a string like "(a, b) (a+b)". // We do this by parsing the parameters at the start and then treating the // following like a normal constant expression. if params != nil { // Parse opening paren. if t.curToken != token.LPAREN { return nil, unexpectedToken(t, token.LPAREN) } t.Next() // Parse parameters (identifiers) and closing paren. var paramIdents []string for i := 0; ; i++ { if i == 0 && t.curToken == token.RPAREN { // No parameters, break early. t.Next() break } // Read the parameter name. if t.curToken != token.IDENT { return nil, unexpectedToken(t, token.IDENT) } paramIdents = append(paramIdents, t.curValue) t.Next() // Read the next token: either a continuation (comma) or end of list // (rparen). if t.curToken == token.RPAREN { // End of parameter list. t.Next() break } else if t.curToken == token.COMMA { // Comma, so there will be another parameter name. t.Next() } else { return nil, &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: "unexpected token " + t.curToken.String() + " inside macro parameters, expected ',' or ')'", } } } // Report an error if there is a mismatch in parameter length. // The error is reported at the location of the closing paren from the // caller location. if len(params) != len(paramIdents) { return nil, &scanner.Error{ Pos: t.fset.Position(callerPos), Msg: fmt.Sprintf("unexpected number of parameters: expected %d, got %d", len(paramIdents), len(params)), } } // Assign values to the parameters. // These parameter names are closer in 'scope' than other identifiers so // will be used first when parsing an identifier. for i, name := range paramIdents { t.params[name] = params[i] } } expr, err := parseConstExpr(t, precedenceLowest) t.Next() if t.curToken != token.EOF { return nil, &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: "unexpected token " + t.curToken.String() + ", expected end of expression", } } return expr, err } // parseConstExpr parses a stream of C tokens to a Go expression. func parseConstExpr(t *tokenizer, precedence int) (ast.Expr, *scanner.Error) { if t.curToken == token.EOF { return nil, &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: "empty constant", } } prefix := prefixParseFns[t.curToken] if prefix == nil { return nil, &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: fmt.Sprintf("unexpected token %s", t.curToken), } } leftExpr, err := prefix(t) for t.peekToken != token.EOF && precedence < precedences[t.peekToken] { switch t.peekToken { case token.OR, token.XOR, token.AND, token.SHL, token.SHR, token.ADD, token.SUB, token.MUL, token.QUO, token.REM: t.Next() leftExpr, err = parseBinaryExpr(t, leftExpr) } } return leftExpr, err } func parseIdent(t *tokenizer) (ast.Expr, *scanner.Error) { // If the identifier is one of the parameters of this function-like macro, // use the parameter value. if val, ok := t.params[t.curValue]; ok { return val, nil } if t.f != nil { // Check whether this identifier is actually a macro "call" with // parameters. In that case, we should parse the parameters and pass it // on to a new invocation of parseConst. if t.peekToken == token.LPAREN { if cursor, ok := t.f.names[t.curValue]; ok && t.f.isFunctionLikeMacro(cursor) { // We know the current and peek tokens (the peek one is the '(' // token). So skip ahead until the current token is the first // unknown token. t.Next() t.Next() // Parse the list of parameters until ')' (rparen) is found. params := []ast.Expr{} for i := 0; ; i++ { if i == 0 && t.curToken == token.RPAREN { break } x, err := parseConstExpr(t, precedenceLowest) if err != nil { return nil, err } params = append(params, x) t.Next() if t.curToken == token.COMMA { t.Next() } else if t.curToken == token.RPAREN { break } else { return nil, &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: "unexpected token " + t.curToken.String() + ", ',' or ')'", } } } // Evaluate the macro value and use it as the identifier value. rparen := t.curPos pos, text := t.f.getMacro(cursor) return parseConst(pos, t.fset, text, params, rparen, t.f) } } // Normally the name is something defined in the file (like another // macro) which we get the declaration from using getASTDeclName. // This ensures that names that are only referenced inside a macro are // still getting defined. if cursor, ok := t.f.names[t.curValue]; ok { return &ast.Ident{ NamePos: t.curPos, Name: t.f.getASTDeclName(t.curValue, cursor, false), }, nil } } // t.f is nil during testing. This is a fallback. return &ast.Ident{ NamePos: t.curPos, Name: "C." + t.curValue, }, nil } func parseBasicLit(t *tokenizer) (ast.Expr, *scanner.Error) { return &ast.BasicLit{ ValuePos: t.curPos, Kind: t.curToken, Value: t.curValue, }, nil } func parseParenExpr(t *tokenizer) (ast.Expr, *scanner.Error) { lparen := t.curPos t.Next() x, err := parseConstExpr(t, precedenceLowest) if err != nil { return nil, err } t.Next() if t.curToken != token.RPAREN { return nil, unexpectedToken(t, token.RPAREN) } expr := &ast.ParenExpr{ Lparen: lparen, X: x, Rparen: t.curPos, } return expr, nil } func parseBinaryExpr(t *tokenizer, left ast.Expr) (ast.Expr, *scanner.Error) { expression := &ast.BinaryExpr{ X: left, Op: t.curToken, OpPos: t.curPos, } precedence := precedences[t.curToken] t.Next() right, err := parseConstExpr(t, precedence) expression.Y = right return expression, err } func parseUnaryExpr(t *tokenizer) (ast.Expr, *scanner.Error) { expression := &ast.UnaryExpr{ OpPos: t.curPos, Op: t.curToken, } t.Next() x, err := parseConstExpr(t, precedencePrefix) expression.X = x return expression, err } // unexpectedToken returns an error of the form "unexpected token FOO, expected // BAR". func unexpectedToken(t *tokenizer, expected token.Token) *scanner.Error { return &scanner.Error{ Pos: t.fset.Position(t.curPos), Msg: fmt.Sprintf("unexpected token %s, expected %s", t.curToken, expected), } } // tokenizer reads C source code and converts it to Go tokens. type tokenizer struct { f *cgoFile curPos, peekPos token.Pos fset *token.FileSet curToken, peekToken token.Token curValue, peekValue string buf string params map[string]ast.Expr } // newTokenizer initializes a new tokenizer, positioned at the first token in // the string. func newTokenizer(start token.Pos, fset *token.FileSet, buf string, f *cgoFile) *tokenizer { t := &tokenizer{ f: f, peekPos: start, fset: fset, buf: buf, peekToken: token.ILLEGAL, params: make(map[string]ast.Expr), } // Parse the first two tokens (cur and peek). t.Next() t.Next() return t } // Next consumes the next token in the stream. There is no return value, read // the next token from the pos, token and value properties. func (t *tokenizer) Next() { // The previous peek is now the current token. t.curPos = t.peekPos t.curToken = t.peekToken t.curValue = t.peekValue // Parse the next peek token. if t.peekPos != token.NoPos { t.peekPos += token.Pos(len(t.curValue)) } for { if len(t.buf) == 0 { t.peekToken = token.EOF return } c := t.buf[0] switch { case c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v': // Skip whitespace. // Based on this source, not sure whether it represents C whitespace: // https://en.cppreference.com/w/cpp/string/byte/isspace if t.peekPos != token.NoPos { t.peekPos++ } t.buf = t.buf[1:] case len(t.buf) >= 2 && (string(t.buf[:2]) == "||" || string(t.buf[:2]) == "&&" || string(t.buf[:2]) == "<<" || string(t.buf[:2]) == ">>"): // Two-character tokens. switch c { case '&': t.peekToken = token.LAND case '|': t.peekToken = token.LOR case '<': t.peekToken = token.SHL case '>': t.peekToken = token.SHR default: panic("unreachable") } t.peekValue = t.buf[:2] t.buf = t.buf[2:] return case c == '(' || c == ')' || c == ',' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '&' || c == '|' || c == '^': // Single-character tokens. // TODO: ++ (increment) and -- (decrement) operators. switch c { case '(': t.peekToken = token.LPAREN case ')': t.peekToken = token.RPAREN case ',': t.peekToken = token.COMMA case '+': t.peekToken = token.ADD case '-': t.peekToken = token.SUB case '*': t.peekToken = token.MUL case '/': t.peekToken = token.QUO case '%': t.peekToken = token.REM case '&': t.peekToken = token.AND case '|': t.peekToken = token.OR case '^': t.peekToken = token.XOR } t.peekValue = t.buf[:1] t.buf = t.buf[1:] return case c >= '0' && c <= '9': // Numeric constant (int, float, etc.). // Find the last non-numeric character. tokenLen := len(t.buf) hasDot := false for i, c := range t.buf { if c == '.' { hasDot = true } if c >= '0' && c <= '9' || c == '.' || c == '_' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' { tokenLen = i + 1 } else { break } } t.peekValue = t.buf[:tokenLen] t.buf = t.buf[tokenLen:] if hasDot { // Integer constants are more complicated than this but this is // a close approximation. // https://en.cppreference.com/w/cpp/language/integer_literal t.peekToken = token.FLOAT t.peekValue = strings.TrimRight(t.peekValue, "f") } else { t.peekToken = token.INT t.peekValue = strings.TrimRight(t.peekValue, "uUlL") } return case c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' || c == '_': // Identifier. Find all remaining tokens that are part of this // identifier. tokenLen := len(t.buf) for i, c := range t.buf { if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' || c == '_' { tokenLen = i + 1 } else { break } } t.peekValue = t.buf[:tokenLen] t.buf = t.buf[tokenLen:] t.peekToken = token.IDENT return case c == '"': // String constant. Find the first '"' character that is not // preceded by a backslash. escape := false tokenLen := len(t.buf) for i, c := range t.buf { if i != 0 && c == '"' && !escape { tokenLen = i + 1 break } if !escape { escape = c == '\\' } } t.peekToken = token.STRING t.peekValue = t.buf[:tokenLen] t.buf = t.buf[tokenLen:] return case c == '\'': // Char (rune) constant. Find the first '\'' character that is not // preceded by a backslash. escape := false tokenLen := len(t.buf) for i, c := range t.buf { if i != 0 && c == '\'' && !escape { tokenLen = i + 1 break } if !escape { escape = c == '\\' } } t.peekToken = token.CHAR t.peekValue = t.buf[:tokenLen] t.buf = t.buf[tokenLen:] return default: t.peekToken = token.ILLEGAL return } } }