From: Thomas Bruce
Date: Wed, 19 Feb 2025 21:25:30 +0000 (-0500)
Subject: initial lexer + parser, TODO: fix assignment kind
X-Git-Url: https://git.deglebe.com/non-work/w/static/git-logo.png?a=commitdiff_plain;h=558c5aebdd1d733e32f0c3b4acf10f9fc6e44ad3;p=barec%2Fbarec.git

initial lexer + parser, TODO: fix assignment kind
---

558c5aebdd1d733e32f0c3b4acf10f9fc6e44ad3
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3e68405
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+CC = tcc
+CFLAGS += -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Werror -std=c99 -g
+TARGET = barec0
+
+all: $(TARGET)
+
+$(TARGET): lexer.o parser.o
+	$(CC) $(CFLAGS) -o $(TARGET) lexer.o parser.o
+
+lexer.o: lexer.c lexer.h
+	$(CC) $(CFLAGS) -c lexer.c
+
+parser.o: parser.c lexer.h
+	$(CC) $(CFLAGS) -c parser.c
+
+clean:
+	rm -f *.o $(TARGET)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..260165f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# bareC
+
+a c dialect intended for low-level systems programming
+
+# current milestone
+
+super basic AST processing, and that's it! this isn't even a usable language
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..d259454
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,179 @@
+/* lexer.c
+ * bareC lexer implementation (c source)
+ */
+
+#include "lexer.h"
+
+#include <limits.h>
+
+/* lexeme buffer sizes, including the terminating NUL */
+enum { NUMBER_BUF_SIZE = 64, IDENT_BUF_SIZE = 128 };
+
+/* global state for lexer */
+static int g_currentChar = ' ';
+static FILE* g_input = NULL;
+
+/* advance: read next char from g_input */
+static void advance(void) { g_currentChar = fgetc(g_input); }
+
+/* peekChar: return the char following g_currentChar without consuming it */
+static int peekChar(void) {
+    int c = fgetc(g_input);
+    if (c != EOF) ungetc(c, g_input);
+    return c;
+}
+
+/* isEOF: check if the current char is end-of-file */
+static bool isEOF(void) { return (g_currentChar == EOF); }
+
+/* lexError: report a fatal lexing error and exit */
+static void lexError(const char* msg) {
+    fprintf(stderr, "Lex err: %s\n", msg);
+    exit(1);
+}
+
+/* skipWhitespace: skip over whitespace characters */
+static void skipWhitespace(void) { while (isspace(g_currentChar)) advance(); }
+
+/* skipSingleLineComment: skip a `//` comment through its newline;
+ * expects g_currentChar on the first `/` with the second one unread */
+static void skipSingleLineComment(void) {
+    advance(); // consume second `/`
+    while (!isEOF() && g_currentChar != '\n') advance();
+    if (!isEOF()) advance();
+}
+
+/* identifier helpers */
+static bool isIdentStart(int c) { return (isalpha(c) || c == '_'); }
+static bool isIdentChar(int c) { return (isalnum(c) || c == '_'); }
+
+/* makeToken: create token of given kind; the lexeme is copied to the heap
+ * (exits on allocation failure) and intValue is zeroed */
+static Token makeToken(TokenKind kind, const char* lexeme) {
+    Token t;
+    t.kind = kind;
+    t.lexeme = strdup(lexeme);
+    if (!t.lexeme) lexError("out of memory");
+    t.intValue = 0;
+    return t;
+}
+
+/* makeNumberToken: create token with numeric literal value */
+static Token makeNumberToken(int value, const char* lexeme) {
+    Token t = makeToken(TK_NUMBER, lexeme);
+    t.intValue = value;
+    return t;
+}
+
+/* punctuationKind: map a single-char operator or punctuation mark to its
+ * token kind; anything else is TK_UNKNOWN */
+static TokenKind punctuationKind(int c) {
+    switch (c) {
+        case '+': return TK_PLUS;
+        case '-': return TK_MINUS;
+        case '*': return TK_STAR;
+        case '/': return TK_SLASH;
+        case '(': return TK_LPAREN;
+        case ')': return TK_RPAREN;
+        case '{': return TK_LBRACE;
+        case '}': return TK_RBRACE;
+        case ';': return TK_SEMICOLON;
+        case '>': return TK_GT;
+        default: return TK_UNKNOWN;
+    }
+}
+
+/* lexNumber: lex a decimal literal starting at g_currentChar; exits with an
+ * error when it overflows the lexeme buffer or does not fit in an int */
+static Token lexNumber(void) {
+    char buffer[NUMBER_BUF_SIZE];
+    int idx = 0;
+    int value = 0;
+    while (isdigit(g_currentChar)) {
+        int digit = g_currentChar - '0';
+        /* checked before multiplying: signed overflow is undefined */
+        if (idx >= NUMBER_BUF_SIZE - 1 || value > (INT_MAX - digit) / 10)
+            lexError("integer literal too long or out of range");
+        value = value * 10 + digit;
+        buffer[idx++] = (char)g_currentChar;
+        advance();
+    }
+    buffer[idx] = '\0';
+    return makeNumberToken(value, buffer);
+}
+
+/* lexIdentOrKeyword: lex an identifier starting at g_currentChar and map
+ * reserved words to their keyword kinds; exits if it overflows the buffer */
+static Token lexIdentOrKeyword(void) {
+    static const struct {
+        const char* text;
+        TokenKind kind;
+    } keywords[] = {
+        { "int", TK_INT },
+        { "return", TK_RETURN },
+        { "if", TK_IF },
+        { "else", TK_ELSE },
+        { "while", TK_WHILE },
+    };
+    char buffer[IDENT_BUF_SIZE];
+    int idx = 0;
+    while (isIdentChar(g_currentChar)) {
+        if (idx >= IDENT_BUF_SIZE - 1) lexError("identifier too long");
+        buffer[idx++] = (char)g_currentChar;
+        advance();
+    }
+    buffer[idx] = '\0';
+
+    for (size_t i = 0; i < sizeof keywords / sizeof keywords[0]; i++) {
+        if (strcmp(buffer, keywords[i].text) == 0)
+            return makeToken(keywords[i].kind, buffer);
+    }
+    return makeToken(TK_IDENT, buffer);
+}
+
+/* getNextToken: return next token from stream; the token's lexeme is heap
+ * allocated and should be free()d by whoever ends up holding the token */
+Token getNextToken(void) {
+    /* skip whitespace and any run of `//` comments */
+    for (;;) {
+        skipWhitespace();
+        if (g_currentChar != '/' || peekChar() != '/') break;
+        skipSingleLineComment();
+    }
+
+    if (isEOF()) return makeToken(TK_EOF, "EOF");
+    if (isdigit(g_currentChar)) return lexNumber();
+    if (isIdentStart(g_currentChar)) return lexIdentOrKeyword();
+
+    /* `=` vs `==`: peek rather than read, so no input char is dropped */
+    if (g_currentChar == '=') {
+        if (peekChar() == '=') {
+            advance(); // step onto the second '='
+            advance(); // step past it
+            return makeToken(TK_EQ, "==");
+        }
+        advance();
+        return makeToken(TK_ASSIGN, "=");
+    }
+
+    /* remaining single-char tokens; unrecognized chars become TK_UNKNOWN */
+    char lexeme[2] = { (char)g_currentChar, '\0' };
+    TokenKind kind = punctuationKind(g_currentChar);
+    advance();
+    return makeToken(kind, lexeme);
+}
+
+/* initLexer: open the source file named by argv[1] and read its first char;
+ * prints a message and exits if the argument is missing or the open fails */
+void initLexer(int argc, char** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <source-file>\n", argv[0]);
+        exit(1);
+    }
+    g_input = fopen(argv[1], "r");
+    if (!g_input) {
+        fprintf(stderr, "Err: could not open '%s'\n", argv[1]);
+        exit(1);
+    }
+    advance();
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..bfd7d19
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,52 @@
+/* lexer.h
+ * bareC lexer header (c source)
+ */
+
+#ifndef LEXER_H
+#define LEXER_H
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* what kinds of tokens exist */
+typedef enum {
+    TK_INT,       // `int`
+    TK_RETURN,    // `return`
+    TK_IF,        // `if`
+    TK_ELSE,      // `else`
+    TK_WHILE,     // `while`
+    TK_IDENT,     // identifier
+    TK_NUMBER,    // integer literal
+    TK_EQ,        // `==`
+    TK_ASSIGN,    // `=`
+    TK_PLUS,      // `+`
+    TK_MINUS,     // `-`
+    TK_STAR,      // `*`
+    TK_SLASH,     // `/`
+    TK_LPAREN,    // `(`
+    TK_RPAREN,    // `)`
+    TK_LBRACE,    // `{`
+    TK_RBRACE,    // `}`
+    TK_SEMICOLON, // `;`
+    TK_GT,        // `>`
+    TK_EOF,       // EOF
+    TK_UNKNOWN    // UNKNOWN
+} TokenKind;
+
+/* token structure */
+typedef struct {
+    TokenKind kind;
+    char* lexeme; // textual representation (heap-allocated copy)
+    int intValue; // numeric literal value
+} Token;
+
+/* initialize the lexer; take source filename */
+void initLexer(int argc, char** argv);
+
+/* return next token from stream */
+Token getNextToken(void);
+
+#endif /* LEXER_H */
diff --git a/parser.c b/parser.c
new file mode 100644
index 0000000..1446792
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,403 @@
+/* parser.c
+ * bareC parser and ast structure (c source)
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "lexer.h"
+
+/* ast definitions */
+typedef enum {
+    AST_FUNCDEF,   // function def
+    AST_BLOCK,     // compound statement block
+    AST_RETURN,    // return statement
+    AST_IF,        // if statement
+    AST_WHILE,     // while loop
+    AST_EXPR_STMT, // expression statement
+    AST_BINARY,    // binary expression
+    AST_NUM,       // numeric literal
+    AST_IDENT,     // identifier
+    AST_DECL       // variable declaration
+} ASTKind;
+
+typedef struct ASTNode ASTNode;
+struct ASTNode {
+    ASTKind kind;
+    ASTNode* left;             // for binary expr, condition of if, etc.
+    ASTNode* right;            // second child
+    ASTNode* third;            // for if: else
+    ASTNode** blockStatements; // compound blocks
+    int blockCount;            // num statements in block
+    char* funcName;            // for function definitions
+    ASTNode* funcBody;         // body block
+    int intValue;              // numeric literals
+    char* identName;           // identifiers and operator string in binary nodes
+};
+
+/* fatal: report an unrecoverable error and exit */
+static void fatal(const char* msg) {
+    fprintf(stderr, "Err: %s\n", msg);
+    exit(1);
+}
+
+/* copyString: strdup that exits instead of returning NULL */
+static char* copyString(const char* s) {
+    char* copy = strdup(s);
+    if (!copy) fatal("out of memory");
+    return copy;
+}
+
+/* utilities for creating new ast nodes */
+
+/* newASTNode: allocate a zero-initialized node of the given kind */
+static ASTNode* newASTNode(ASTKind kind) {
+    ASTNode* node = calloc(1, sizeof *node);
+    if (!node) fatal("out of memory");
+    node->kind = kind;
+    return node;
+}
+
+/* newBinaryNode: binary expression `lhs op rhs`; the op text is copied */
+static ASTNode* newBinaryNode(ASTNode* lhs, ASTNode* rhs, const char* op) {
+    ASTNode* node = newASTNode(AST_BINARY);
+    node->left = lhs;
+    node->right = rhs;
+    node->identName = copyString(op);
+    return node;
+}
+
+/* newNumNode: numeric literal node */
+static ASTNode* newNumNode(int value) {
+    ASTNode* node = newASTNode(AST_NUM);
+    node->intValue = value;
+    return node;
+}
+
+/* newIdentNode: identifier node; the name is copied */
+static ASTNode* newIdentNode(const char* name) {
+    ASTNode* node = newASTNode(AST_IDENT);
+    node->identName = copyString(name);
+    return node;
+}
+
+/* appendStatement: grow block's statement array by one and store stmt */
+static void appendStatement(ASTNode* block, ASTNode* stmt) {
+    /* realloc into a temporary so the old array is not lost on failure */
+    size_t newCount = (size_t)block->blockCount + 1;
+    ASTNode** grown = realloc(block->blockStatements, newCount * sizeof *grown);
+    if (!grown) fatal("out of memory");
+    grown[block->blockCount++] = stmt;
+    block->blockStatements = grown;
+}
+
+/* parsing state and utility */
+static Token g_currentToken;
+
+/* advance to the next token, releasing the previous token's lexeme */
+static void nextToken(void) {
+    free(g_currentToken.lexeme);
+    g_currentToken = getNextToken();
+}
+
+/* if match, consume */
+static bool match(TokenKind kind) {
+    if (g_currentToken.kind != kind) return false;
+    nextToken();
+    return true;
+}
+
+/* expect to be of a given kind, exit on error */
+static void expect(TokenKind kind) {
+    if (g_currentToken.kind != kind) {
+        fprintf(stderr, "Parse err: expected kind %d, got %d\n",
+                (int)kind, (int)g_currentToken.kind);
+        exit(1);
+    }
+    nextToken();
+}
+
+/* forward decl. */
+static ASTNode* parseExpression(void);
+static ASTNode* parseEquality(void);
+static ASTNode* parseAdditive(void);
+static ASTNode* parseTerm(void);
+static ASTNode* parseFactor(void);
+static ASTNode* parseDeclaration(void);
+static ASTNode* parseStatement(void);
+static ASTNode* parseCompoundStatement(void);
+static ASTNode* parseFunctionDefinition(void);
+static ASTNode* parseProgram(void);
+
+/* parsing */
+
+/* program := (function-definition)* EOF
+ * returns an AST_BLOCK whose statements are the function definitions */
+static ASTNode* parseProgram(void) {
+    ASTNode* root = newASTNode(AST_BLOCK);
+    while (g_currentToken.kind != TK_EOF)
+        appendStatement(root, parseFunctionDefinition());
+    return root;
+}
+
+/* function-definition := "int" IDENT "(" ")" compound-statement */
+static ASTNode* parseFunctionDefinition(void) {
+    expect(TK_INT);
+    if (g_currentToken.kind != TK_IDENT) {
+        fprintf(stderr, "Parse error: expected identifier after 'int'\n");
+        exit(1);
+    }
+    /* copy the name before nextToken() frees the token's lexeme */
+    char* funcName = copyString(g_currentToken.lexeme);
+    nextToken();
+    expect(TK_LPAREN);
+    expect(TK_RPAREN);
+    ASTNode* funcNode = newASTNode(AST_FUNCDEF);
+    funcNode->funcName = funcName;
+    funcNode->funcBody = parseCompoundStatement();
+    return funcNode;
+}
+
+/* compound-statement := "{" statement* "}" */
+static ASTNode* parseCompoundStatement(void) {
+    ASTNode* blockNode = newASTNode(AST_BLOCK);
+    expect(TK_LBRACE);
+    while (g_currentToken.kind != TK_RBRACE)
+        appendStatement(blockNode, parseStatement());
+    expect(TK_RBRACE);
+    return blockNode;
+}
+
+/* statement :=
+      return-statement
+    | if-statement
+    | while-statement
+    | compound-statement
+    | declaration
+    | expression-statement
+*/
+static ASTNode* parseStatement(void) {
+    if (match(TK_RETURN)) {
+        ASTNode* node = newASTNode(AST_RETURN);
+        if (g_currentToken.kind != TK_SEMICOLON)
+            node->left = parseExpression();
+        expect(TK_SEMICOLON);
+        return node;
+    }
+    if (match(TK_IF)) {
+        ASTNode* node = newASTNode(AST_IF);
+        expect(TK_LPAREN);
+        node->left = parseExpression();
+        expect(TK_RPAREN);
+        node->right = parseStatement();
+        if (match(TK_ELSE))
+            node->third = parseStatement();
+        return node;
+    }
+    if (match(TK_WHILE)) {
+        ASTNode* node = newASTNode(AST_WHILE);
+        expect(TK_LPAREN);
+        node->left = parseExpression();
+        expect(TK_RPAREN);
+        node->right = parseStatement();
+        return node;
+    }
+    if (g_currentToken.kind == TK_LBRACE)
+        return parseCompoundStatement();
+    if (g_currentToken.kind == TK_INT)
+        return parseDeclaration();
+
+    /* expression-statement or empty statement */
+    ASTNode* node = newASTNode(AST_EXPR_STMT);
+    if (g_currentToken.kind != TK_SEMICOLON)
+        node->left = parseExpression();
+    expect(TK_SEMICOLON);
+    return node;
+}
+
+/* expression := equality */
+static ASTNode* parseExpression(void) {
+    return parseEquality();
+}
+
+/* equality := additive ( ("==" | ">") additive )* */
+static ASTNode* parseEquality(void) {
+    ASTNode* node = parseAdditive();
+    while (g_currentToken.kind == TK_EQ || g_currentToken.kind == TK_GT) {
+        const char* op = (g_currentToken.kind == TK_EQ) ? "==" : ">";
+        nextToken();
+        node = newBinaryNode(node, parseAdditive(), op);
+    }
+    return node;
+}
+
+/* additive := term ( ("+" | "-") term )* */
+static ASTNode* parseAdditive(void) {
+    ASTNode* node = parseTerm();
+    for (;;) {
+        const char* op;
+        if (match(TK_PLUS)) op = "+";
+        else if (match(TK_MINUS)) op = "-";
+        else break;
+        node = newBinaryNode(node, parseTerm(), op);
+    }
+    return node;
+}
+
+/* term := factor ( ("*" | "/") factor )* */
+static ASTNode* parseTerm(void) {
+    ASTNode* node = parseFactor();
+    for (;;) {
+        const char* op;
+        if (match(TK_STAR)) op = "*";
+        else if (match(TK_SLASH)) op = "/";
+        else break;
+        node = newBinaryNode(node, parseFactor(), op);
+    }
+    return node;
+}
+
+/* factor := "(" expression ")" | IDENT | NUMBER */
+static ASTNode* parseFactor(void) {
+    if (match(TK_LPAREN)) {
+        ASTNode* node = parseExpression();
+        expect(TK_RPAREN);
+        return node;
+    }
+    if (g_currentToken.kind == TK_IDENT) {
+        ASTNode* node = newIdentNode(g_currentToken.lexeme);
+        nextToken();
+        return node;
+    }
+    if (g_currentToken.kind == TK_NUMBER) {
+        ASTNode* node = newNumNode(g_currentToken.intValue);
+        nextToken();
+        return node;
+    }
+    fprintf(stderr, "Parse err: unexpected token '%s' in factor\n", g_currentToken.lexeme);
+    exit(1);
+    return NULL; // unreachable, but silences compiler warnings.
+}
+
+/* declaration := "int" IDENT ( "=" expression )? ";"
+ * the initializer, when present, is stored in the node's left child */
+static ASTNode* parseDeclaration(void) {
+    expect(TK_INT);
+
+    if (g_currentToken.kind != TK_IDENT) {
+        fprintf(stderr, "Parse err: expected identifier in declaration, got '%s'\n", g_currentToken.lexeme);
+        exit(1);
+    }
+    ASTNode* decl = newASTNode(AST_DECL);
+    decl->identName = copyString(g_currentToken.lexeme);
+    nextToken();
+
+    if (match(TK_ASSIGN))
+        decl->left = parseExpression();
+
+    expect(TK_SEMICOLON);
+    return decl;
+}
+
+/* demonstration: ast printing */
+
+/* printIndent: emit the leading whitespace for the given nesting depth */
+static void printIndent(int indent) {
+    for (int i = 0; i < indent; i++)
+        printf(" ");
+}
+
+/* printAST: dump the subtree rooted at node to stdout, one node per line,
+ * indented by nesting depth; a NULL node prints nothing */
+static void printAST(ASTNode* node, int indent) {
+    if (!node) return;
+    switch (node->kind) {
+        case AST_FUNCDEF:
+            printIndent(indent);
+            printf("FunctionDef: name=%s\n", node->funcName);
+            printAST(node->funcBody, indent + 1);
+            break;
+        case AST_BLOCK:
+            printIndent(indent);
+            printf("{\n");
+            for (int i = 0; i < node->blockCount; i++)
+                printAST(node->blockStatements[i], indent + 1);
+            printIndent(indent);
+            printf("}\n");
+            break;
+        case AST_RETURN:
+            printIndent(indent);
+            printf("Return\n");
+            printAST(node->left, indent + 1);
+            break;
+        case AST_IF:
+            printIndent(indent);
+            printf("If\n");
+            printIndent(indent + 1);
+            printf("Condition:\n");
+            printAST(node->left, indent + 2);
+            printIndent(indent + 1);
+            printf("Then:\n");
+            printAST(node->right, indent + 2);
+            if (node->third) {
+                printIndent(indent + 1);
+                printf("Else:\n");
+                printAST(node->third, indent + 2);
+            }
+            break;
+        case AST_WHILE:
+            printIndent(indent);
+            printf("While\n");
+            printIndent(indent + 1);
+            printf("Condition:\n");
+            printAST(node->left, indent + 2);
+            printIndent(indent + 1);
+            printf("Body:\n");
+            printAST(node->right, indent + 2);
+            break;
+        case AST_EXPR_STMT:
+            printIndent(indent);
+            printf("ExprStmt\n");
+            printAST(node->left, indent + 1);
+            break;
+        case AST_BINARY:
+            printIndent(indent);
+            printf("BinaryOp (%s)\n", node->identName);
+            printAST(node->left, indent + 1);
+            printAST(node->right, indent + 1);
+            break;
+        case AST_NUM:
+            printIndent(indent);
+            printf("Number (%d)\n", node->intValue);
+            break;
+        case AST_IDENT:
+            printIndent(indent);
+            printf("Identifier (%s)\n", node->identName);
+            break;
+        case AST_DECL:
+            /* left holds the optional initializer expression */
+            printIndent(indent);
+            printf("Decl (%s)\n", node->identName);
+            printAST(node->left, indent + 1);
+            break;
+        default:
+            printIndent(indent);
+            printf("Unknown AST node kind\n");
+            break;
+    }
+}
+
+int main(int argc, char** argv) {
+    /* initialize the lexer with the source filename */
+    initLexer(argc, argv);
+    nextToken(); // prime the token stream
+
+    ASTNode* root = parseProgram();
+
+    printf("=== AST DUMP ===\n");
+    printAST(root, 0);
+
+    return 0;
+}
diff --git a/tests/test01.bc b/tests/test01.bc
new file mode 100644
index 0000000..e72a8a0
--- /dev/null
+++ b/tests/test01.bc
@@ -0,0 +1,4 @@
+int main() {
+    return 42;
+}
+
diff --git a/tests/test02.bc b/tests/test02.bc
new file mode 100644
index 0000000..addd491
--- /dev/null
+++ b/tests/test02.bc
@@ -0,0 +1,3 @@
+int main() {
+    return 1 + 2 * (3 - 4) / 5;
+}
diff --git a/tests/test03.bc b/tests/test03.bc
new file mode 100644
index 0000000..63deb0c
--- /dev/null
+++ b/tests/test03.bc
@@ -0,0 +1,7 @@
+int main() {
+    if (1) {
+        return 10;
+    } else {
+        return 20;
+    }
+}
diff --git a/tests/test04.bc b/tests/test04.bc
new file mode 100644
index 0000000..ee360bb
--- /dev/null
+++ b/tests/test04.bc
@@ -0,0 +1,7 @@
+int main() {
+    int x = 5;
+    while (x > 0) {
+        x = x - 1;
+    }
+    return x;
+}
diff --git a/tests/test05.bc b/tests/test05.bc
new file mode 100644
index 0000000..51a295b
--- /dev/null
+++ b/tests/test05.bc
@@ -0,0 +1,7 @@
+int square(int x) {
+    return x * x;
+}
+
+int main() {
+    return square(5);
+}