]> git.deglebe.com Git - barec/barec.git/commitdiff
initial lexer + parser, TODO: fix assignment kind
authorThomas Bruce <tdb@tdbio.me>
Wed, 19 Feb 2025 21:25:30 +0000 (16:25 -0500)
committerThomas Bruce <tdb@tdbio.me>
Wed, 19 Feb 2025 21:25:30 +0000 (16:25 -0500)
Makefile [new file with mode: 0644]
README.md [new file with mode: 0644]
lexer.c [new file with mode: 0644]
lexer.h [new file with mode: 0644]
parser.c [new file with mode: 0644]
tests/test01.bc [new file with mode: 0644]
tests/test02.bc [new file with mode: 0644]
tests/test03.bc [new file with mode: 0644]
tests/test04.bc [new file with mode: 0644]
tests/test05.bc [new file with mode: 0644]

diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..3e68405
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+# Build rules for barec0, the bareC front-end (lexer + parser).
+CC = tcc
+# _POSIX_C_SOURCE exposes POSIX declarations such as strdup() under -std=c99.
+CFLAGS += -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Werror -std=c99 -g
+TARGET = barec0
+
+# `all` and `clean` name actions, not files: declare them phony so a stray
+# file with either name cannot make the rule look up to date.
+.PHONY: all clean
+
+all: $(TARGET)
+
+$(TARGET): lexer.o parser.o
+       $(CC) $(CFLAGS) -o $(TARGET) lexer.o parser.o
+
+lexer.o: lexer.c lexer.h
+       $(CC) $(CFLAGS) -c lexer.c
+
+parser.o: parser.c lexer.h
+       $(CC) $(CFLAGS) -c parser.c
+
+clean:
+       rm -f *.o $(TARGET)
diff --git a/README.md b/README.md
new file mode 100644 (file)
index 0000000..260165f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# bareC
+
+a c dialect intended for low-level systems programming
+
+# current milestone
+
+super basic AST processing, and that's it! this isn't even a usable language
diff --git a/lexer.c b/lexer.c
new file mode 100644 (file)
index 0000000..d259454
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,146 @@
+/* lexer.c
+ * bareC lexer implementation (c source)
+ */ 
+
+#include "lexer.h"
+
+/* global state for lexer:
+ *   g_currentChar - one-character lookahead, as returned by fgetc() (may be EOF)
+ *   g_input       - source stream, opened by initLexer()
+ */
+static int g_currentChar = ' ';
+static FILE* g_input = NULL;
+
+/* advance: load the next character of g_input into g_currentChar (EOF at end) */
+static void advance(void) {
+       g_currentChar = fgetc(g_input);
+}
+
+/* isEOF: true when the current character is the end-of-file marker */
+static bool isEOF(void) {
+       return g_currentChar == EOF;
+}
+
+/* skipWhitespace: advance until the current character is not whitespace */
+static void skipWhitespace(void) {
+       while (isspace(g_currentChar)) {
+               advance();
+       }
+}
+
+/* skipSingleLineComment: called with g_currentChar on the first `/` of `//`;
+ * discards the rest of the line, including the terminating newline if present
+ */
+static void skipSingleLineComment(void) {
+       advance(); // now on the second `/`
+       for (;;) {
+               if (isEOF()) return;              // comment ran to end of file
+               if (g_currentChar == '\n') break;
+               advance();
+       }
+       advance(); // step past the newline
+}
+
+/* identifier helpers: an identifier starts with a letter or `_` and continues
+ * with letters, digits or `_`
+ */
+static bool isIdentStart(int c) {
+       if (c == '_') return true;
+       return isalpha(c) != 0;
+}
+static bool isIdentChar(int c) {
+       if (c == '_') return true;
+       return isalnum(c) != 0;
+}
+
+/* makeToken: create token of given kind and lexeme.
+ * The lexeme is copied with strdup(), so the returned token does not alias
+ * the caller's buffer; intValue is set to 0. Exits with status 1 if the copy
+ * cannot be allocated, so a returned token never carries a NULL lexeme.
+ */
+static Token makeToken(TokenKind kind, const char* lexeme) {
+       Token t;
+       t.kind = kind;
+       t.lexeme = strdup(lexeme);
+       if (t.lexeme == NULL) {
+               fprintf(stderr, "Err: out of memory\n");
+               exit(1);
+       }
+       t.intValue = 0;
+       return t;
+}
+
+/* makeNumberToken: build a TK_NUMBER token carrying `value`, with a
+ * strdup()-copied lexeme
+ */
+static Token makeNumberToken(int value, const char *lexeme) {
+       Token number = makeToken(TK_NUMBER, lexeme);
+       number.intValue = value;
+       return number;
+}
+
+/* getNextToken: return next token from stream.
+ *
+ * Skips whitespace and `//` comments, then classifies the input starting at
+ * g_currentChar. On return g_currentChar holds the first character after the
+ * token. The token's lexeme is allocated by makeToken / makeNumberToken.
+ * Returns a TK_EOF token at end of input and a TK_UNKNOWN token for any
+ * character that no rule below matches.
+ */
+Token getNextToken(void) {
+       skipWhitespace();
+
+       if (isEOF()) return makeToken(TK_EOF, "EOF");
+
+       /* handle comments: `/` check next char also equals `/` */
+       if (g_currentChar == '/') {
+               /* peek one character ahead, then push it back unread */
+               int nextChar = fgetc(g_input);
+               ungetc(nextChar, g_input);
+               if (nextChar == '/') {
+                       skipSingleLineComment();
+                       return getNextToken();
+               } else {
+                       advance();
+                       return makeToken(TK_SLASH, "/");
+               }
+       }
+
+       /* Handle operators and punctuation */
+       if (g_currentChar == '=') {
+               int nextChar = fgetc(g_input);
+               if (nextChar == '=') {
+                       /* the fgetc above already consumed the second '=', so a
+                        * single advance() loads the character after "==";
+                        * advancing twice would drop it (e.g. the `b` in `a==b`)
+                        */
+                       advance();
+                       return makeToken(TK_EQ, "==");
+               } else {
+                       ungetc(nextChar, g_input);
+                       advance();
+                       return makeToken(TK_ASSIGN, "=");
+               }
+       }
+
+       if (g_currentChar == '+') { advance(); return makeToken(TK_PLUS, "+");  }
+       if (g_currentChar == '-') { advance(); return makeToken(TK_MINUS, "-");  }
+       if (g_currentChar == '*') { advance(); return makeToken(TK_STAR, "*");  }
+       if (g_currentChar == '(') { advance(); return makeToken(TK_LPAREN, "("); }
+       if (g_currentChar == ')') { advance(); return makeToken(TK_RPAREN, ")"); }
+       if (g_currentChar == '{') { advance(); return makeToken(TK_LBRACE, "{"); }
+       if (g_currentChar == '}') { advance(); return makeToken(TK_RBRACE, "}"); }
+       if (g_currentChar == ';') { advance(); return makeToken(TK_SEMICOLON, ";"); }
+       if (g_currentChar == '>') { advance(); return makeToken(TK_GT, ">"); }
+
+       /* handle numeric literal: at most 63 digits are collected per token;
+        * any further digits start a new number token on the next call
+        */
+       if (isdigit(g_currentChar)) {
+               char buffer[64];
+               int idx = 0;
+               while (isdigit(g_currentChar) && idx < 63) {
+                       buffer[idx++] = (char)g_currentChar;
+                       advance();
+               }
+               buffer[idx] = '\0';
+               /* NOTE(review): atoi() does not report out-of-range values */
+               int value = atoi(buffer);
+               return makeNumberToken(value, buffer);
+       }
+
+       /* handle ident or keyword: at most 127 characters per token */
+       if (isIdentStart(g_currentChar)) {
+               char buffer[128];
+               int idx = 0;
+               while (isIdentChar(g_currentChar) && idx < 127) {
+                       buffer[idx++] = (char)g_currentChar;
+                       advance();
+               }
+               buffer[idx] = '\0';
+
+               if (strcmp(buffer, "int") == 0)
+                       return makeToken(TK_INT, "int");
+               if (strcmp(buffer, "return") == 0)
+                       return makeToken(TK_RETURN, "return");
+               if (strcmp(buffer, "if") == 0)
+                       return makeToken(TK_IF, "if");
+               if (strcmp(buffer, "else") == 0)
+                       return makeToken(TK_ELSE, "else");
+               if (strcmp(buffer, "while") == 0)
+                       return makeToken(TK_WHILE, "while");
+               return makeToken(TK_IDENT, buffer);
+       }
+
+       /* else: unknown; consume the character so the lexer keeps making progress */
+       char unknownLexeme[2] = { (char)g_currentChar, '\0' };
+       advance();
+       return makeToken(TK_UNKNOWN, unknownLexeme);
+}
+
+/* initLexer: open the source file named by argv[1] and load its first
+ * character. Prints a usage line and exits with status 1 when no file
+ * argument is given; prints an error and exits with status 1 when the file
+ * cannot be opened.
+ */
+void initLexer(int argc, char** argv) {
+       const char* path;
+
+       if (argc < 2) {
+               fprintf(stderr, "Usage: %s <source file>\n", argv[0]);
+               exit(1);
+       }
+
+       path = argv[1];
+       g_input = fopen(path, "r");
+       if (g_input == NULL) {
+               fprintf(stderr, "Err: could not open '%s'\n", path);
+               exit(1);
+       }
+
+       advance(); // prime g_currentChar with the first character
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644 (file)
index 0000000..bfd7d19
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,47 @@
+/* lexer.h
+ * bareC lexer header (c source)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdbool.h>
+
+/* what kinds of tokens exist.
+ * The enumerators take their default values (TK_INT is 0, counting up in
+ * declaration order); the parser's expect() reports kinds by this number.
+ */
+typedef enum {
+       TK_INT,         // `int`
+       TK_RETURN,      // `return`
+       TK_IF,          // `if`
+       TK_ELSE,        // `else`
+       TK_WHILE,       // `while`
+       TK_IDENT,       // identifier
+       TK_NUMBER,      // integer literal
+       TK_EQ,          // `==`
+       TK_ASSIGN,      // `=`
+       TK_PLUS,        // `+`
+       TK_MINUS,       // `-`
+       TK_STAR,        // `*`
+       TK_SLASH,       // `/`
+       TK_LPAREN,      // `(`
+       TK_RPAREN,      // `)`
+       TK_LBRACE,      // `{`
+       TK_RBRACE,      // `}`
+       TK_SEMICOLON,   // `;`
+       TK_GT,          // `>`
+       TK_EOF,         // end of input
+       TK_UNKNOWN      // any character the lexer has no rule for
+} TokenKind;
+
+/* token structure, returned by value from getNextToken() */
+typedef struct {
+       TokenKind       kind;
+       char*           lexeme;         // textual representation (strdup'ed by the lexer)
+       int             intValue;       // numeric literal value; 0 for non-number tokens
+} Token;
+
+/* initialize the lexer from the program arguments: argv[1] names the source
+ * file; exits with status 1 if it is missing or cannot be opened
+ */
+void initLexer(int argc, char** argv);
+
+/* return next token from stream; yields a TK_EOF token at end of input */
+Token getNextToken(void);
diff --git a/parser.c b/parser.c
new file mode 100644 (file)
index 0000000..1446792
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,397 @@
+/* parser.c
+ * bareC parser and ast structure (c source)
+ */ 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "lexer.h"
+
+/* ast definitions: the kind tag stored in every ASTNode */
+typedef enum {
+       AST_FUNCDEF,    // function def
+       AST_BLOCK,      // compound statement block
+       AST_RETURN,     // return statement
+       AST_IF,         // if statement
+       AST_WHILE,      // while loop
+       AST_EXPR_STMT,  // expression statement
+       AST_BINARY,     // binary expression
+       AST_NUM,        // numeric literal
+       AST_IDENT,      // identifier
+       AST_DECL        // variable declaration
+} ASTKind;
+
+/* One node type serves every ASTKind; each kind uses only some of the fields
+ * and the rest stay zero/NULL (nodes are allocated with calloc).
+ */
+typedef struct ASTNode ASTNode;
+struct ASTNode {
+       ASTKind kind;
+       ASTNode* left;  // binary lhs; if/while condition; return/expr-stmt expression; decl initializer
+       ASTNode* right; // binary rhs; if "then" statement; while body
+       ASTNode* third; // for if: else
+       ASTNode** blockStatements; // compound blocks (also the program root's function list)
+       int blockCount; // num statements in block
+       char* funcName; // for function definitions
+       ASTNode* funcBody; // body block
+       int intValue;   // numeric literals
+       char* identName; // identifiers, declared variable names, and operator string in binary nodes
+};
+
+/* utilities for creating new ast nodes */
+
+/* newASTNode: allocate a zero-initialized node of the given kind.
+ * Exits with status 1 when the allocation fails, so callers always receive a
+ * valid pointer.
+ */
+static ASTNode* newASTNode(ASTKind kind) {
+       ASTNode* node = calloc(1, sizeof *node);
+       if (node == NULL) {
+               fprintf(stderr, "Err: out of memory\n");
+               exit(1);
+       }
+       node->kind = kind;
+       return node;
+}
+
+/* newBinaryNode: AST_BINARY node over lhs and rhs; the operator text is
+ * copied into identName
+ */
+static ASTNode* newBinaryNode(ASTNode* lhs, ASTNode* rhs, const char* op) {
+       ASTNode* binary = newASTNode(AST_BINARY);
+       binary->identName = strdup(op);
+       binary->right = rhs;
+       binary->left = lhs;
+       return binary;
+}
+
+/* newNumNode: AST_NUM node holding an integer literal */
+static ASTNode* newNumNode(int value) {
+       ASTNode* literal = newASTNode(AST_NUM);
+       literal->intValue = value;
+       return literal;
+}
+
+/* newIdentNode: AST_IDENT node holding its own copy of `name` */
+static ASTNode* newIdentNode(const char* name) {
+       ASTNode* ident = newASTNode(AST_IDENT);
+       ident->identName = strdup(name);
+       return ident;
+}
+
+/* parsing state and utility */
+static Token g_currentToken;   // one-token lookahead
+
+/* advance to the next token.
+ * The lexer strdup()s every lexeme, so the outgoing token's string is
+ * released here; parse functions that keep a name copy it before advancing.
+ */
+static void nextToken(void) {
+       free(g_currentToken.lexeme); // NULL before the first token; free(NULL) is a no-op
+       g_currentToken = getNextToken();
+}
+
+/* match: when the current token has the given kind, consume it and return
+ * true; otherwise leave it in place and return false
+ */
+static bool match(TokenKind kind) {
+       if (g_currentToken.kind != kind)
+               return false;
+       nextToken();
+       return true;
+}
+
+/* expect: consume the current token when it has the given kind; otherwise
+ * report both kinds (as TokenKind numbers) and exit with status 1
+ */
+static void expect(TokenKind kind) {
+       if (g_currentToken.kind != kind) {
+               fprintf(stderr, "Parse err: expected kind %d, got %d\n",
+                               kind, g_currentToken.kind);
+               exit(1);
+       }
+       nextToken();
+}
+
+/* forward decl.: the parse functions call each other recursively (e.g.
+ * parseFactor -> parseExpression), so all are declared before any is defined
+ */
+static ASTNode* parseExpression(void);
+static ASTNode* parseEquality(void);
+static ASTNode* parseAdditive(void);
+static ASTNode* parseTerm(void);
+static ASTNode* parseFactor(void);
+static ASTNode* parseDeclaration(void);
+static ASTNode* parseStatement(void);
+static ASTNode* parseCompoundStatement(void);
+static ASTNode* parseFunctionDefinition(void);
+static ASTNode* parseProgram(void);
+
+/* parsing */
+
+/* program := (function-definition)* EOF
+ *
+ * Returns an AST_BLOCK node whose blockStatements hold one AST_FUNCDEF per
+ * function, in source order. Exits with status 1 if the list cannot grow.
+ */
+static ASTNode* parseProgram(void) {
+       ASTNode* root = newASTNode(AST_BLOCK);
+       root->blockStatements = NULL;
+       root->blockCount = 0;
+       while (g_currentToken.kind != TK_EOF) {
+               ASTNode* func = parseFunctionDefinition();
+               /* grow through a temporary so a failed realloc is caught
+                * instead of being stored and dereferenced
+                */
+               ASTNode** grown = realloc(
+                       root->blockStatements,
+                       sizeof *grown * (size_t)(root->blockCount + 1)
+               );
+               if (grown == NULL) {
+                       fprintf(stderr, "Err: out of memory\n");
+                       exit(1);
+               }
+               grown[root->blockCount] = func;
+               root->blockStatements = grown;
+               root->blockCount++;
+       }
+       return root;
+}
+
+/* function-definition := "int" IDENT "(" ")" compound-statement
+ *
+ * Returns an AST_FUNCDEF node that owns a copy of the function name and the
+ * parsed body block; exits with status 1 on a syntax error.
+ */
+static ASTNode* parseFunctionDefinition(void) {
+       ASTNode* def;
+
+       expect(TK_INT);
+       if (g_currentToken.kind != TK_IDENT) {
+               fprintf(stderr, "Parse error: expected identifier after 'int'\n");
+               exit(1);
+       }
+
+       /* copy the name before advancing past the identifier token */
+       def = newASTNode(AST_FUNCDEF);
+       def->funcName = strdup(g_currentToken.lexeme);
+       nextToken();
+
+       expect(TK_LPAREN);
+       expect(TK_RPAREN);
+       def->funcBody = parseCompoundStatement();
+       return def;
+}
+
+/* compound-statement := "{" statement* "}"
+ *
+ * Returns an AST_BLOCK node holding the statements in source order. Exits
+ * with status 1 on a syntax error or if the statement list cannot grow.
+ */
+static ASTNode* parseCompoundStatement(void) {
+       ASTNode* blockNode = newASTNode(AST_BLOCK);
+       blockNode->blockStatements = NULL;
+       blockNode->blockCount = 0;
+       expect(TK_LBRACE);
+       while (g_currentToken.kind != TK_RBRACE) {
+               ASTNode* stmt = parseStatement();
+               /* grow through a temporary so a failed realloc is caught
+                * instead of being stored and dereferenced
+                */
+               ASTNode** grown = realloc(
+                       blockNode->blockStatements,
+                       sizeof *grown * (size_t)(blockNode->blockCount + 1)
+               );
+               if (grown == NULL) {
+                       fprintf(stderr, "Err: out of memory\n");
+                       exit(1);
+               }
+               grown[blockNode->blockCount] = stmt;
+               blockNode->blockStatements = grown;
+               blockNode->blockCount++;
+       }
+       expect(TK_RBRACE);
+       return blockNode;
+}
+
+/* statement :=
+          return-statement
+        | if-statement
+        | while-statement
+        | compound-statement
+        | declaration
+        | expression-statement        (includes the empty statement ";")
+
+   Dispatches on the current token. match() consumes the keyword as a side
+   effect, so the order of the branches below is significant.
+*/
+static ASTNode* parseStatement(void) {
+       if (match(TK_RETURN)) {
+               ASTNode* node = newASTNode(AST_RETURN);
+               /* the return value is optional: `return;` leaves left NULL */
+               if (g_currentToken.kind != TK_SEMICOLON)
+                       node->left = parseExpression();
+               expect(TK_SEMICOLON);
+               return node;
+       } else if (match(TK_IF)) {
+               ASTNode* node = newASTNode(AST_IF);
+               expect(TK_LPAREN);
+               node->left = parseExpression();
+               expect(TK_RPAREN);
+               node->right = parseStatement();
+               /* an `else` is attached to the `if` being parsed right now */
+               if (match(TK_ELSE))
+                       node->third = parseStatement();
+               return node;
+       } else if (match(TK_WHILE)) {
+               ASTNode* node = newASTNode(AST_WHILE);
+               expect(TK_LPAREN);
+               node->left = parseExpression();
+               expect(TK_RPAREN);
+               node->right = parseStatement();
+               return node;
+       } else if (g_currentToken.kind == TK_LBRACE) {
+               /* not consumed here: parseCompoundStatement expects the `{` itself */
+               return parseCompoundStatement();
+       } else if (g_currentToken.kind == TK_INT) {
+               /* not consumed here: parseDeclaration expects the `int` itself */
+               return parseDeclaration();
+       } else {
+               /* Expression-statement or empty statement */
+               ASTNode* node = newASTNode(AST_EXPR_STMT);
+               if (g_currentToken.kind != TK_SEMICOLON)
+                       node->left = parseExpression();
+               expect(TK_SEMICOLON);
+               return node;
+       }
+}
+
+/* expression := equality ( "=" expression )?
+ *
+ * Assignment binds loosest and is right-associative (`a = b = 1` assigns 1
+ * to b, then to a). It is represented as an AST_BINARY node whose operator
+ * string is "=". The left-hand side must be a plain identifier; anything
+ * else is reported and the parser exits with status 1.
+ */
+static ASTNode* parseExpression(void) {
+       ASTNode* lhs = parseEquality();
+       if (g_currentToken.kind != TK_ASSIGN)
+               return lhs;
+
+       if (lhs->kind != AST_IDENT) {
+               fprintf(stderr, "Parse err: left-hand side of '=' must be an identifier\n");
+               exit(1);
+       }
+       nextToken(); // consume '='
+       ASTNode* rhs = parseExpression();
+       return newBinaryNode(lhs, rhs, "=");
+}
+
+/* equality := additive ( ("==" | ">") additive )*
+ *
+ * Both comparison operators share this precedence level and associate to the
+ * left.
+ */
+static ASTNode* parseEquality(void) {
+       ASTNode* lhs = parseAdditive();
+       for (;;) {
+               const char* opText;
+               if (g_currentToken.kind == TK_EQ)
+                       opText = "==";
+               else if (g_currentToken.kind == TK_GT)
+                       opText = ">";
+               else
+                       break;
+               nextToken(); // consume the operator
+               ASTNode* rhs = parseAdditive();
+               lhs = newBinaryNode(lhs, rhs, opText);
+       }
+       return lhs;
+}
+
+/* additive := term ( ("+" | "-") term )*
+ * Left-associative: each new operator wraps the tree built so far.
+ */
+static ASTNode* parseAdditive(void) {
+       ASTNode* lhs = parseTerm();
+       for (;;) {
+               const char* opText = NULL;
+               if (match(TK_PLUS))
+                       opText = "+";
+               else if (match(TK_MINUS))
+                       opText = "-";
+               if (opText == NULL)
+                       break;
+               ASTNode* rhs = parseTerm();
+               lhs = newBinaryNode(lhs, rhs, opText);
+       }
+       return lhs;
+}
+
+/* term := factor ( ("*" | "/") factor )*
+ * Left-associative: each new operator wraps the tree built so far.
+ */
+static ASTNode* parseTerm(void) {
+       ASTNode* lhs = parseFactor();
+       for (;;) {
+               const char* opText = NULL;
+               if (match(TK_STAR))
+                       opText = "*";
+               else if (match(TK_SLASH))
+                       opText = "/";
+               if (opText == NULL)
+                       break;
+               ASTNode* rhs = parseFactor();
+               lhs = newBinaryNode(lhs, rhs, opText);
+       }
+       return lhs;
+}
+
+/* factor := "(" expression ")" | IDENT | NUMBER
+ * Any other token is reported by its lexeme and the parser exits with status 1.
+ */
+static ASTNode* parseFactor(void) {
+       ASTNode* node;
+
+       switch (g_currentToken.kind) {
+       case TK_LPAREN:
+               nextToken(); // consume '('
+               node = parseExpression();
+               expect(TK_RPAREN);
+               return node;
+       case TK_IDENT:
+               /* copy the name before the token is replaced */
+               node = newIdentNode(g_currentToken.lexeme);
+               nextToken();
+               return node;
+       case TK_NUMBER:
+               node = newNumNode(g_currentToken.intValue);
+               nextToken();
+               return node;
+       default:
+               break;
+       }
+
+       fprintf(stderr, "Parse err: unexpected token '%s' in factor\n", g_currentToken.lexeme);
+       exit(1);
+       return NULL; // unreachable, but silences compiler warnings.
+}
+
+/* declaration := "int" IDENT ( "=" expression )? ";"
+ *
+ * Returns an AST_DECL node: identName is a copy of the variable name and
+ * left is the initializer expression, or NULL when there is none.
+ */
+static ASTNode* parseDeclaration(void) {
+       ASTNode* decl;
+
+       expect(TK_INT);
+       if (g_currentToken.kind != TK_IDENT) {
+               fprintf(stderr, "Parse err: expected identifier in declaration, got '%s'\n", g_currentToken.lexeme);
+               exit(1);
+       }
+
+       /* copy the name before advancing past the identifier token */
+       decl = newASTNode(AST_DECL);
+       decl->identName = strdup(g_currentToken.lexeme);
+       nextToken();
+
+       if (match(TK_ASSIGN))
+               decl->left = parseExpression();
+
+       expect(TK_SEMICOLON);
+       return decl;
+}
+
+
+/* demonstration: ast printing */
+
+/* printIndent: write two spaces to stdout per indent level */
+static void printIndent(int indent) {
+       while (indent-- > 0) {
+               printf("  ");
+       }
+}
+
+/* printAST: recursively dump `node` to stdout, one node per line, indented
+ * two spaces per level starting at `indent`. A NULL node prints nothing, so
+ * optional children can be passed without a check.
+ */
+static void printAST(ASTNode* node, int indent) {
+       if (!node) return;
+       switch (node->kind) {
+               case AST_FUNCDEF:
+                       printIndent(indent);
+                       printf("FunctionDef: name=%s\n", node->funcName);
+                       printAST(node->funcBody, indent + 1);
+                       break;
+               case AST_BLOCK:
+                       printIndent(indent);
+                       printf("{\n");
+                       for (int i = 0; i < node->blockCount; i++)
+                               printAST(node->blockStatements[i], indent + 1);
+                       printIndent(indent);
+                       printf("}\n");
+                       break;
+               case AST_RETURN:
+                       printIndent(indent);
+                       printf("Return\n");
+                       if (node->left)
+                               printAST(node->left, indent + 1);
+                       break;
+               case AST_IF:
+                       printIndent(indent);
+                       printf("If\n");
+                       printIndent(indent + 1);
+                       printf("Condition:\n");
+                       printAST(node->left, indent + 2);
+                       printIndent(indent + 1);
+                       printf("Then:\n");
+                       printAST(node->right, indent + 2);
+                       if (node->third) {
+                               printIndent(indent + 1);
+                               printf("Else:\n");
+                               printAST(node->third, indent + 2);
+                       }
+                       break;
+               case AST_WHILE:
+                       printIndent(indent);
+                       printf("While\n");
+                       printIndent(indent + 1);
+                       printf("Condition:\n");
+                       printAST(node->left, indent + 2);
+                       printIndent(indent + 1);
+                       printf("Body:\n");
+                       printAST(node->right, indent + 2);
+                       break;
+               case AST_EXPR_STMT:
+                       printIndent(indent);
+                       printf("ExprStmt\n");
+                       if (node->left)
+                               printAST(node->left, indent + 1);
+                       break;
+               case AST_BINARY:
+                       printIndent(indent);
+                       printf("BinaryOp (%s)\n", node->identName);
+                       printAST(node->left, indent + 1);
+                       printAST(node->right, indent + 1);
+                       break;
+               case AST_NUM:
+                       printIndent(indent);
+                       printf("Number (%d)\n", node->intValue);
+                       break;
+               case AST_IDENT:
+                       printIndent(indent);
+                       printf("Identifier (%s)\n", node->identName);
+                       break;
+               case AST_DECL:
+                       /* parseDeclaration stores the variable name in identName
+                        * and the optional initializer in left
+                        */
+                       printIndent(indent);
+                       printf("Declaration (%s)\n", node->identName);
+                       printAST(node->left, indent + 1);
+                       break;
+               default:
+                       printIndent(indent);
+                       printf("Unknown AST node kind\n");
+                       break;
+       }
+}
+
+/* main: parse the source file named on the command line and dump its AST to
+ * stdout
+ */
+int main(int argc, char** argv) {
+       ASTNode* program;
+
+       initLexer(argc, argv);  // exits on a missing or unreadable source file
+       nextToken();            // load the first token before parsing starts
+       program = parseProgram();
+
+       puts("=== AST DUMP ===");
+       printAST(program, 0);
+
+       return 0;
+}
diff --git a/tests/test01.bc b/tests/test01.bc
new file mode 100644 (file)
index 0000000..e72a8a0
--- /dev/null
@@ -0,0 +1,4 @@
+int main() {
+       return 42;
+}
+
diff --git a/tests/test02.bc b/tests/test02.bc
new file mode 100644 (file)
index 0000000..addd491
--- /dev/null
@@ -0,0 +1,3 @@
+int main() {
+       return 1 + 2 * (3 - 4) / 5;
+}
diff --git a/tests/test03.bc b/tests/test03.bc
new file mode 100644 (file)
index 0000000..63deb0c
--- /dev/null
@@ -0,0 +1,7 @@
+int main() {
+       if (1) {
+               return 10;
+       } else {
+               return 20;
+       }
+}
diff --git a/tests/test04.bc b/tests/test04.bc
new file mode 100644 (file)
index 0000000..ee360bb
--- /dev/null
@@ -0,0 +1,7 @@
+int main() {
+       int x = 5;
+       while (x > 0) {
+               x = x - 1;
+       }
+       return x;
+}
diff --git a/tests/test05.bc b/tests/test05.bc
new file mode 100644 (file)
index 0000000..51a295b
--- /dev/null
@@ -0,0 +1,7 @@
+int square(int x) {
+       return x * x;
+}
+
+int main() {
+       return square(5);
+}