From: Thomas Bruce <tdb@tdbio.me>
Date: Wed, 19 Feb 2025 21:25:30 +0000 (-0500)
Subject: initial lexer + parser, TODO: fix assignment kind
X-Git-Url: https://git.deglebe.com/non-work/m/p/static/w/index.html?a=commitdiff_plain;h=558c5aebdd1d733e32f0c3b4acf10f9fc6e44ad3;p=barec%2Fbarec.git

initial lexer + parser, TODO: fix assignment kind
---

558c5aebdd1d733e32f0c3b4acf10f9fc6e44ad3
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3e68405
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,17 @@
+CC = tcc
+CFLAGS += -D_POSIX_C_SOURCE=200809L -Wall -Wextra -Werror -std=c99 -g
+TARGET = barec0
+
+all: $(TARGET)
+
+$(TARGET): lexer.o parser.o
+	$(CC) $(CFLAGS) -o $(TARGET) lexer.o parser.o
+
+lexer.o: lexer.c lexer.h
+	$(CC) $(CFLAGS) -c lexer.c
+
+parser.o: parser.c lexer.h
+	$(CC) $(CFLAGS) -c parser.c
+
+clean:
+	rm -f *.o $(TARGET)
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..260165f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,7 @@
+# bareC
+
+a c dialect intended for low-level systems programming
+
+# current milestone
+
+super basic AST processing, and that's it! this isn't even a usable language
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..d259454
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,146 @@
+/* lexer.c
+ * bareC lexer implementation (c source)
+ */ 
+
+#include "lexer.h"
+
+/* global state for lexer */
+static int g_currentChar = ' ';
+static FILE* g_input = NULL;
+
+/* advance: read next char from g_input */
+static void advance(void) { g_currentChar = fgetc(g_input); }
+
+/* isEOF: check if the current character is end-of-file */
+static bool isEOF(void) { return (g_currentChar == EOF); }
+
+/* skipWhitespace: skip over whitespace characters */
+static void skipWhitespace(void) { while (isspace(g_currentChar)) advance(); }
+
+/* skipSingleLineComment: discard the rest of a `//` comment, including its newline */
+static void skipSingleLineComment(void) {
+	advance(); /* g_currentChar becomes the second `/` */
+	while (g_currentChar != EOF && g_currentChar != '\n') advance();
+	if (g_currentChar != EOF) advance(); /* step past the terminating newline */
+}
+
+/* identifier helpers */
+static bool isIdentStart(int c) { return (isalpha(c) || c == '_'); }
+static bool isIdentChar(int c) { return (isalnum(c) || c == '_'); }
+
+/* makeToken: create token of given kind and lexeme */
+static Token makeToken(TokenKind kind, const char* lexeme) {
+	Token t;
+	t.kind = kind;
+	t.lexeme = strdup(lexeme);
+	t.intValue = 0;
+	return t;
+}
+
+/* makeNumberToken: create token with numeric literal value */
+static Token makeNumberToken(int value, const char *lexeme) {
+	Token t;
+	t.kind = TK_NUMBER;
+	t.intValue = value;
+	t.lexeme = strdup(lexeme);
+	return t;
+}
+
+/* getNextToken: skip whitespace and `//` comments, then return the next token (lexeme is strdup'd) */
+Token getNextToken(void) {
+	skipWhitespace();
+
+	if (isEOF()) return makeToken(TK_EOF, "EOF");
+
+	/* handle comments: peek at the next char to tell `//` from a lone `/` */
+	if (g_currentChar == '/') {
+		int nextChar = fgetc(g_input);
+		ungetc(nextChar, g_input);
+		if (nextChar == '/') {
+			skipSingleLineComment();
+			return getNextToken();
+		} else {
+			advance();
+			return makeToken(TK_SLASH, "/");
+		}
+	}
+
+	/* handle operators and punctuation; `=` needs one char of lookahead */
+	if (g_currentChar == '=') {
+		int nextChar = fgetc(g_input);
+		if (nextChar == '=') {
+			/* the second '=' is already consumed; one advance() loads what follows */
+			advance();
+			return makeToken(TK_EQ, "==");
+		} else {
+			ungetc(nextChar, g_input);
+			advance();
+			return makeToken(TK_ASSIGN, "=");
+		}
+	}
+
+	if (g_currentChar == '+') { advance(); return makeToken(TK_PLUS, "+");  }
+	if (g_currentChar == '-') { advance(); return makeToken(TK_MINUS, "-");  }
+	if (g_currentChar == '*') { advance(); return makeToken(TK_STAR, "*");  }
+	if (g_currentChar == '(') { advance(); return makeToken(TK_LPAREN, "("); }
+	if (g_currentChar == ')') { advance(); return makeToken(TK_RPAREN, ")"); }
+	if (g_currentChar == '{') { advance(); return makeToken(TK_LBRACE, "{"); }
+	if (g_currentChar == '}') { advance(); return makeToken(TK_RBRACE, "}"); }
+	if (g_currentChar == ';') { advance(); return makeToken(TK_SEMICOLON, ";"); }
+	if (g_currentChar == '>') { advance(); return makeToken(TK_GT, ">"); }
+
+	/* handle numeric literal */
+	if (isdigit(g_currentChar)) {
+		char buffer[64];
+		int idx = 0;
+		while (isdigit(g_currentChar) && idx < 63) {
+			buffer[idx++] = (char)g_currentChar;
+			advance();
+		}
+		buffer[idx] = '\0';
+		int value = atoi(buffer);
+		return makeNumberToken(value, buffer);
+	}
+
+	/* handle ident or keyword */
+	if (isIdentStart(g_currentChar)) {
+		char buffer[128];
+		int idx = 0;
+		while (isIdentChar(g_currentChar) && idx < 127) {
+			buffer[idx++] = (char)g_currentChar;
+			advance();
+		}
+		buffer[idx] = '\0';
+
+		if (strcmp(buffer, "int") == 0)
+			return makeToken(TK_INT, "int");
+		if (strcmp(buffer, "return") == 0)
+			return makeToken(TK_RETURN, "return");
+		if (strcmp(buffer, "if") == 0)
+			return makeToken(TK_IF, "if");
+		if (strcmp(buffer, "else") == 0)
+			return makeToken(TK_ELSE, "else");
+		if (strcmp(buffer, "while") == 0)
+			return makeToken(TK_WHILE, "while");
+		return makeToken(TK_IDENT, buffer);
+	}
+
+	/* else: unknown */
+	char unknownLexeme[2] = { (char)g_currentChar, '\0' };
+	advance();
+	return makeToken(TK_UNKNOWN, unknownLexeme);
+}
+
+/* initLexer: open the source file named by argv[1] and read its first char; exits on failure */
+void initLexer(int argc, char** argv) {
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <source file>\n", argv[0]);
+		exit(1);
+	}
+	g_input = fopen(argv[1], "r");
+	if (!g_input) {
+		fprintf(stderr, "Err: could not open '%s'\n", argv[1]);
+		exit(1);
+	}
+	advance();
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..bfd7d19
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,47 @@
+/* lexer.h
+ * bareC lexer header (c source)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdbool.h>
+
+/* what kinds of tokens exist */
+typedef enum {
+	TK_INT,		// `int`
+	TK_RETURN,	// `return`
+	TK_IF,		// `if`
+	TK_ELSE,	// `else`
+	TK_WHILE,	// `while`
+	TK_IDENT,	// identifier
+	TK_NUMBER,	// integer literal
+	TK_EQ,		// `==`
+	TK_ASSIGN,	// `=`
+	TK_PLUS,	// `+`
+	TK_MINUS,	// `-`
+	TK_STAR,	// `*`
+	TK_SLASH,	// `/`
+	TK_LPAREN,	// `(`
+	TK_RPAREN,	// `)`
+	TK_LBRACE,	// `{`
+	TK_RBRACE,	// `}`
+	TK_SEMICOLON,	// `;`
+	TK_GT,		// `>`
+	TK_EOF,		// EOF
+	TK_UNKNOWN	// UNKNOWN
+} TokenKind;
+
+/* token structure: one lexical unit as returned by getNextToken */
+typedef struct {
+	TokenKind	kind;
+	char*		lexeme;		// textual representation (heap copy made with strdup)
+	int		intValue;	// numeric literal value (0 for non-number tokens)
+} Token;
+
+/* initialize the lexer from main's arguments; argv[1] is the source filename */
+void initLexer(int argc, char** argv);
+
+/* return next token from stream */
+Token getNextToken(void);
diff --git a/parser.c b/parser.c
new file mode 100644
index 0000000..1446792
--- /dev/null
+++ b/parser.c
@@ -0,0 +1,397 @@
+/* parser.c
+ * bareC parser and ast structure (c source)
+ */ 
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include "lexer.h"
+
+/* ast definitions */
+typedef enum {
+	AST_FUNCDEF,	// function def
+	AST_BLOCK,	// compound statement block (also used as the program root)
+	AST_RETURN,	// return statement
+	AST_IF,		// if statement
+	AST_WHILE,	// while loop
+	AST_EXPR_STMT,	// expression statement
+	AST_BINARY,	// binary expression
+	AST_NUM,	// numeric literal
+	AST_IDENT,	// identifier
+	AST_DECL	// variable declaration (optional initializer in `left`)
+} ASTKind;
+
+typedef struct ASTNode ASTNode;
+struct ASTNode {
+	ASTKind	kind;
+	ASTNode* left;	// for binary expr, condition of if, etc.
+	ASTNode* right;	// second child
+	ASTNode* third;	// for if: else
+	ASTNode** blockStatements; // compound blocks
+	int blockCount;	// num statements in block
+	char* funcName;	// for function definitions
+	ASTNode* funcBody; // body block
+	int intValue;	// numeric literals
+	char* identName; // identifiers and operator string in binary nodes
+};
+
+/* utilities for creating new ast nodes */
+static ASTNode* newASTNode(ASTKind kind) {
+	ASTNode* node = (ASTNode*)calloc(1, sizeof(ASTNode));
+	node->kind = kind;
+	return node;
+}
+
+static ASTNode* newBinaryNode(ASTNode* lhs, ASTNode* rhs, const char* op) {
+	ASTNode* node = newASTNode(AST_BINARY);
+	node->left = lhs;
+	node->right = rhs;
+	node->identName = strdup(op);
+	return node;
+}
+
+static ASTNode* newNumNode(int value) {
+	ASTNode* node = newASTNode(AST_NUM);
+	node->intValue = value;
+	return node;
+}
+
+static ASTNode* newIdentNode(const char* name) {
+	ASTNode* node = newASTNode(AST_IDENT);
+	node->identName = strdup(name);
+	return node;
+}
+
+/* parsing state and utility */
+static Token g_currentToken;
+
+/* advance to the next token */
+static void nextToken(void) {
+	g_currentToken = getNextToken();
+}
+
+/* if match, consume */
+static bool match(TokenKind kind) {
+	if (g_currentToken.kind == kind) {
+		nextToken();
+		return true;
+	}
+	return false;
+}
+
+/* expect: require the current token to be `kind` and consume it; exit(1) otherwise */
+static void expect(TokenKind kind) {
+	if (g_currentToken.kind != kind) {
+		fprintf(stderr, "Parse err: expected kind %d, got %d\n",
+				kind, g_currentToken.kind);
+		exit(1);
+	}
+	/* kinds agree: step past the token */
+	nextToken();
+}
+
+/* forward decl. */
+static ASTNode* parseExpression(void);
+static ASTNode* parseEquality(void);
+static ASTNode* parseAdditive(void);
+static ASTNode* parseTerm(void);
+static ASTNode* parseFactor(void);
+static ASTNode* parseDeclaration(void);
+static ASTNode* parseStatement(void);
+static ASTNode* parseCompoundStatement(void);
+static ASTNode* parseFunctionDefinition(void);
+static ASTNode* parseProgram(void);
+
+/* parsing */
+
+/* program := (function-definition)* EOF */
+
+static ASTNode* parseProgram(void) {
+	/* returns an AST_BLOCK root holding one AST_FUNCDEF per function, in source order */
+	ASTNode* root = newASTNode(AST_BLOCK);
+	root->blockStatements = NULL; /* blockCount is already 0: newASTNode callocs */
+	while (g_currentToken.kind != TK_EOF) {
+		ASTNode* func = parseFunctionDefinition();
+		/* grow via a temporary so a failed realloc is reported, not dereferenced */
+		ASTNode** grown = (ASTNode**)realloc(root->blockStatements,
+			sizeof(ASTNode*) * (root->blockCount + 1));
+		if (!grown) { fprintf(stderr, "Err: out of memory\n"); exit(1); }
+		root->blockStatements = grown;
+		root->blockStatements[root->blockCount++] = func;
+	}
+	return root;
+}
+
+/* function-definition := "int" IDENT "(" ")" compound-statement */
+static ASTNode* parseFunctionDefinition(void) {
+	expect(TK_INT);
+	if (g_currentToken.kind != TK_IDENT) {
+		fprintf(stderr, "Parse error: expected identifier after 'int'\n");
+		exit(1);
+	}
+	/* Duplicate the function name string */
+	char* funcName = strdup(g_currentToken.lexeme);
+	nextToken();
+	expect(TK_LPAREN);
+	expect(TK_RPAREN);
+	ASTNode* body = parseCompoundStatement();
+	ASTNode* funcNode = newASTNode(AST_FUNCDEF);
+	funcNode->funcName = funcName;
+	funcNode->funcBody = body;
+	return funcNode;
+}
+
+/* compound-statement := "{" statement* "}" */
+static ASTNode* parseCompoundStatement(void) {
+	/* returns an AST_BLOCK holding the statements between the braces, in order */
+	ASTNode* blockNode = newASTNode(AST_BLOCK);
+	blockNode->blockStatements = NULL; /* blockCount is already 0: newASTNode callocs */
+	expect(TK_LBRACE);
+	while (g_currentToken.kind != TK_RBRACE) {
+		ASTNode* stmt = parseStatement();
+		/* grow via a temporary so a failed realloc is reported, not dereferenced */
+		ASTNode** grown = (ASTNode**)realloc(blockNode->blockStatements,
+			sizeof(ASTNode*) * (blockNode->blockCount + 1));
+		if (!grown) { fprintf(stderr, "Err: out of memory\n"); exit(1); }
+		blockNode->blockStatements = grown;
+		blockNode->blockStatements[blockNode->blockCount++] = stmt;
+	}
+	expect(TK_RBRACE);
+	return blockNode;
+}
+
+/* statement :=
+	   return-statement
+	 | if-statement
+	 | while-statement
+	 | compound-statement | declaration
+	 | expression-statement
+*/
+static ASTNode* parseStatement(void) {
+	if (match(TK_RETURN)) {
+		ASTNode* node = newASTNode(AST_RETURN);
+		if (g_currentToken.kind != TK_SEMICOLON)
+			node->left = parseExpression();
+		expect(TK_SEMICOLON);
+		return node;
+	} else if (match(TK_IF)) {
+		ASTNode* node = newASTNode(AST_IF);
+		expect(TK_LPAREN);
+		node->left = parseExpression();
+		expect(TK_RPAREN);
+		node->right = parseStatement();
+		if (match(TK_ELSE))
+			node->third = parseStatement();
+		return node;
+	} else if (match(TK_WHILE)) {
+		ASTNode* node = newASTNode(AST_WHILE);
+		expect(TK_LPAREN);
+		node->left = parseExpression();
+		expect(TK_RPAREN);
+		node->right = parseStatement();
+		return node;
+	} else if (g_currentToken.kind == TK_LBRACE) {
+		return parseCompoundStatement();
+	} else if (g_currentToken.kind == TK_INT) {
+		return parseDeclaration();
+	} else {
+		/* Expression-statement or empty statement */
+		ASTNode* node = newASTNode(AST_EXPR_STMT);
+		if (g_currentToken.kind != TK_SEMICOLON)
+			node->left = parseExpression();
+		expect(TK_SEMICOLON);
+		return node;
+	}
+}
+
+/* expression := equality */
+static ASTNode* parseExpression(void) {
+	return parseEquality();
+}
+
+/* equality := additive ( ("==" | ">") additive )* */
+static ASTNode* parseEquality(void) {
+	ASTNode* node = parseAdditive();
+	while (g_currentToken.kind == TK_EQ || g_currentToken.kind == TK_GT) {
+		TokenKind op = g_currentToken.kind;
+		nextToken();
+		ASTNode* rhs = parseAdditive();
+		if (op == TK_EQ)
+			node = newBinaryNode(node, rhs, "==");
+		else if (op == TK_GT) 
+			node = newBinaryNode(node, rhs, ">");
+	}
+	return node;
+}
+
+/* additive := term ( ("+" | "-") term )* */
+static ASTNode* parseAdditive(void) {
+	ASTNode* lhs = parseTerm();
+	for (;;) {
+		/* pick the operator text; any other token ends the chain */
+		const char* op;
+		if (match(TK_PLUS))
+			op = "+";
+		else if (match(TK_MINUS))
+			op = "-";
+		else
+			return lhs;
+		/* left-associative: the tree built so far becomes the left operand */
+		lhs = newBinaryNode(lhs, parseTerm(), op);
+	}
+}
+
+/* term := factor ( ("*" | "/") factor )* */
+static ASTNode* parseTerm(void) {
+	ASTNode* lhs = parseFactor();
+	for (;;) {
+		/* pick the operator text; any other token ends the chain */
+		const char* op;
+		if (match(TK_STAR))
+			op = "*";
+		else if (match(TK_SLASH))
+			op = "/";
+		else
+			return lhs;
+		/* left-associative: the tree built so far becomes the left operand */
+		lhs = newBinaryNode(lhs, parseFactor(), op);
+	}
+}
+
+/* factor := "(" expression ")" | IDENT | NUMBER */
+static ASTNode* parseFactor(void) {
+	if (match(TK_LPAREN)) {
+		ASTNode* node = parseExpression();
+		expect(TK_RPAREN);
+		return node;
+	}
+	if (g_currentToken.kind == TK_IDENT) {
+		ASTNode* node = newIdentNode(g_currentToken.lexeme);
+		nextToken();
+		return node;
+	}
+	if (g_currentToken.kind == TK_NUMBER) {
+		ASTNode* node = newNumNode(g_currentToken.intValue);
+		nextToken();
+		return node;
+	}
+	fprintf(stderr, "Parse err: unexpected token '%s' in factor\n", g_currentToken.lexeme);
+	exit(1);
+	return NULL; // unreachable, but silences compiler warnings.
+}
+
+/* declaration := "int" IDENT ( "=" expression )? ";" */
+static ASTNode* parseDeclaration(void) {
+	expect(TK_INT);
+	
+	if (g_currentToken.kind != TK_IDENT) {
+		fprintf(stderr, "Parse err: expected identifier in declaration, got '%s'\n", g_currentToken.lexeme);
+		exit(1);
+	}
+	char *varName = strdup(g_currentToken.lexeme);
+	nextToken();
+
+	ASTNode* decl = newASTNode(AST_DECL);
+	decl->identName = varName;
+
+	if (match(TK_ASSIGN)) { decl->left = parseExpression(); }
+
+	expect(TK_SEMICOLON);
+	return decl;
+}
+
+
+/* demonstration: ast printing*/
+static void printIndent(int indent) {
+	for (int i = 0; i < indent; i++)
+		printf("  ");
+}
+
+/* printAST: recursively print `node` and its children to stdout.
+ * indent is the nesting depth; each level is two spaces (see printIndent).
+ * A NULL node prints nothing, so optional children can be passed directly.
+ * Declarations print their name, followed by the initializer if present.
+ */
+static void printAST(ASTNode* node, int indent) {
+	if (!node) return;
+	printIndent(indent); /* every kind starts with its own indented header line */
+	switch (node->kind) {
+		case AST_FUNCDEF:
+			printf("FunctionDef: name=%s\n", node->funcName);
+			printAST(node->funcBody, indent + 1);
+			break;
+		case AST_BLOCK:
+			printf("{\n");
+			for (int i = 0; i < node->blockCount; i++)
+				printAST(node->blockStatements[i], indent + 1);
+			printIndent(indent);
+			printf("}\n");
+			break;
+		case AST_RETURN:
+			printf("Return\n");
+			if (node->left)
+				printAST(node->left, indent + 1);
+			break;
+		case AST_IF:
+			printf("If\n");
+			printIndent(indent + 1);
+			printf("Condition:\n");
+			printAST(node->left, indent + 2);
+			printIndent(indent + 1);
+			printf("Then:\n");
+			printAST(node->right, indent + 2);
+			if (node->third) {
+				printIndent(indent + 1);
+				printf("Else:\n");
+				printAST(node->third, indent + 2);
+			}
+			break;
+		case AST_WHILE:
+			printf("While\n");
+			printIndent(indent + 1);
+			printf("Condition:\n");
+			printAST(node->left, indent + 2);
+			printIndent(indent + 1);
+			printf("Body:\n");
+			printAST(node->right, indent + 2);
+			break;
+		case AST_EXPR_STMT:
+			printf("ExprStmt\n");
+			if (node->left)
+				printAST(node->left, indent + 1);
+			break;
+		case AST_BINARY:
+			printf("BinaryOp (%s)\n", node->identName);
+			printAST(node->left, indent + 1);
+			printAST(node->right, indent + 1);
+			break;
+		case AST_NUM:
+			printf("Number (%d)\n", node->intValue);
+			break;
+		case AST_IDENT:
+			printf("Identifier (%s)\n", node->identName);
+			break;
+		case AST_DECL:
+			printf("Declaration (%s)\n", node->identName);
+			printAST(node->left, indent + 1); /* optional initializer */
+			break;
+		default:
+			printf("Unknown AST node kind\n");
+			break;
+	}
+}
+
+int main(int argc, char** argv) {
+	/* initialize the lexer with the source filename */
+	initLexer(argc, argv);
+	nextToken();  // prime the token stream
+
+	ASTNode* root = parseProgram();
+
+	printf("=== AST DUMP ===\n");
+	printAST(root, 0);
+
+	return 0;
+}
diff --git a/tests/test01.bc b/tests/test01.bc
new file mode 100644
index 0000000..e72a8a0
--- /dev/null
+++ b/tests/test01.bc
@@ -0,0 +1,4 @@
+int main() {
+	return 42;
+}
+
diff --git a/tests/test02.bc b/tests/test02.bc
new file mode 100644
index 0000000..addd491
--- /dev/null
+++ b/tests/test02.bc
@@ -0,0 +1,3 @@
+int main() {
+	return 1 + 2 * (3 - 4) / 5;
+}
diff --git a/tests/test03.bc b/tests/test03.bc
new file mode 100644
index 0000000..63deb0c
--- /dev/null
+++ b/tests/test03.bc
@@ -0,0 +1,7 @@
+int main() {
+	if (1) {
+		return 10;
+	} else {
+		return 20;
+	}
+}
diff --git a/tests/test04.bc b/tests/test04.bc
new file mode 100644
index 0000000..ee360bb
--- /dev/null
+++ b/tests/test04.bc
@@ -0,0 +1,7 @@
+int main() {
+	int x = 5;
+	while (x > 0) {
+		x = x - 1;
+	}
+	return x;
+}
diff --git a/tests/test05.bc b/tests/test05.bc
new file mode 100644
index 0000000..51a295b
--- /dev/null
+++ b/tests/test05.bc
@@ -0,0 +1,7 @@
+int square(int x) {
+	return x * x;
+}
+
+int main() {
+	return square(5);
+}