properly parse function calls

author Thomas Bruce <tdb@tdbio.me>

Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)

committer Thomas Bruce <tdb@tdbio.me>

Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)
author Thomas Bruce <tdb@tdbio.me>
Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)
committer Thomas Bruce <tdb@tdbio.me>
Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)
diff --git a/docs/lexer-doc.txt b/docs/lexer-doc.txt

new file mode 100644 (file)

index 0000000..5040973
--- /dev/null
+++ b/docs/lexer-doc.txt
@@ -0,0 +1,57 @@
+bareC lexer documentation
+date: 20/02/2025
+=========================
+
+overview:
+---------
+the bareC lexer converts the raw source code into a stream of tokens that the
+parser can easily process. it reads the source file character-by-character,
+skips whitespace and comments (single-line only), and recognizes:
+       - keywords
+       - identifiers
+       - literals (integers)
+       - operators (+, -, /, *, ==, =, >)
+       - punctuation
+
+token types:
+------------
+the lexer recognizes the following token types:
+       - keywords: "int", "return", "if", "else", "while"
+       - identifiers: variable and function names
+       - literals: integer literals
+       - operators: "=", "==", ">", "+", "-", "/", "*"
+       - punctuation: "(", ")", "{", "}", ";", ","
+       - special: end-of-file (EOF) and unknown (TK_UNKNOWN)
+
+EBNF for lexical elements:
+--------------------------
+1. letters and digits:
+       <letter>        = "A" | "B" | ... | "Z" | "a" | "b" | ... | "z" ;
+       <digit>         = "0" | "1" | ... | "9" ;
+
+2. identifier:
+       <identifier>    = ( <letter> | "_" ) { <letter> | <digit> | "_" } ;
+
+3. integer literal:
+       <number> = <digit> { <digit> };
+
+TODO: add EBNF rules for keywords and operators
+
+whitespace and comments:
+------------------------
+       - whitespace (space, tab, newline) is skipped
+       - single-line comments start with "//" and terminate at line end
+       - any unrecognized character is returned as a TK_UNKNOWN token
+
+processing flow:
+----------------
+1. open the source file and prime the first character
+2. repeat:
+       - skip whitespace and comments
+       - check current character:
+               - if digit -> accumulate a number
+               - if letter or "_" -> accumulate an identifier, then check it against the keywords
+               - otherwise, try to match multi-character operators (e.g. "==")
+               - otherwise, directly return the single-character token
+3. continue until EOF
+
diff --git a/lexer.c b/lexer.c

index d25945424e3d501cd587f98a649d6e581219f08b..90982212f7c5be58ce6d76fe63d740c67e207c8d 100644 (file)
--- a/lexer.c
+++ b/lexer.c
@@ -87,6 +87,7 @@ Token getNextToken(void) {
         if (g_currentChar == '{') { advance(); return makeToken(TK_LBRACE, "{"); }
         if (g_currentChar == '}') { advance(); return makeToken(TK_RBRACE, "}"); }
         if (g_currentChar == ';') { advance(); return makeToken(TK_SEMICOLON, ";"); }
+       if (g_currentChar == ',') { advance(); return makeToken(TK_COMMA, ","); }
         if (g_currentChar == '>') { advance(); return makeToken(TK_GT, ">"); }
  
         /* handle numeric literal */
author	Thomas Bruce <tdb@tdbio.me>
	Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)
committer	Thomas Bruce <tdb@tdbio.me>
	Thu, 20 Feb 2025 05:56:24 +0000 (00:56 -0500)
docs/lexer-doc.txt	[new file with mode: 0644]	patch \| blob
lexer.c		patch \| blob \| history