1 files changed, 210 insertions, 0 deletions
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..04aada4
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,210 @@
+#include "ccc.h"
+#include "lexer.h"
+#include <string.h>
+#include <stdckdint.h>
+
+/* Lexer state shared by every function in this file. */
+
+/* Source file being lexed; NULL until lexer_load() opens one. */
+static FILE* file = NULL;
+/* The next character of the stream that has been read but not yet consumed
+ * (an int so it can hold EOF). Primed by lexer_load(), advanced by
+ * consume_char(). */
+static int lookahead;
+/* Line counter: reset to 1 by lexer_load() and incremented by lexer_pop()
+ * for each '\n' it skips between tokens. */
+static long LINE;
+
+/*
+ * Points the lexer at the file named by `path`.
+ *
+ * Any file that was loaded previously is closed first. The line counter is
+ * reset to 1 and the first character of the new file is read into
+ * `lookahead` so that consume_char() has something to hand out.
+ *
+ * If the file cannot be opened, CCC_PANIC is invoked.
+ * NOTE(review): CCC_PANIC is defined elsewhere; presumably it does not
+ * return, since fgetc() would otherwise run on a NULL stream -- confirm.
+ */
+void lexer_load(const char* path) {
+    if (file) fclose(file);
+
+    file = fopen(path, "r");
+    if (!file) CCC_PANIC;
+
+    LINE = 1;
+    lookahead = fgetc(file);
+}
+
+/*
+ * Reads the next token into *p_token without advancing the lexer.
+ *
+ * Implemented by snapshotting the lexer state (stream offset, lookahead
+ * character and line counter), running lexer_pop(), and then rewinding to
+ * the snapshot. The line counter must be part of the snapshot: lexer_pop()
+ * bumps LINE for every newline it skips, and those same newlines are
+ * skipped again when the token is popped for real.
+ *
+ * Returns what lexer_pop() returned, or false when no file is loaded.
+ * A failing ftell()/fseek() (e.g. a non-seekable stream) triggers CCC_PANIC,
+ * because continuing would leave the stream and `lookahead` out of sync.
+ *
+ * NOTE(review): an IDENTIFIER token carries a strndup()'d string, so each
+ * peek of an identifier allocates a fresh copy; who frees it is not visible
+ * in this file -- confirm with callers.
+ */
+bool lexer_peek(struct token* p_token) {
+    if (file == NULL) return false;
+
+    /* snapshot */
+    long orig_offset = ftell(file);
+    if (orig_offset < 0) CCC_PANIC;
+    int orig_lookahead = lookahead;
+    long orig_line = LINE;
+
+    bool rv = lexer_pop(p_token);
+
+    /* rewind; fseek() also clears the EOF indicator if the pop reached it */
+    lookahead = orig_lookahead;
+    LINE = orig_line;
+    if (fseek(file, orig_offset, SEEK_SET) != 0) CCC_PANIC;
+    return rv;
+}
+
+/*
+ * Character-class predicates used by the lexer.
+ *
+ * Every argument is parenthesized so the macros stay correct when handed an
+ * expression rather than a plain variable. They still evaluate the argument
+ * more than once, so do not pass anything with side effects (e.g.
+ * consume_char()). All of them are false for EOF.
+ */
+
+/* Horizontal/vertical whitespace, including '\r' so that CRLF sources lex. */
+#define is_whitespace(c) \
+    ((c) == ' ' || (c) == '\t' || (c) == '\n' \
+        || (c) == '\r' || (c) == '\v' || (c) == '\f')
+#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z')
+#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
+#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
+#define is_numeric(c) ('0' <= (c) && (c) <= '9')
+#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c))
+/* Characters allowed inside an identifier ('$' is accepted as well). */
+#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')
+
+/*
+ * Moves the stream position back by one byte.
+ * NOTE(review): this only calls fseek(); it does not touch `lookahead`, so
+ * using it on its own would desynchronize the two. Currently unused in this
+ * file.
+ */
+#define REFUND_CHAR fseek(file, -1, SEEK_CUR)
+
+/*
+ * Hands out the current lookahead character and refills `lookahead` with
+ * the next character of the file (EOF once the stream is exhausted).
+ */
+static int consume_char(void) {
+    const int current = lookahead;
+    lookahead = fgetc(file);
+    return current;
+}
+
+/*
+ * Lexes an identifier whose first character `ic` has already been consumed.
+ *
+ * Keeps consuming while the lookahead is a legal identifier character and
+ * stores the result in *p_token as an IDENTIFIER token. The identifier text
+ * is a heap copy made with strndup(); it is never freed in this file, so
+ * ownership passes to whoever receives the token.
+ *
+ * Errors:
+ *   - CCC_ERROR when the identifier does not fit in the local buffer
+ *     (at most sizeof(buf) - 1 characters).
+ *   - CCC_PANIC when the copy cannot be allocated.
+ * NOTE(review): the bounds check assumes CCC_ERROR does not return; if it
+ * can return, the store after it must be skipped -- confirm.
+ */
+static void lex_ident(struct token* p_token, char ic) {
+    char buf[1024] = {ic};
+    size_t len = 1;
+
+    while (is_ident_legal(lookahead)) {
+        int c = consume_char();
+        /* keep one byte free for the terminating NUL */
+        if (len >= sizeof(buf) - 1)
+            CCC_ERROR("identifier exceeds maximum size (%zu)",
+                sizeof(buf) - 1);
+        buf[len++] = c;
+    }
+    buf[len] = 0;
+
+    /* the length is already known, so copy exactly that many bytes */
+    char* name = strndup(buf, len);
+    if (name == NULL) CCC_PANIC;
+
+    *p_token = (struct token) {
+        .type = IDENTIFIER,
+        .data.identifier = name,
+    };
+}
+
+/*
+ * Placeholder for floating point literal lexing.
+ *
+ * Callers pass the destination token, the numeric base and the integer part
+ * lexed so far (`iv`); none of them is used yet -- the function only
+ * reports that float literals are unsupported.
+ */
+static void lex_float_lit(struct token* p_token, unsigned char base, double iv) {
+    /* parameters are unused until float lexing is implemented */
+    (void) p_token;
+    (void) base;
+    (void) iv;
+    CCC_ERROR("lexer: floating point literals are not supported yet");
+}
+
+/*
+ * Lexes an integer literal whose first digit has already been consumed;
+ * `iv` is the value of that digit.
+ *
+ * Base selection: a leading 0 followed by x/X selects base 16, followed by
+ * b/B selects base 2, and followed by anything else selects base 8. Any
+ * other leading digit means base 10.
+ *
+ * All following alphanumeric characters are treated as digits; one that is
+ * not valid in the selected base is an error, as is a value that overflows
+ * intlit_t. A '.' after the digits hands over to lex_float_lit(). Otherwise
+ * *p_token becomes an INT_LIT token holding the value.
+ */
+static void lex_int_lit(struct token* p_token, intlit_t iv) {
+    /* TODO: exponentiation, 2e10 f.e. */
+    unsigned char base = 10;
+
+    /* iv == 0 means the first digit was '0', i.e. a possible base prefix */
+    if (iv == 0) {
+        switch (lookahead) {
+            case 'x': case 'X':
+            case 'b': case 'B': {
+                int suffix = consume_char();
+                base = (suffix == 'x' || suffix == 'X') ? 16 : 2;
+                /* the prefix must be followed by at least one digit */
+                if (!is_alphanumeric(lookahead))
+                    CCC_ERROR(
+                        "lexer: invalid suffix on integer constant: %c", suffix);
+                break;
+            }
+            default:
+                base = 8;
+                break;
+        }
+    }
+
+    for (;;) {
+        if (!is_alphanumeric(lookahead)) break;
+
+        int digit_char = consume_char();
+        intlit_t digit;
+
+        /* letters count from 10 upwards, regardless of case */
+        if (is_upper_alpha(digit_char)) digit = digit_char - 'A' + 10;
+        else if (is_lower_alpha(digit_char)) digit = digit_char - 'a' + 10;
+        else digit = digit_char - '0';
+
+        if (digit >= base)
+            CCC_ERROR(
+                "lexer: invalid digit in base %hhu: %c",
+                base,
+                digit_char);
+
+        /* iv = iv * base + digit, with both steps overflow-checked */
+        if (ckd_mul(&iv, iv, base))
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+        if (ckd_add(&iv, iv, digit))
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+    }
+
+    if (lookahead != '.') {
+        *p_token = (struct token) {
+            .type = INT_LIT,
+            .data.int_lit = iv,
+        };
+        return;
+    }
+
+    /* digits followed by '.': continue as a floating point literal */
+    consume_char();
+    lex_float_lit(p_token, base, iv);
+}
+
+/*
+ * Lexes a character literal whose opening quote has already been consumed
+ * and stores it in *p_token as a CHAR_LIT token.
+ *
+ * Accepted forms: a single ordinary character, or one of the escapes \' and
+ * \" followed by the closing quote. Errors are reported through CCC_ERROR
+ * for:
+ *   - EOF anywhere inside the literal,
+ *   - an empty literal ('') or a raw newline, neither of which is a valid
+ *     character constant in C,
+ *   - any other escape sequence (not supported yet),
+ *   - a missing closing quote.
+ */
+static void lex_char_lit(struct token* p_token) {
+    int c = consume_char();
+    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    /* an unescaped quote here is the closing quote of an empty literal */
+    if (c == '\'') CCC_ERROR("lexer: empty char literal");
+    if (c == '\n') CCC_ERROR("lexer: unexpected newline in char literal");
+
+    if (c == '\\') {
+        c = consume_char();
+        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+
+        /* \' and \" denote the quote itself, so c already holds the value */
+        if (c != '\'' && c != '\"')
+            CCC_ERROR(
+                "lexer: escape sequences other than quotes are not supported yet");
+    }
+
+    int close_quote = consume_char();
+    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (close_quote != '\'')
+        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+
+    *p_token = (struct token) {
+        .type = CHAR_LIT,
+        .data.char_lit = c,
+    };
+}
+
+/*
+ * Placeholder for string literal lexing (the opening '"' has already been
+ * consumed by the caller).
+ *
+ * Reports an error instead of returning silently: with an empty body the
+ * caller's token was left unwritten while lexer_pop() still reported
+ * success, and the string's contents were then lexed as ordinary tokens.
+ * This mirrors how lex_float_lit() handles its unsupported case.
+ */
+static void lex_str_lit(struct token* p_token) {
+    (void) p_token; /* unused until string lexing is implemented */
+    CCC_ERROR("lexer: string literals are not supported yet");
+}
+
+/*
+ * Maps a single-character punctuator to its token type.
+ *
+ * Any character that is not in the table is reported through CCC_ERROR as
+ * an unexpected token.
+ */
+enum token_type lex_simple(char c) {
+    static const struct {
+        char symbol;
+        enum token_type type;
+    } punctuators[] = {
+        {'*', STAR}, /* TODO: *= */
+        {'#', HASHTAG},
+        {'(', LPAREN},
+        {')', RPAREN},
+        {'{', LCURLY},
+        {'}', RCURLY},
+        {'[', LSQUARE},
+        {']', RSQUARE},
+        {':', COLON},
+        {';', SEMI},
+        {',', COMMA},
+        {'.', DOT},
+        {'?', QMARK},
+    };
+    const size_t count = sizeof(punctuators) / sizeof(punctuators[0]);
+
+    for (size_t i = 0; i < count; i++) {
+        if (punctuators[i].symbol == c) return punctuators[i].type;
+    }
+    CCC_ERROR("lexer: unexpected token %c", c);
+}
+
+/*
+ * Reads the next token from the loaded file into *p_token and advances the
+ * lexer past it.
+ *
+ * Whitespace, `//` line comments and block comments in front of the token
+ * are skipped. LINE is incremented for every newline skipped, including
+ * those inside block comments, so the counter stays correct after a
+ * multi-line comment.
+ *
+ * Returns true when a token was produced, false when no file is loaded or
+ * the end of the file was reached first. An unterminated block comment, or
+ * a character that starts no known token, is reported through CCC_ERROR.
+ */
+bool lexer_pop(struct token* p_token) {
+    /* TODO: e.g. float f = .25; */
+    if (file == NULL) return false;
+
+    // consume all whitespace and comments preceding the next token
+    int c;
+    for (;;) {
+        c = consume_char();
+        if (c == EOF) return false;
+        else if (c == '/' && lookahead == '/') {
+            // stop in front of the '\n' so the next iteration counts it
+            while (lookahead != EOF && lookahead != '\n') consume_char();
+        }
+        else if (c == '/' && lookahead == '*') {
+            consume_char(); /* consume the * */
+            // the character after the opening is fetched before the loop,
+            // so the '*' of the opening cannot double as the closing one
+            int in_comment = consume_char();
+            while (in_comment != EOF
+                    && (in_comment != '*' || lookahead != '/')) {
+                if (in_comment == '\n') LINE++;
+                in_comment = consume_char();
+            }
+            if (in_comment == EOF) CCC_ERROR("unterminated /* comment");
+            consume_char(); /* consume the final / */
+        }
+        else if (c == '\n') LINE++;
+        else if (!is_whitespace(c)) break;
+    }
+
+    // c is the first character of the token; dispatch on its class.
+    // Digits are tested before identifiers because is_ident_legal()
+    // accepts digits too.
+    if (is_numeric(c))
+        lex_int_lit(p_token, c - '0');
+    else if (c == '.' && is_numeric(lookahead))
+        lex_float_lit(p_token, 10, 0);
+    else if (is_ident_legal(c))
+        lex_ident(p_token, c);
+    else if (c == '\'')
+        lex_char_lit(p_token);
+    else if (c == '"')
+        lex_str_lit(p_token);
+    else
+        *p_token = (struct token) {.type = lex_simple(c)};
+
+    return true;
+}