5 files changed, 186 insertions, 46 deletions
diff --git a/.gitignore b/.gitignore
index 4c0c7e0..85a7886 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+ccc
 *.o
 *.out
 build/**
diff --git a/ccc.h b/ccc.h
index 6b41480..36f80f9 100644
--- a/ccc.h
+++ b/ccc.h
@@ -1,12 +1,7 @@
 #ifndef CCC_H
 #define CCC_H
-#include <stdio.h>
-#include <stdlib.h>
 
 #define CCC_PANIC { perror("ccc"); exit(1); }
-#define CCC_ERROR(format, ...) {\
-    fprintf(stderr, "line %ld: " format "\n", LINE __VA_OPT__(,) __VA_ARGS__);\
-    exit(1);\
-}
+
 
 #endif
diff --git a/lexer.c b/lexer.c
index 7e2f5a4..a4ffd89 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,11 +1,23 @@
 #include "ccc.h"
 #include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
 #include <string.h>
 #include <stdckdint.h>
 
+#define LEXER_PANIC(format, ...) do { /* print LINE/COL + message, exit(1) */\
+    fprintf(\
+        stderr,\
+        "ccc: lexer error: line %lu, column %lu: " format "\n",\
+        LINE,\
+        COL __VA_OPT__(,)\
+        __VA_ARGS__);\
+    exit(1);\
+} while (0)
+
 static FILE* file = NULL;
 static int lookahead;
-static long LINE;
+static unsigned long LINE, COL;
 
 void lexer_load(const char* path) {
     if (file != NULL) {
@@ -16,6 +28,13 @@ void lexer_load(const char* path) {
 
     lookahead = fgetc(file);
     LINE = 1;
+    COL = 1;
+}
+
+void lexer_close() {
+    /* Closes the current input, if any; does nothing when none is loaded. */
+    if (file != NULL) fclose(file);
+    file = NULL;
 }
 
 bool lexer_peek(struct token* p_token) {
@@ -40,6 +59,7 @@ bool lexer_peek(struct token* p_token) {
 static int consume_char() {
     int rv = lookahead;
     lookahead = fgetc(file);
+    COL++; /* one column per character read; lexer_pop resets it after '\n' */
     return rv;
 }
 
@@ -50,7 +70,8 @@ static void lex_ident(struct token* p_token, char ic) {
     while (is_ident_legal(lookahead)) {
         int c = consume_char();
         if (len >= sizeof(buf) - 1)
-            CCC_ERROR("identifier exceeds maximum size (%ld)", sizeof(buf));
+            LEXER_PANIC(
+                "identifier exceeds maximum size (%ld)", sizeof(buf) - 1);
         buf[len++] = c;
     }
 
@@ -66,10 +87,10 @@ static void lex_float_lit(
     unsigned char base,
     double iv
 ) {
-    CCC_ERROR("lexer: floating point literals are not supported yet");
+    LEXER_PANIC("floating point literals are not implemented");
 }
 
-static void lex_int_lit(struct token* p_token, intlit_t iv) {
+static void lex_int_lit(struct token* p_token, int_lit_t iv) {
     unsigned char base = 10;
 
     /* TODO: exponentiation, 2e10 f.e. */
@@ -79,31 +100,25 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
             base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
             int suffix = consume_char();
             if (!is_alphanumeric(lookahead))
-                CCC_ERROR(
-                    "lexer: invalid suffix on integer constant: %c", suffix);
+                LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
         } else base = 8;
     }
 
     while (is_alphanumeric(lookahead)) {
         int c = consume_char();
-        intlit_t c_val;
+        int_lit_t c_val;
 
         if (is_numeric(c)) c_val = c - '0';
         else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
         else c_val = c - 'A' + 10;
 
         if (c_val >= base)
-            CCC_ERROR(
-                "lexer: invalid digit in base %hhu: %c",
-                base,
-                c);
+            LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
 
         if (ckd_mul(&iv, iv, base))
-            CCC_ERROR(
-                "lexer: integer literal will overflow");
+            LEXER_PANIC("integer literal will overflow");
         if (ckd_add(&iv, iv, c_val))
-            CCC_ERROR(
-                "lexer: integer literal will overflow");
+            LEXER_PANIC("integer literal will overflow");
     }
 
     if (lookahead == '.') {
@@ -118,24 +133,31 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
     };
 }
 
+static char replace_escape_sequence(char c) {
+    switch (c) { /* c is the character that followed a backslash */
+        case '\'': case '\"': case '\\': return c; /* stand for themselves */
+        case 'r': return '\r';
+        case 'n': return '\n';
+        case 't': return '\t';
+    }
+    LEXER_PANIC("escape sequence not implemented");
+}
+
 static void lex_char_lit(struct token* p_token) {
     int c = consume_char();
-    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
 
     if (c == '\\') {
         c = consume_char();
-        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
-
-        if (c == '\'') c = '\'';
-        else if (c == '\"') c = '\"';
-        else CCC_ERROR(
-            "lexer: escape sequences other than quotes are not supported yet");
+        if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
+        c = replace_escape_sequence(c);
     }
 
     int close_quote = consume_char();
-    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal");
     if (close_quote != '\'')
-        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+        LEXER_PANIC(
+            "expected end of char literal, not \"%c\"", close_quote);
 
     *p_token = (struct token) {
         .type = CHAR_LIT,
@@ -144,17 +166,82 @@ static void lex_char_lit(struct token* p_token) {
 }
 
 static void lex_str_lit(struct token* p_token) {
-    /* TODO: impl */
+    /*
+     * Lexes the rest of a string literal; the caller has already consumed
+     * the opening '"'. Escapes are decoded via replace_escape_sequence().
+     * On return *p_token is a STR_LIT whose data.str_lit is heap-allocated
+     * and must be free()d by whoever ends up holding the token.
+     * Exits via LEXER_PANIC on EOF, a raw newline or an oversized literal.
+     */
+    char buf[65536];
+    size_t len = 0;
+    for (;;) {
+        int c = consume_char();
+        if (c == '"') break; /* closing quote; "" yields len == 0 */
+        if (c == EOF || c == '\n') LEXER_PANIC("unterminated string literal");
+        if (c == '\\') {
+            c = consume_char();
+            if (c == EOF) LEXER_PANIC("unterminated string literal");
+            c = replace_escape_sequence(c);
+        }
+
+        /* one byte of buf stays reserved, matching the reported maximum */
+        if (len >= sizeof(buf) - 1)
+            LEXER_PANIC(
+                "string literal exceeds maximum length (%zu)",
+                sizeof(buf) - 1);
+        buf[len++] = c;
+    }
+
+    /* strndup copies exactly len bytes and appends the terminating NUL */
+    char* str = strndup(buf, len);
+    if (str == NULL) CCC_PANIC;
+
+    *p_token = (struct token) {
+        .type = STR_LIT,
+        .data.str_lit = str,
+    };
+}
+
+static enum token_type two_char_operator_type(char c) {
+    /*
+     * Classifies the pair (c, lookahead) as a two-character operator, or
+     * returns NOT_FOUND. lookahead is only inspected, never consumed.
+     */
+    if (lookahead == '=') switch (c) {
+        case '!': return NEQ;      case '^': return XEQ;
+        case '&': return AND_EQ;   case '*': return MUL_EQ;
+        case '-': return NEG_EQ;   case '=': return TEST_EQ;
+        case '+': return PLUS_EQ;  case '|': return PIPE_EQ;
+        case '/': return DIV_EQ;   case '%': return MOD_EQ;
+        case '<': return LEQ;      case '>': return GEQ;
+    }
+    if (c == lookahead) switch (c) { /* doubled characters */
+        case '&': return LOG_AND;  case '|': return LOG_PIPE;
+        case '<': return SHL;      case '>': return SHR;
+    }
+    if (c == '-' && lookahead == '>') return ARROW;
+    return NOT_FOUND;
 }
 
-static bool lex_complex_operator(enum token_type* p_token_type, char c) {
-    /* TODO: impl 2 char operators */
-    return false;
+static bool lex_complex_operator(struct token* p_token, char c) {
+    /* Lexes a 2- or 3-char operator starting at c; false if c starts none. */
+    enum token_type type = two_char_operator_type(c);
+    if (type == NOT_FOUND) return false;
+    consume_char(); /* the second character, which is still in lookahead */
+
+    /* "<<" and ">>" become "<<=" / ">>=" when an '=' follows directly */
+    if ((type == SHL || type == SHR) && lookahead == '=') {
+        consume_char();
+        type = (type == SHL) ? SHL_EQ : SHR_EQ;
+    }
+
+    *p_token = (struct token) {.type = type};
+    return true; /* an operator was recognised and stored in *p_token */
 }
 
 static enum token_type lex_simple_operator(char c) {
     switch (c) {
-        case '*': return STAR;
         case '#': return HASHTAG;
         case '(': return LPAREN;
         case ')': return RPAREN;
@@ -167,22 +254,32 @@ static enum token_type lex_simple_operator(char c) {
         case ',': return COMMA;
         case '.': return DOT;
         case '?': return QMARK;
-        /* TODO: fill in */
+        case '!': return NOT;
+        case '^': return XOR;
+        case '&': return AMP;
+        case '*': return STAR;
+        case '-': return NEG;
+        case '=': return ASSIGN;
+        case '+': return PLUS;
+        case '\\': return BSLASH;
+        case '|': return PIPE;
+        case '/': return DIV;
+        case '%': return MOD;
+        case '<': return LT;
+        case '>': return GT;
     }
-    CCC_ERROR("lexer: unexpected token %c", c);
+    LEXER_PANIC("unexpected token %c", c);
 }
 
 bool lexer_pop(struct token* p_token) {
-    /* TODO: e.g. float f = .25; */
     if (file == NULL) return false;
 
     // consume all whitespace and comments preceding the next token
     int c;
     for (;;) {
         c = consume_char();
-        // one of these
         if (c == EOF) return false;
-        else if (c == '/' && lookahead == '/') {
+        else if (c == '/' && lookahead == '/') { // one of these
             while (lookahead != EOF && lookahead != '\n') consume_char();
         }
         else if (c == '/' && lookahead == '*') {
@@ -190,10 +287,13 @@ bool lexer_pop(struct token* p_token) {
             int c = consume_char();
             while (c != EOF && (c != '*' || lookahead != '/'))
                 c = consume_char();
-            if (c == EOF) CCC_ERROR("unterminated /* comment");
-            consume_char(); /* consume the final / */
+            if (c == EOF) LEXER_PANIC("unterminated /* comment");
+            consume_char(); /* consume the final slash */
+        }
+        else if (c == '\n') {
+            LINE++;
+            COL = 1;
         }
-        else if (c == '\n') LINE++;
         else if (!is_whitespace(c)) break;
     }
     
@@ -207,7 +307,7 @@ bool lexer_pop(struct token* p_token) {
         lex_char_lit(p_token);
     else if (c == '"')
         lex_str_lit(p_token);
-    else if (!lex_complex_operator(&p_token->type, c))
+    else if (!lex_complex_operator(p_token, c))
         p_token->type = lex_simple_operator(c);
 
     return true;
diff --git a/lexer.h b/lexer.h
index 24fb22d..30848a8 100644
--- a/lexer.h
+++ b/lexer.h
@@ -2,8 +2,10 @@
 #define LEXER_H
 
 enum token_type {
+    NOT_FOUND,
     IDENTIFIER,
     INT_LIT,
+    FLOAT_LIT, // TODO
     CHAR_LIT,
     STR_LIT,
     HASHTAG,
@@ -52,13 +54,15 @@ enum token_type {
     SHL_EQ
 };
 
-typedef unsigned long long intlit_t;
+typedef unsigned long long int_lit_t;
+typedef double float_lit_t;
 
 struct token {
     enum token_type type;
     union {
         char* identifier;
-        intlit_t int_lit;
+        int_lit_t int_lit;
+        float_lit_t float_lit;
         char char_lit;
         char* str_lit;
         void* unused;
@@ -66,6 +70,7 @@ struct token {
 };
 
 void lexer_load(const char* path);
+void lexer_close();
 bool lexer_peek(struct token* p_token);
 bool lexer_pop(struct token* p_token);
 
diff --git a/main.c b/main.c
index e69de29..d2a6ef5 100644
--- a/main.c
+++ b/main.c
@@ -0,0 +1,39 @@
+#include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "ccc: no input files\n");
+        return 1;
+    }
+    /* Lex every file named on the command line, printing each token. */
+    struct token token;
+    for (int i = 1; i < argc; i++) {
+        lexer_load(argv[i]);
+        while (lexer_pop(&token)) {
+            switch (token.type) {
+                case IDENTIFIER:
+                    printf("got identifier: %s\n", token.data.identifier);
+                    free(token.data.identifier);
+                    break;
+                case STR_LIT:
+                    printf("got string: %s\n", token.data.str_lit);
+                    free(token.data.str_lit); /* heap copy from the lexer */
+                    break;
+                case INT_LIT: /* int_lit_t is unsigned long long */
+                    printf("got int: %llu\n", token.data.int_lit);
+                    break;
+                case FLOAT_LIT:
+                    printf("got float: %lf\n", token.data.float_lit);
+                    break;
+                case CHAR_LIT:
+                    printf("got char: %c\n", token.data.char_lit);
+                    break;
+                default:
+                    printf("got simple token: %d\n", token.type);
+            }
+        }
+        lexer_close();
+    }
+}