lexer.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208

#include "ccc.h"
#include "lexer.h"
#include <string.h>
#include <stdckdint.h>

/* Source file currently being lexed; NULL while no file has been loaded. */
static FILE* file = NULL;
/* One character of lookahead: the most recent result of fgetc(file),
 * which may be EOF. consume_char() hands it out and refills it. */
static int lookahead;
/* Line counter; lexer_load() resets it to 1 and lexer_pop() bumps it as it
 * skips newline characters. */
static long LINE;

/*
 * Points the lexer at the file named by `path`.
 *
 * Any file opened by an earlier call is closed first. The new file is opened
 * for reading; CCC_PANIC is invoked when it cannot be opened. On success the
 * first character is read into the lookahead slot and the line counter is
 * reset to 1.
 */
void lexer_load(const char* path) {
    /* Drop the previous input, if there was one. */
    if (file) fclose(file);

    file = fopen(path, "r");
    if (!file) CCC_PANIC;

    /* Prime the one-character lookahead and restart line counting. */
    lookahead = fgetc(file);
    LINE = 1;
}

bool lexer_peek(struct token* p_token) {
    if (file == NULL) return false;

    long orig_offset = ftell(file);
    int orig_lookahead = lookahead;
    bool rv = lexer_pop(p_token);
    lookahead = orig_lookahead;
    fseek(file, orig_offset, SEEK_SET);
    return rv;
}

/*
 * Character classification helpers for the lexer.
 *
 * Every use of the argument is parenthesized so that passing an expression
 * (e.g. `c | 0` or `flag ? a : b`) cannot be re-associated by operator
 * precedence inside the expansion. The argument is still evaluated more than
 * once, so do not pass expressions with side effects such as consume_char().
 */
#define is_whitespace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z')
#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
#define is_numeric(c) ('0' <= (c) && (c) <= '9')
#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c))
/* Identifier characters: letters, digits, underscore and '$'. */
#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')

/*
 * Advances the input by one character.
 *
 * Returns the character that was sitting in the lookahead slot (possibly EOF)
 * and refills the slot with the next character read from the file.
 */
static int consume_char() {
    const int current = lookahead;

    lookahead = fgetc(file);
    return current;
}

/*
 * Lexes an identifier whose first character `ic` has already been consumed.
 *
 * Keeps consuming while the lookahead is a legal identifier character, then
 * stores an IDENTIFIER token in *p_token whose name is a heap copy made with
 * strndup(). Identifiers that do not fit the fixed-size buffer (including its
 * NUL terminator) and allocation failures are reported through CCC_ERROR.
 */
static void lex_ident(struct token* p_token, char ic) {
    char buf[1024] = {ic};
    size_t len = 1;

    while (is_ident_legal(lookahead)) {
        int c = consume_char();
        if (len >= sizeof(buf) - 1) {
            /* sizeof yields a size_t, which must be printed with %zu. */
            CCC_ERROR("identifier exceeds maximum size (%zu)", sizeof(buf));
            /* Defensive: only reached if CCC_ERROR returns; never write
             * past the slot reserved for the terminator. */
            break;
        }
        buf[len++] = (char) c;
    }

    buf[len] = 0;

    char* name = strndup(buf, len);
    if (name == NULL)
        CCC_ERROR("lexer: out of memory while copying identifier");

    *p_token = (struct token) {
        .type = IDENTIFIER,
        .data.identifier = name,
    };
}

/*
 * Placeholder for floating point literal lexing.
 *
 * Takes the token to fill, the numeric base and the value lexed so far, but
 * currently does nothing with them: it only reports that floating point
 * literals are unsupported.
 */
static void lex_float_lit(
    struct token* p_token,
    unsigned char base,
    double iv
) {
    /* Nothing is produced yet, so the arguments are intentionally unused. */
    (void) p_token;
    (void) base;
    (void) iv;

    CCC_ERROR("lexer: floating point literals are not supported yet");
}

/*
 * Lexes the remainder of an integer literal.
 *
 * `iv` is the starting value; when it is 0 the literal is treated as having a
 * leading zero, which selects the base: "0x"/"0X" is hexadecimal, "0b"/"0B"
 * is binary, anything else is octal. Otherwise the base is 10.
 *
 * Digits are accumulated with checked arithmetic; a digit outside the base or
 * an overflow is reported through CCC_ERROR. A '.' right after the digits
 * hands over to lex_float_lit(); otherwise an INT_LIT token is stored in
 * *p_token.
 */
static void lex_int_lit(struct token* p_token, intlit_t iv) {
    unsigned char base = 10;

    /* TODO: exponentiation, 2e10 f.e. */
    if (iv == 0) {
        switch (lookahead) {
            case 'x':
            case 'X':
                base = 16;
                break;
            case 'b':
            case 'B':
                base = 2;
                break;
            default:
                base = 8;
                break;
        }

        if (base != 8) {
            /* Swallow the base marker; at least one digit must follow it. */
            int suffix = consume_char();
            if (!is_alphanumeric(lookahead))
                CCC_ERROR(
                    "lexer: invalid suffix on integer constant: %c", suffix);
        }
    }

    for (;;) {
        if (!is_alphanumeric(lookahead)) break;

        int digit_char = consume_char();

        /* Map '0'-'9', 'a'-'z' and 'A'-'Z' onto 0..35. */
        intlit_t digit;
        if (is_numeric(digit_char)) {
            digit = digit_char - '0';
        } else if (is_lower_alpha(digit_char)) {
            digit = digit_char - 'a' + 10;
        } else {
            digit = digit_char - 'A' + 10;
        }

        if (digit >= base)
            CCC_ERROR(
                "lexer: invalid digit in base %hhu: %c",
                base,
                digit_char);

        /* iv = iv * base + digit, with each step checked for overflow. */
        if (ckd_mul(&iv, iv, base))
            CCC_ERROR(
                "lexer: integer literal will overflow");
        if (ckd_add(&iv, iv, digit))
            CCC_ERROR(
                "lexer: integer literal will overflow");
    }

    if (lookahead != '.') {
        *p_token = (struct token) {
            .type = INT_LIT,
            .data.int_lit = iv,
        };
        return;
    }

    /* A decimal point turns this into a floating point literal. */
    consume_char();
    lex_float_lit(p_token, base, iv);
}

/*
 * Lexes a character literal whose opening quote has already been consumed.
 *
 * Accepts a single character, or one of the escapes \' and \", followed by
 * the closing single quote, and stores a CHAR_LIT token in *p_token.
 * Everything else is reported through CCC_ERROR: EOF inside the literal, an
 * empty literal (''), a raw newline, any other escape sequence, and a
 * missing closing quote.
 */
static void lex_char_lit(struct token* p_token) {
    int c = consume_char();
    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
    /* An unescaped quote here means the literal is empty; accepting it would
     * make ''' lex as a quote character. */
    if (c == '\'') CCC_ERROR("lexer: empty char literal");
    /* A character literal may not contain a raw newline. */
    if (c == '\n') CCC_ERROR("lexer: unexpected newline in char literal");

    if (c == '\\') {
        c = consume_char();
        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");

        /* \' and \" stand for the quote character itself, so c already
         * holds the literal's value. */
        if (c != '\'' && c != '\"')
            CCC_ERROR(
                "lexer: escape sequences other than quotes are not supported yet");
    }

    int close_quote = consume_char();
    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
    if (close_quote != '\'')
        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);

    *p_token = (struct token) {
        .type = CHAR_LIT,
        .data.char_lit = c,
    };
}

/*
 * Placeholder for string literal lexing (opening '"' already consumed).
 *
 * String literals are not implemented. Rather than returning with *p_token
 * unwritten -- which the caller would then treat as a valid token -- this
 * reports the missing feature, in the same way lex_float_lit() does.
 */
static void lex_str_lit(struct token* p_token) {
    (void) p_token; /* nothing is produced yet */
    CCC_ERROR("lexer: string literals are not supported yet");
}

enum token_type lex_simple(char c) {
    switch (c) {
        case '*': return STAR; /* TODO: *= */
        case '#': return HASHTAG;
        case '(': return LPAREN;
        case ')': return RPAREN;
        case '{': return LCURLY;
        case '}': return RCURLY;
        case '[': return LSQUARE;
        case ']': return RSQUARE;
        case ':': return COLON;
        case ';': return SEMI;
        case ',': return COMMA;
        case '.': return DOT;
        case '?': return QMARK;
    }
    CCC_ERROR("lexer: unexpected token %c", c);
}

/*
 * Consumes the next token from the loaded file and stores it in *p_token.
 *
 * Leading whitespace, `//` line comments and block comments are skipped
 * first, counting every newline passed (including those inside block
 * comments) in LINE. The first significant character then selects the
 * sub-lexer: integer literal, floating point literal, identifier, character
 * literal, string literal, or a single-character token via lex_simple().
 *
 * Returns false when no file is loaded or the input ends before a token
 * starts, true otherwise. An unterminated block comment is reported through
 * CCC_ERROR.
 */
bool lexer_pop(struct token* p_token) {
    /* TODO: e.g. float f = .25; */
    if (file == NULL) return false;

    // consume all whitespace and comments preceding the next token
    int c;
    for (;;) {
        c = consume_char();
        if (c == EOF) return false;
        else if (c == '/' && lookahead == '/') {
            // stop in front of the newline so the next iteration counts it
            while (lookahead != EOF && lookahead != '\n') consume_char();
        }
        else if (c == '/' && lookahead == '*') {
            consume_char(); /* consume the * */
            int in_comment = consume_char();
            while (in_comment != EOF
                    && (in_comment != '*' || lookahead != '/')) {
                /* keep the line counter accurate across multi-line comments */
                if (in_comment == '\n') LINE++;
                in_comment = consume_char();
            }
            if (in_comment == EOF) CCC_ERROR("unterminated /* comment");
            consume_char(); /* consume the final / */
        }
        else if (c == '\n') LINE++;
        else if (!is_whitespace(c)) break;
    }

    // c now holds the first character of the token; pick the sub-lexer
    if (is_numeric(c))
        lex_int_lit(p_token, c - '0');
    else if (c == '.' && is_numeric(lookahead))
        lex_float_lit(p_token, 10, 0);
    else if (is_ident_legal(c))
        lex_ident(p_token, c);
    else if (c == '\'')
        lex_char_lit(p_token);
    else if (c == '"')
        lex_str_lit(p_token);
    else
        *p_token = (struct token) {.type = lex_simple(c)};

    return true;
}