diff options
Diffstat (limited to 'lexer.c')
| -rw-r--r-- | lexer.c | 210 |
1 files changed, 210 insertions, 0 deletions
#include "ccc.h"
#include "lexer.h"
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdckdint.h>

/* Source stream being tokenized; NULL until lexer_load() has opened a file. */
static FILE *file = NULL;
/* Next unread character of `file` (or EOF); refilled by consume_char(). */
static int lookahead;
/* Line counter: set to 1 by lexer_load(), bumped for each newline skipped
 * by lexer_pop(). It is only written, never read, within this file. */
static long LINE;

/**
 * Open `path` for lexing, closing any previously loaded file first.
 *
 * Primes `lookahead` with the first character and resets the line counter.
 * Invokes CCC_PANIC when the file cannot be opened.
 */
void lexer_load(const char *path) {
    if (file != NULL) {
        fclose(file);
    }
    file = fopen(path, "r");
    if (file == NULL) {
        CCC_PANIC;
    }

    lookahead = fgetc(file);
    LINE = 1;
}

/**
 * Read the next token into `*p_token` without advancing the lexer.
 *
 * Implemented as "pop, then rewind": the stream offset, the lookahead
 * character and the line counter are saved before calling lexer_pop() and
 * restored afterwards, so a following lexer_pop() yields the same token.
 *
 * @return false when no file is loaded or no token is left, true otherwise.
 *
 * NOTE(review): an IDENTIFIER token carries a freshly strndup()'d string on
 * every call, so peek + pop allocates twice; who frees it is not visible
 * from this file.
 */
bool lexer_peek(struct token *p_token) {
    if (file == NULL) {
        return false;
    }

    long saved_offset = ftell(file);
    if (saved_offset < 0) {
        /* Cannot rewind later (e.g. non-seekable stream). */
        CCC_PANIC;
    }
    int saved_lookahead = lookahead;
    long saved_line = LINE;

    bool rv = lexer_pop(p_token);

    lookahead = saved_lookahead;
    /* lexer_pop() counts the newlines it skips; undo that as well, otherwise
     * the subsequent real pop would count them a second time. */
    LINE = saved_line;
    if (fseek(file, saved_offset, SEEK_SET) != 0) {
        CCC_PANIC;
    }
    return rv;
}

/*
 * Character classification helpers. They take `int` so that EOF can be
 * passed safely (it matches none of the classes).
 */

/* Blank characters skipped between tokens, including '\r' so that sources
 * with CRLF line endings lex cleanly. */
static inline bool is_whitespace(int c) {
    return c == ' ' || c == '\t' || c == '\n'
        || c == '\r' || c == '\v' || c == '\f';
}

static inline bool is_lower_alpha(int c) {
    return 'a' <= c && c <= 'z';
}

static inline bool is_upper_alpha(int c) {
    return 'A' <= c && c <= 'Z';
}

static inline bool is_alpha(int c) {
    return is_lower_alpha(c) || is_upper_alpha(c);
}

static inline bool is_numeric(int c) {
    return '0' <= c && c <= '9';
}

static inline bool is_alphanumeric(int c) {
    return is_alpha(c) || is_numeric(c);
}

/* Characters allowed inside an identifier ('$' is accepted as well). */
static inline bool is_ident_legal(int c) {
    return is_alphanumeric(c) || c == '_' || c == '$';
}

/* NOTE(review): currently unused in this file. It only moves the stream
 * position back; it does not touch `lookahead`. */
#define REFUND_CHAR fseek(file, -1, SEEK_CUR)

/* Return the current lookahead character and fetch the next one from `file`. */
static int consume_char(void) {
    int rv = lookahead;
    lookahead = fgetc(file);
    return rv;
}

/**
 * Lex an identifier whose first character `ic` has already been consumed.
 *
 * Consumes every following identifier character and stores a heap-allocated
 * copy of the name in `p_token->data.identifier`.
 * Reports an error when the name does not fit the internal buffer.
 */
static void lex_ident(struct token *p_token, char ic) {
    char buf[1024] = {ic};
    size_t len = 1;

    while (is_ident_legal(lookahead)) {
        int c = consume_char();
        /* Keep one byte free for the terminating NUL. */
        if (len >= sizeof(buf) - 1) {
            CCC_ERROR("identifier exceeds maximum size (%zu)",
                      sizeof(buf) - 1);
        }
        buf[len++] = (char)c;
    }
    buf[len] = '\0';

    char *name = strndup(buf, len);
    if (name == NULL) {
        CCC_PANIC;
    }

    *p_token = (struct token) {
        .type = IDENTIFIER,
        .data.identifier = name,
    };
}

/**
 * Placeholder for floating point literals: always reports an error.
 *
 * `base` and `iv` are meant to carry the radix and the integer part that was
 * already lexed; they are unused until floats are implemented.
 */
static void lex_float_lit(
    struct token *p_token,
    unsigned char base,
    double iv
) {
    (void)p_token;
    (void)base;
    (void)iv;
    CCC_ERROR("lexer: floating point literals are not supported yet");
}

/**
 * Lex an integer literal whose first digit has already been consumed.
 *
 * `iv` is the value of that first digit. A leading 0 selects the radix:
 * "0x"/"0X" is hexadecimal, "0b"/"0B" is binary, anything else is octal.
 * Every following alphanumeric character must be a valid digit of the radix;
 * overflow of intlit_t is detected with ckd_mul()/ckd_add().
 * A '.' after the digits hands over to lex_float_lit().
 */
static void lex_int_lit(struct token *p_token, intlit_t iv) {
    unsigned char base = 10;

    /* TODO: exponentiation, 2e10 f.e. */
    if (iv == 0) {
        bool is_hex = (lookahead == 'x' || lookahead == 'X');
        bool is_bin = (lookahead == 'b' || lookahead == 'B');
        if (is_hex || is_bin) {
            base = is_hex ? 16 : 2;
            int suffix = consume_char();
            /* A radix prefix must be followed by at least one digit. */
            if (!is_alphanumeric(lookahead)) {
                CCC_ERROR(
                    "lexer: invalid suffix on integer constant: %c", suffix);
            }
        } else {
            base = 8;
        }
    }

    while (is_alphanumeric(lookahead)) {
        int c = consume_char();
        intlit_t c_val;

        if (is_numeric(c)) {
            c_val = c - '0';
        } else if (is_lower_alpha(c)) {
            c_val = c - 'a' + 10;
        } else {
            c_val = c - 'A' + 10;
        }

        if (c_val >= base) {
            CCC_ERROR(
                "lexer: invalid digit in base %hhu: %c",
                base,
                c);
        }

        if (ckd_mul(&iv, iv, base)) {
            CCC_ERROR(
                "lexer: integer literal will overflow");
        }
        if (ckd_add(&iv, iv, c_val)) {
            CCC_ERROR(
                "lexer: integer literal will overflow");
        }
    }

    if (lookahead == '.') {
        consume_char();
        /* A leading 0 does not make a floating constant octal ("0.5" is
         * decimal), so only hex/binary/decimal radices are forwarded. */
        lex_float_lit(p_token, base == 8 ? 10 : base, iv);
        return;
    }

    *p_token = (struct token) {
        .type = INT_LIT,
        .data.int_lit = iv,
    };
}

/**
 * Lex a character literal; the opening quote has already been consumed.
 *
 * Only the escapes \' and \" are supported so far. Reports an error on EOF,
 * on an empty literal (''), on any other escape, and when the closing quote
 * is missing.
 */
static void lex_char_lit(struct token *p_token) {
    int c = consume_char();
    if (c == EOF) {
        CCC_ERROR("lexer: unexpected EOF in char literal");
    }
    /* An unescaped quote right after the opening quote is '' - not a
     * literal whose value is the quote character. */
    if (c == '\'') {
        CCC_ERROR("lexer: empty char literal");
    }

    if (c == '\\') {
        c = consume_char();
        if (c == EOF) {
            CCC_ERROR("lexer: unexpected EOF in char literal");
        }

        /* \' and \" stand for the quote character itself, so `c` already
         * holds the right value; everything else is rejected. */
        if (c != '\'' && c != '\"') {
            CCC_ERROR(
                "lexer: escape sequences other than quotes are not supported yet");
        }
    }

    int close_quote = consume_char();
    if (close_quote == EOF) {
        CCC_ERROR("lexer: unexpected EOF in char literal");
    }
    if (close_quote != '\'') {
        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
    }

    *p_token = (struct token) {
        .type = CHAR_LIT,
        .data.char_lit = c,
    };
}

/**
 * Placeholder for string literals: always reports an error.
 *
 * Without this the caller would receive `true` from lexer_pop() together
 * with a token that was never written.
 */
static void lex_str_lit(struct token *p_token) {
    (void)p_token;
    CCC_ERROR("lexer: string literals are not supported yet");
}

/**
 * Map a single punctuation character to its token type.
 *
 * Reports an error for any character that is not a known one-character
 * token.
 * NOTE(review): presumably CCC_ERROR does not return; if it does, control
 * falls off the end of this non-void function - confirm.
 */
enum token_type lex_simple(char c) {
    switch (c) {
    case '*': return STAR; /* TODO: *= */
    case '#': return HASHTAG;
    case '(': return LPAREN;
    case ')': return RPAREN;
    case '{': return LCURLY;
    case '}': return RCURLY;
    case '[': return LSQUARE;
    case ']': return RSQUARE;
    case ':': return COLON;
    case ';': return SEMI;
    case ',': return COMMA;
    case '.': return DOT;
    case '?': return QMARK;
    default: break;
    }
    CCC_ERROR("lexer: unexpected token %c", c);
}

/**
 * Read the next token into `*p_token` and advance past it.
 *
 * Skips whitespace, `//` line comments and block comments first, counting
 * every newline in `LINE`, then dispatches on the first significant
 * character.
 *
 * @return false when no file is loaded or the input is exhausted,
 *         true when a token was produced.
 */
bool lexer_pop(struct token *p_token) {
    /* TODO: e.g. float f = .25; */
    if (file == NULL) {
        return false;
    }

    /* Consume all whitespace and comments preceding the next token. */
    int c;
    for (;;) {
        c = consume_char();
        if (c == EOF) {
            return false;
        } else if (c == '/' && lookahead == '/') {
            /* Stop before the newline so the next iteration counts it. */
            while (lookahead != EOF && lookahead != '\n') {
                consume_char();
            }
        } else if (c == '/' && lookahead == '*') {
            consume_char(); /* consume the * */
            int in_comment = consume_char();
            while (in_comment != EOF
                   && (in_comment != '*' || lookahead != '/')) {
                /* Keep the line counter correct across multi-line comments. */
                if (in_comment == '\n') {
                    LINE++;
                }
                in_comment = consume_char();
            }
            if (in_comment == EOF) {
                CCC_ERROR("unterminated /* comment");
            }
            consume_char(); /* consume the final / */
        } else if (c == '\n') {
            LINE++;
        } else if (!is_whitespace(c)) {
            break;
        }
    }

    if (is_numeric(c)) {
        lex_int_lit(p_token, c - '0');
    } else if (c == '.' && is_numeric(lookahead)) {
        lex_float_lit(p_token, 10, 0);
    } else if (is_ident_legal(c)) {
        /* Digits were handled above, so identifiers never start with one. */
        lex_ident(p_token, c);
    } else if (c == '\'') {
        lex_char_lit(p_token);
    } else if (c == '"') {
        lex_str_lit(p_token);
    } else {
        *p_token = (struct token) {.type = lex_simple(c)};
    }

    return true;
}
