summaryrefslogtreecommitdiff
path: root/lexer.c
diff options
context:
space:
mode:
Diffstat (limited to 'lexer.c')
-rw-r--r-- lexer.c 210
1 files changed, 210 insertions, 0 deletions
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..04aada4
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,210 @@
+#include "ccc.h"
+#include "lexer.h"
+#include <string.h>
+#include <stdckdint.h>
+
+/* Currently loaded source stream; NULL until lexer_load() opens one. */
+static FILE* file = NULL;
+/* One-character lookahead: the character most recently read from `file`
+ * that has not been consumed yet (EOF once the stream is exhausted). */
+static int lookahead;
+/* Line counter: set to 1 by lexer_load() and incremented by lexer_pop()
+ * when it consumes a '\n'. */
+static long LINE;
+
+/*
+ * Point the lexer at the source file `path`.
+ *
+ * Any stream left over from an earlier call is closed first. The new file
+ * is opened in text mode for reading; CCC_PANIC is invoked when it cannot
+ * be opened. On success the line counter is reset to 1 and the first
+ * character of the file is loaded into the lookahead.
+ */
+void lexer_load(const char* path) {
+    /* Drop the previous stream, if any, before opening the new one. */
+    if (file != NULL)
+        fclose(file);
+
+    file = fopen(path, "r");
+    if (file == NULL) CCC_PANIC;
+
+    /* Start counting lines afresh and prime the one-character lookahead. */
+    LINE = 1;
+    lookahead = fgetc(file);
+}
+
+/*
+ * Read the next token into *p_token without advancing the lexer.
+ *
+ * The token is produced by lexer_pop(); afterwards the complete lexer state
+ * (stream offset, lookahead character and line counter) is rolled back to
+ * what it was on entry, so the next lexer_pop() yields the same token.
+ *
+ * Returns what lexer_pop() returned, or false when no file is loaded.
+ * CCC_PANIC is invoked when the stream position cannot be saved or
+ * restored, because a failed rewind would silently turn the peek into a
+ * consume.
+ *
+ * NOTE(review): an IDENTIFIER token carries a string that lex_ident()
+ * allocates anew on each call, so peeking and then popping the same
+ * identifier yields two separate allocations -- confirm who frees them.
+ */
+bool lexer_peek(struct token* p_token) {
+    if (file == NULL) return false;
+
+    long orig_offset = ftell(file);
+    if (orig_offset < 0) CCC_PANIC;
+    int orig_lookahead = lookahead;
+    /* lexer_pop() bumps LINE for every newline it skips; remember the
+     * current value so that those lines are not counted twice. */
+    long orig_line = LINE;
+
+    bool rv = lexer_pop(p_token);
+
+    LINE = orig_line;
+    lookahead = orig_lookahead;
+    if (fseek(file, orig_offset, SEEK_SET) != 0) CCC_PANIC;
+    return rv;
+}
+
+/*
+ * Character-class predicates used by the lexer.
+ *
+ * Every argument is parenthesized so that compound expressions classify
+ * correctly. Each macro may still evaluate its argument more than once, so
+ * only pass expressions without side effects.
+ *
+ * is_whitespace() covers the C white-space characters other than space,
+ * tab and newline as well ('\r', '\v', '\f'), so that e.g. CRLF line
+ * endings are skipped instead of being reported as unexpected tokens.
+ */
+#define is_whitespace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r' || (c) == '\v' || (c) == '\f')
+#define is_lower_alpha(c) ('a' <= (c) && (c) <= 'z')
+#define is_upper_alpha(c) ('A' <= (c) && (c) <= 'Z')
+#define is_alpha(c) (is_lower_alpha(c) || is_upper_alpha(c))
+#define is_numeric(c) ('0' <= (c) && (c) <= '9')
+#define is_alphanumeric(c) (is_alpha(c) || is_numeric(c))
+#define is_ident_legal(c) (is_alphanumeric(c) || (c) == '_' || (c) == '$')
+
+/* Moves the stream position of `file` back by one byte. It does not touch
+ * `lookahead`, so the caller has to keep the two in sync itself. */
+#define REFUND_CHAR fseek(file, -1, SEEK_CUR)
+
+/*
+ * Hand back the pending lookahead character and refill the lookahead with
+ * the next character read from `file` (EOF once the stream is exhausted).
+ */
+static int consume_char() {
+    const int consumed = lookahead;
+    lookahead = fgetc(file);
+    return consumed;
+}
+
+/*
+ * Lex an identifier whose first character `ic` has already been consumed.
+ *
+ * Keeps consuming characters while the lookahead is legal in an identifier
+ * (alphanumeric, '_' or '$') and stores the result in *p_token as an
+ * IDENTIFIER token. The identifier text is a heap copy made with strndup().
+ *
+ * Reports an error through CCC_ERROR when the identifier does not fit the
+ * internal buffer, and invokes CCC_PANIC when the copy cannot be allocated.
+ */
+static void lex_ident(struct token* p_token, char ic) {
+    char buf[1024] = {ic};
+    size_t len = 1;
+
+    while (is_ident_legal(lookahead)) {
+        int c = consume_char();
+        /* Keep one byte free for the terminating NUL. */
+        if (len >= sizeof(buf) - 1)
+            CCC_ERROR("identifier exceeds maximum size (%zu)", sizeof(buf));
+        buf[len++] = (char) c;
+    }
+    buf[len] = '\0';
+
+    /* Copy exactly the characters collected instead of bounding the copy
+     * by the size of the whole buffer. */
+    char* name = strndup(buf, len);
+    if (name == NULL) CCC_PANIC;
+
+    *p_token = (struct token) {
+        .type = IDENTIFIER,
+        .data.identifier = name,
+    };
+}
+
+/*
+ * Placeholder for floating point literals: none of the arguments is used
+ * yet, the function only reports that such literals are unsupported.
+ */
+static void lex_float_lit(struct token* p_token, unsigned char base, double iv) {
+    (void) p_token;
+    (void) base;
+    (void) iv;
+    CCC_ERROR("lexer: floating point literals are not supported yet");
+}
+
+/*
+ * Lex an integer literal whose first digit has already been consumed; `iv`
+ * is the numeric value of that digit.
+ *
+ * A leading 0 selects the base: "0x"/"0X" is hexadecimal, "0b"/"0B" is
+ * binary, any other continuation is octal. Everything else is decimal.
+ * Digits are accumulated with overflow-checked arithmetic. A '.' directly
+ * after the digits hands the literal over to lex_float_lit(); otherwise
+ * *p_token becomes an INT_LIT token holding the value.
+ *
+ * Errors (missing digits after a base prefix, a digit that is not valid in
+ * the base, overflow) are reported through CCC_ERROR.
+ */
+static void lex_int_lit(struct token* p_token, intlit_t iv) {
+    unsigned char base = 10;
+
+    /* TODO: exponentiation, 2e10 f.e. */
+    if (iv == 0) {
+        const bool hex_prefix = lookahead == 'x' || lookahead == 'X';
+        const bool bin_prefix = lookahead == 'b' || lookahead == 'B';
+
+        if (!hex_prefix && !bin_prefix) {
+            base = 8;
+        } else {
+            base = hex_prefix ? 16 : 2;
+            int suffix = consume_char();
+            /* The prefix has to be followed by at least one digit. */
+            if (!is_alphanumeric(lookahead)) {
+                CCC_ERROR(
+                    "lexer: invalid suffix on integer constant: %c", suffix);
+            }
+        }
+    }
+
+    while (is_alphanumeric(lookahead)) {
+        int digit_char = consume_char();
+        intlit_t digit;
+
+        /* Letters count as digits 10 and up, regardless of case. */
+        if (is_upper_alpha(digit_char)) {
+            digit = digit_char - 'A' + 10;
+        } else if (is_lower_alpha(digit_char)) {
+            digit = digit_char - 'a' + 10;
+        } else {
+            digit = digit_char - '0';
+        }
+
+        if (digit >= base) {
+            CCC_ERROR(
+                "lexer: invalid digit in base %hhu: %c",
+                base,
+                digit_char);
+        }
+
+        /* iv = iv * base + digit, with both steps checked for overflow. */
+        if (ckd_mul(&iv, iv, base) || ckd_add(&iv, iv, digit)) {
+            CCC_ERROR(
+                "lexer: integer literal will overflow");
+        }
+    }
+
+    if (lookahead != '.') {
+        *p_token = (struct token) {
+            .type = INT_LIT,
+            .data.int_lit = iv,
+        };
+        return;
+    }
+
+    /* A decimal point turns the literal into a floating point one. */
+    consume_char();
+    lex_float_lit(p_token, base, iv);
+}
+
+/*
+ * Lex a character literal whose opening quote has already been consumed
+ * and store it in *p_token as a CHAR_LIT token.
+ *
+ * Accepted forms are a single ordinary character, or one of the escape
+ * sequences \' and \" (other escapes are not supported yet), followed by
+ * the closing single quote.
+ *
+ * Reported through CCC_ERROR: end of file inside the literal, an empty
+ * literal (''), a raw newline inside the literal, an unsupported escape
+ * sequence and a missing closing quote.
+ */
+static void lex_char_lit(struct token* p_token) {
+    int c = consume_char();
+    if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    /* A quote right after the opening quote is the closing quote of an
+     * empty literal, not the literal's value. */
+    if (c == '\'') CCC_ERROR("lexer: empty char literal");
+    if (c == '\n') CCC_ERROR("lexer: unexpected newline in char literal");
+
+    if (c == '\\') {
+        c = consume_char();
+        if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+
+        /* \' and \" stand for the quote character itself, so `c` already
+         * holds the value of the literal. */
+        if (c != '\'' && c != '\"')
+            CCC_ERROR(
+                "lexer: escape sequences other than quotes are not supported yet");
+    }
+
+    int close_quote = consume_char();
+    if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+    if (close_quote != '\'')
+        CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+
+    *p_token = (struct token) {
+        .type = CHAR_LIT,
+        .data.char_lit = c,
+    };
+}
+
+/*
+ * Placeholder for string literals (the opening '"' has already been
+ * consumed by the caller).
+ *
+ * Reports the missing feature through CCC_ERROR instead of returning
+ * silently: with an empty body lexer_pop() would report success for a
+ * token that was never written.
+ */
+static void lex_str_lit(struct token* p_token) {
+    (void) p_token;
+    CCC_ERROR("lexer: string literals are not supported yet");
+}
+
+/*
+ * Map a single punctuation character to its token type.
+ *
+ * Any character that is not in the table is reported through CCC_ERROR as
+ * an unexpected token.
+ */
+enum token_type lex_simple(char c) {
+    static const struct {
+        char symbol;
+        enum token_type type;
+    } punctuators[] = {
+        {'*', STAR}, /* TODO: *= */
+        {'#', HASHTAG},
+        {'(', LPAREN},
+        {')', RPAREN},
+        {'{', LCURLY},
+        {'}', RCURLY},
+        {'[', LSQUARE},
+        {']', RSQUARE},
+        {':', COLON},
+        {';', SEMI},
+        {',', COMMA},
+        {'.', DOT},
+        {'?', QMARK},
+    };
+
+    for (size_t i = 0; i < sizeof punctuators / sizeof punctuators[0]; i++) {
+        if (punctuators[i].symbol == c)
+            return punctuators[i].type;
+    }
+    CCC_ERROR("lexer: unexpected token %c", c);
+}
+
+/*
+ * Consume the next token from the loaded file and store it in *p_token.
+ *
+ * Whitespace, "//" line comments and block comments in front of the token
+ * are skipped; every newline consumed on the way, including those inside
+ * block comments, advances the line counter. The first significant
+ * character then selects the specialised lexer: integer literal,
+ * floating point literal, identifier, char literal, string literal or
+ * single-character punctuation.
+ *
+ * Returns true when a token was produced, false when no file is loaded or
+ * the end of the file was reached first. An unterminated block comment is
+ * reported through CCC_ERROR.
+ */
+bool lexer_pop(struct token* p_token) {
+    /* TODO: e.g. float f = .25; */
+    if (file == NULL) return false;
+
+    /* Skip all whitespace and comments preceding the next token. */
+    int c;
+    for (;;) {
+        c = consume_char();
+        if (c == EOF) return false;
+        else if (c == '/' && lookahead == '/') {
+            /* Line comment: stop in front of the '\n' so that the next
+             * iteration consumes it and counts the line. */
+            while (lookahead != EOF && lookahead != '\n') consume_char();
+        }
+        else if (c == '/' && lookahead == '*') {
+            consume_char(); /* consume the * */
+            int cc = consume_char();
+            while (cc != EOF && (cc != '*' || lookahead != '/')) {
+                /* Keep the line counter accurate across multi-line
+                 * comments. */
+                if (cc == '\n') LINE++;
+                cc = consume_char();
+            }
+            if (cc == EOF) CCC_ERROR("unterminated /* comment");
+            consume_char(); /* consume the final / */
+        }
+        else if (c == '\n') LINE++;
+        else if (!is_whitespace(c)) break;
+    }
+
+    /* Dispatch on the first character of the token. */
+    if (is_numeric(c))
+        lex_int_lit(p_token, c - '0');
+    else if (c == '.' && is_numeric(lookahead))
+        lex_float_lit(p_token, 10, 0);
+    else if (is_ident_legal(c))
+        lex_ident(p_token, c);
+    else if (c == '\'')
+        lex_char_lit(p_token);
+    else if (c == '"')
+        lex_str_lit(p_token);
+    else
+        *p_token = (struct token) {.type = lex_simple(c)};
+
+    return true;
+}