summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- ccc.h 7
-rw-r--r-- lexer.c 176
-rw-r--r-- lexer.h 9
-rw-r--r-- main.c 39
5 files changed, 186 insertions, 46 deletions
diff --git a/.gitignore b/.gitignore
index 4c0c7e0..85a7886 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+ccc
*.o
*.out
build/**
diff --git a/ccc.h b/ccc.h
index 6b41480..36f80f9 100644
--- a/ccc.h
+++ b/ccc.h
@@ -1,12 +1,7 @@
#ifndef CCC_H
#define CCC_H
-#include <stdio.h>
-#include <stdlib.h>
#define CCC_PANIC { perror("ccc"); exit(1); }
-#define CCC_ERROR(format, ...) {\
- fprintf(stderr, "line %ld: " format "\n", LINE __VA_OPT__(,) __VA_ARGS__);\
- exit(1);\
-}
+
#endif
diff --git a/lexer.c b/lexer.c
index 7e2f5a4..a4ffd89 100644
--- a/lexer.c
+++ b/lexer.c
@@ -1,11 +1,23 @@
#include "ccc.h"
#include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
#include <string.h>
#include <stdckdint.h>
+/*
+ * Report a fatal lexer error on stderr, prefixed with the current LINE and
+ * COL, then terminate the process with exit status 1.
+ *
+ * `format` must be a string literal (it is pasted between two other
+ * literals); any further arguments are forwarded to fprintf.
+ *
+ * Wrapped in do { ... } while (0) so that `LEXER_PANIC(...);` expands to
+ * exactly one statement and stays safe inside an unbraced if/else.
+ */
+#define LEXER_PANIC(format, ...) do {\
+    fprintf(\
+        stderr,\
+        "ccc: lexer error: line %lu, column %lu: " format "\n",\
+        LINE,\
+        COL __VA_OPT__(,)\
+        __VA_ARGS__);\
+    exit(1);\
+} while (0)
+
static FILE* file = NULL;
static int lookahead;
-static long LINE;
+static unsigned long LINE, COL;
void lexer_load(const char* path) {
if (file != NULL) {
@@ -16,6 +28,13 @@ void lexer_load(const char* path) {
lookahead = fgetc(file);
LINE = 1;
+ COL = 1;
+}
+
+/*
+ * Close the currently loaded source file, if there is one, and reset `file`
+ * to NULL so that a later lexer_pop() reports no more tokens.
+ */
+void lexer_close() {
+    if (file != NULL) {
+        fclose(file);
+        file = NULL;
+    }
}
bool lexer_peek(struct token* p_token) {
@@ -40,6 +59,7 @@ bool lexer_peek(struct token* p_token) {
static int consume_char() {
int rv = lookahead;
lookahead = fgetc(file);
+ COL++;
return rv;
}
@@ -50,7 +70,8 @@ static void lex_ident(struct token* p_token, char ic) {
while (is_ident_legal(lookahead)) {
int c = consume_char();
if (len >= sizeof(buf) - 1)
- CCC_ERROR("identifier exceeds maximum size (%ld)", sizeof(buf));
+ LEXER_PANIC(
+ "identifier exceeds maximum size (%ld)", sizeof(buf) - 1);
buf[len++] = c;
}
@@ -66,10 +87,10 @@ static void lex_float_lit(
unsigned char base,
double iv
) {
- CCC_ERROR("lexer: floating point literals are not supported yet");
+ LEXER_PANIC("floating point literals are not implemented");
}
-static void lex_int_lit(struct token* p_token, intlit_t iv) {
+static void lex_int_lit(struct token* p_token, int_lit_t iv) {
unsigned char base = 10;
/* TODO: exponentiation, 2e10 f.e. */
@@ -79,31 +100,25 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
base = (lookahead == 'x' || lookahead == 'X') ? 16 : 2;
int suffix = consume_char();
if (!is_alphanumeric(lookahead))
- CCC_ERROR(
- "lexer: invalid suffix on integer constant: %c", suffix);
+ LEXER_PANIC("invalid suffix on integer constant: %c", suffix);
} else base = 8;
}
while (is_alphanumeric(lookahead)) {
int c = consume_char();
- intlit_t c_val;
+ int_lit_t c_val;
if (is_numeric(c)) c_val = c - '0';
else if (is_lower_alpha(c)) c_val = c - 'a' + 10;
else c_val = c - 'A' + 10;
if (c_val >= base)
- CCC_ERROR(
- "lexer: invalid digit in base %hhu: %c",
- base,
- c);
+ LEXER_PANIC("invalid digit in base %hhu: %c", base, c);
if (ckd_mul(&iv, iv, base))
- CCC_ERROR(
- "lexer: integer literal will overflow");
+ LEXER_PANIC("integer literal will overflow");
if (ckd_add(&iv, iv, c_val))
- CCC_ERROR(
- "lexer: integer literal will overflow");
+ LEXER_PANIC("integer literal will overflow");
}
if (lookahead == '.') {
@@ -118,24 +133,31 @@ static void lex_int_lit(struct token* p_token, intlit_t iv) {
};
}
+/*
+ * Translate the character that follows a backslash into the character the
+ * escape sequence denotes. Supported: \' \" \\ \r \n \t.
+ * Any other character aborts via LEXER_PANIC.
+ */
+static char replace_escape_sequence(char c) {
+    switch (c) {
+    case '\'': return '\'';
+    case '\"': return '\"';
+    case '\\': return '\\';
+    case 'r': return '\r';
+    case 'n': return '\n';
+    case 't': return '\t';
+    default: LEXER_PANIC("escape sequence not implemented");
+    }
+}
+
static void lex_char_lit(struct token* p_token) {
int c = consume_char();
- if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+ if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
if (c == '\\') {
c = consume_char();
- if (c == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
-
- if (c == '\'') c = '\'';
- else if (c == '\"') c = '\"';
- else CCC_ERROR(
- "lexer: escape sequences other than quotes are not supported yet");
+ if (c == EOF) LEXER_PANIC("unexpected EOF in char literal");
+ c = replace_escape_sequence(c);
}
int close_quote = consume_char();
- if (close_quote == EOF) CCC_ERROR("lexer: unexpected EOF in char literal");
+ if (close_quote == EOF) LEXER_PANIC("unexpected EOF in char literal");
if (close_quote != '\'')
- CCC_ERROR("lexer: expected \"'\", not \"%c\"", close_quote);
+ LEXER_PANIC(
+ "expected end of char literal, not \"%c\"", close_quote);
*p_token = (struct token) {
.type = CHAR_LIT,
@@ -144,17 +166,82 @@ static void lex_char_lit(struct token* p_token) {
}
+/*
+ * Lex the remainder of a string literal; the caller has already consumed the
+ * opening double quote. Characters are collected up to the closing quote,
+ * with backslash escapes translated by replace_escape_sequence().
+ *
+ * On success *p_token becomes a STR_LIT token whose data.str_lit is a
+ * heap-allocated, NUL-terminated copy of the contents (the caller frees it).
+ * EOF or a raw newline before the closing quote, or a literal longer than
+ * the internal buffer, aborts via LEXER_PANIC; allocation failure aborts via
+ * CCC_PANIC.
+ */
static void lex_str_lit(struct token* p_token) {
- /* TODO: impl */
+    char buf[65536];
+    size_t len = 0;
+
+    /* An empty literal simply hits the closing quote on the first pass. */
+    for (;;) {
+        int c = consume_char();
+        if (c == '"') break;
+        if (c == EOF || c == '\n') LEXER_PANIC("unterminated string literal");
+
+        if (c == '\\') {
+            c = consume_char();
+            if (c == EOF) LEXER_PANIC("unterminated string literal");
+            c = replace_escape_sequence(c);
+        }
+
+        /* sizeof yields size_t, so the matching conversion is %zu */
+        if (len >= sizeof(buf) - 1)
+            LEXER_PANIC(
+                "string literal exceeds maximum length (%zu)",
+                sizeof(buf) - 1);
+        buf[len++] = (char) c;
+    }
+
+    /* Copy exactly the characters collected, not the whole buffer. */
+    char* str = strndup(buf, len);
+    if (str == NULL) CCC_PANIC;
+
+    *p_token = (struct token) {
+        .type = STR_LIT,
+        .data.str_lit = str,
+    };
+}
+
+/*
+ * Classify the pair (c, lookahead) as a two-character operator.
+ * `c` is the character already consumed; `lookahead` is only inspected, never
+ * consumed here. Returns NOT_FOUND when the pair is not a known operator.
+ */
+static enum token_type two_char_operator_type(char c) {
+    const int next = lookahead;
+
+    switch (c) {
+    case '!':
+        if (next == '=') return NEQ;
+        break;
+    case '^':
+        if (next == '=') return XEQ;
+        break;
+    case '&':
+        if (next == '=') return AND_EQ;
+        if (next == '&') return LOG_AND;
+        break;
+    case '*':
+        if (next == '=') return MUL_EQ;
+        break;
+    case '-':
+        if (next == '=') return NEG_EQ;
+        if (next == '>') return ARROW;
+        break;
+    case '=':
+        if (next == '=') return TEST_EQ;
+        break;
+    case '+':
+        if (next == '=') return PLUS_EQ;
+        break;
+    case '|':
+        if (next == '|') return LOG_PIPE;
+        if (next == '=') return PIPE_EQ;
+        break;
+    case '/':
+        if (next == '=') return DIV_EQ;
+        break;
+    case '%':
+        if (next == '=') return MOD_EQ;
+        break;
+    case '<':
+        if (next == '=') return LEQ;
+        if (next == '<') return SHL;
+        break;
+    case '>':
+        if (next == '=') return GEQ;
+        if (next == '>') return SHR;
+        break;
+    default:
+        break;
+    }
+    return NOT_FOUND;
}
-static bool lex_complex_operator(enum token_type* p_token_type, char c) {
- /* TODO: impl 2 char operators */
- return false;
+/*
+ * Try to lex a multi-character operator that starts with the already
+ * consumed character `c`.
+ *
+ * Returns false and leaves the input and *p_token untouched when `c` plus
+ * the lookahead do not form a two-character operator. Otherwise consumes the
+ * rest of the operator (including the trailing '=' of "<<=" / ">>="), stores
+ * a token of that type in *p_token and returns true.
+ */
+static bool lex_complex_operator(struct token* p_token, char c) {
+    enum token_type type = two_char_operator_type(c);
+    if (type == NOT_FOUND) return false;
+    consume_char(); /* second character of the operator */
+
+    /* "<<" and ">>" may extend to the three-character "<<=" and ">>=" */
+    if ((type == SHL || type == SHR) && lookahead == '=') {
+        consume_char();
+        type = (type == SHL) ? SHL_EQ : SHR_EQ;
+    }
+
+    *p_token = (struct token) {.type = type};
+    /* return an explicit bool rather than relying on NOT_FOUND being 0 */
+    return true;
}
static enum token_type lex_simple_operator(char c) {
switch (c) {
- case '*': return STAR;
case '#': return HASHTAG;
case '(': return LPAREN;
case ')': return RPAREN;
@@ -167,22 +254,32 @@ static enum token_type lex_simple_operator(char c) {
case ',': return COMMA;
case '.': return DOT;
case '?': return QMARK;
- /* TODO: fill in */
+ case '!': return NOT;
+ case '^': return XOR;
+ case '&': return AMP;
+ case '*': return STAR;
+ case '-': return NEG;
+ case '=': return ASSIGN;
+ case '+': return PLUS;
+ case '\\': return BSLASH;
+ case '|': return PIPE;
+ case '/': return DIV;
+ case '%': return MOD;
+ case '<': return LT;
+ case '>': return GT;
}
- CCC_ERROR("lexer: unexpected token %c", c);
+ LEXER_PANIC("unexpected token %c", c);
}
bool lexer_pop(struct token* p_token) {
- /* TODO: e.g. float f = .25; */
if (file == NULL) return false;
// consume all whitespace and comments preceding the next token
int c;
for (;;) {
c = consume_char();
- // one of these
if (c == EOF) return false;
- else if (c == '/' && lookahead == '/') {
+ else if (c == '/' && lookahead == '/') { // one of these
while (lookahead != EOF && lookahead != '\n') consume_char();
}
else if (c == '/' && lookahead == '*') {
@@ -190,10 +287,13 @@ bool lexer_pop(struct token* p_token) {
int c = consume_char();
while (c != EOF && (c != '*' || lookahead != '/'))
c = consume_char();
- if (c == EOF) CCC_ERROR("unterminated /* comment");
- consume_char(); /* consume the final / */
+ if (c == EOF) LEXER_PANIC("unterminated /* comment");
+ consume_char(); /* consume the final slash */
+ }
+ else if (c == '\n') {
+ LINE++;
+ COL = 1;
}
- else if (c == '\n') LINE++;
else if (!is_whitespace(c)) break;
}
@@ -207,7 +307,7 @@ bool lexer_pop(struct token* p_token) {
lex_char_lit(p_token);
else if (c == '"')
lex_str_lit(p_token);
- else if (!lex_complex_operator(&p_token->type, c))
+ else if (!lex_complex_operator(p_token, c))
p_token->type = lex_simple_operator(c);
return true;
diff --git a/lexer.h b/lexer.h
index 24fb22d..30848a8 100644
--- a/lexer.h
+++ b/lexer.h
@@ -2,8 +2,10 @@
#define LEXER_H
enum token_type {
+ NOT_FOUND,
IDENTIFIER,
INT_LIT,
+ FLOAT_LIT, // TODO
CHAR_LIT,
STR_LIT,
HASHTAG,
@@ -52,13 +54,15 @@ enum token_type {
SHL_EQ
};
-typedef unsigned long long intlit_t;
+typedef unsigned long long int_lit_t;
+typedef double float_lit_t;
struct token {
enum token_type type;
union {
char* identifier;
- intlit_t int_lit;
+ int_lit_t int_lit;
+ float_lit_t float_lit;
char char_lit;
char* str_lit;
void* unused;
@@ -66,6 +70,7 @@ struct token {
};
void lexer_load(const char* path);
+void lexer_close();
bool lexer_peek(struct token* p_token);
bool lexer_pop(struct token* p_token);
diff --git a/main.c b/main.c
index e69de29..d2a6ef5 100644
--- a/main.c
+++ b/main.c
@@ -0,0 +1,39 @@
+#include "lexer.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/*
+ * Lexer driver: tokenizes every file named on the command line and prints
+ * one line per token to stdout. Identifier and string payloads are freed
+ * here after printing.
+ *
+ * Returns 1 when no input file is given, 0 otherwise.
+ */
+int main(int argc, char** argv) {
+    if (argc < 2) {
+        fprintf(stderr, "ccc: no input files\n");
+        return 1;
+    }
+
+    struct token token;
+    for (int i = 1; i < argc; i++) {
+        lexer_load(argv[i]);
+        while (lexer_pop(&token)) {
+            switch (token.type) {
+            case IDENTIFIER:
+                printf("got identifier: %s\n", token.data.identifier);
+                free(token.data.identifier);
+                break;
+            case STR_LIT:
+                printf("got string: %s\n", token.data.str_lit);
+                free(token.data.str_lit);
+                break;
+            case INT_LIT:
+                /* int_lit_t is unsigned long long, hence %llu */
+                printf("got int: %llu\n", token.data.int_lit);
+                break;
+            case FLOAT_LIT:
+                printf("got float: %f\n", token.data.float_lit);
+                break;
+            case CHAR_LIT:
+                printf("got char: %c\n", token.data.char_lit);
+                break;
+            default:
+                printf("got simple token: %d\n", token.type);
+                break;
+            }
+        }
+        lexer_close();
+    }
+    return 0;
+}