From f6e034097db98507d731fac4642c2c063ffb8c58 Mon Sep 17 00:00:00 2001
From: Chewing_Bever
Date: Mon, 4 Mar 2024 14:50:00 +0100
Subject: [PATCH] feat(lexer): matching more things

---
 include/mrk/lexer.h               | 11 ++--
 src/_include/mrk/lexer_internal.h | 11 ++++
 src/lexer/lexer.c                 | 89 ++++++++++++++++++++++---------
 test/lexer/lexer.c                | 29 +++++++++-
 4 files changed, 111 insertions(+), 29 deletions(-)

diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h
index 10dca83..ae4baa3 100644
--- a/include/mrk/lexer.h
+++ b/include/mrk/lexer.h
@@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
 } mrk_lexer_err;
 
 typedef enum mrk_token_type {
-  mrk_token_type_header = 0,
+  mrk_token_type_pounds = 0,
+  mrk_token_type_backticks,
+  mrk_token_type_dashes,
+  mrk_token_type_underscores,
+  mrk_token_type_stars,
   mrk_token_type_blank_line,
-  mrk_token_type_star_star,
-  mrk_token_type_space_space,
+  mrk_token_type_space,
   mrk_token_type_line_break,
+  mrk_token_type_right_angle_bracket,
+  mrk_token_type_tab,
 } mrk_token_type;
 
 typedef struct mrk_token {
diff --git a/src/_include/mrk/lexer_internal.h b/src/_include/mrk/lexer_internal.h
index c07ff81..871238a 100644
--- a/src/_include/mrk/lexer_internal.h
+++ b/src/_include/mrk/lexer_internal.h
@@ -26,6 +26,12 @@ struct mrk_lexer {
  */
 char mrk_lexer_peek(mrk_lexer *lexer);
 
+/**
+ * Return the n'th next character that would be consumed, or '\0' when the
+ * lexer runs out of input before then. If `n` is zero, this matches peek.
+ */
+char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
+
 /**
  * Returns true if the nul-terminated string s is equal to the next characters
  * in the token stream.
@@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
  */
 char mrk_lexer_advance(mrk_lexer *lexer);
 
+/**
+ * Advance until the next element to peek is not equal to c.
+ */
+void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
+
 /**
  * Advance `n` positions; equivalent to running advance `n` times and returning
  * the last call's result.
diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c
index 5b30399..7d56aa9 100644
--- a/src/lexer/lexer.c
+++ b/src/lexer/lexer.c
@@ -1,3 +1,6 @@
+#include <stdbool.h>
+#include <string.h>
+
 #include "mrk/lexer.h"
 #include "mrk/lexer_internal.h"
 
@@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
 char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
   while (n > 1) {
     mrk_lexer_advance(lexer);
+    n--;
   }
 
   return mrk_lexer_advance(lexer);
 }
 
+void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
+  while (mrk_lexer_peek(lexer) == c) {
+    mrk_lexer_advance(lexer);
+  }
+}
+
 char mrk_lexer_peek(mrk_lexer *lexer) {
   if (mrk_lexer_done(lexer)) {
     return '\0';
@@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
   return lexer->buf.s[lexer->pos.buf_index];
 }
 
+char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
+  // Offsets 0..n inclusive must all be real input, else the result is '\0'
+  bool done_in_n = false;
+
+  for (size_t i = 0; i <= n && !done_in_n; i++) {
+    done_in_n =
+        (lexer->buf.len > 0 && lexer->pos.buf_index + i == lexer->buf.len) ||
+        (lexer->buf.s[lexer->pos.buf_index + i] == '\0');
+  }
+
+  return done_in_n ? '\0' : lexer->buf.s[lexer->pos.buf_index + n];
+}
+
+/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
+/*   size_t s_len = strlen(s); */
+/*   if (mrk_lexer_done(lexer) && s[0] != '\0') { */
+/*     return false; */
+/*   } */
+
+/* } */
+
 void mrk_lexer_reset(mrk_lexer *lexer) {
   lexer->token.start = lexer->pos.buf_index;
   lexer->token.end = lexer->pos.buf_index;
@@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
   mrk_lexer_reset(lexer);
 
   while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
-    switch (mrk_lexer_advance(lexer)) {
-    // Match one or more hashtags as a single header definition
+    char c = mrk_lexer_advance(lexer);
+    switch (c) {
+    // All these characters have multiple meanings depending on their location
+    // in the file and how many there are
     case '#':
-      while (mrk_lexer_peek(lexer) == '#') {
-        mrk_lexer_advance(lexer);
-      }
-
-      mrk_lexer_emit(out, lexer, mrk_token_type_header);
+      mrk_lexer_advance_eq(lexer, c);
+      mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
+      break;
+    case '`':
+      mrk_lexer_advance_eq(lexer, c);
+      mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
+      break;
+    case '-':
+      mrk_lexer_advance_eq(lexer, c);
+      mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
+      break;
+    case '_':
+      mrk_lexer_advance_eq(lexer, c);
+      mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
+      break;
+    case '*':
+      mrk_lexer_advance_eq(lexer, c);
+      mrk_lexer_emit(out, lexer, mrk_token_type_stars);
       break;
     // Two consecutive newlines constitute a blank line, otherwise they're
     // ignored as whitespace
@@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
       }
       break;
     case ' ': {
+      /* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
       // Either a double space or a line break
-      if (mrk_lexer_peek(lexer) == ' ') {
-        mrk_lexer_advance(lexer);
+      if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
+        mrk_lexer_advance_n(lexer, 2);
 
-        if (mrk_lexer_peek(lexer) == '\n') {
-          mrk_lexer_advance(lexer);
-          mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
-        } else {
-          mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
-        }
+        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
       } else {
-        mrk_lexer_reset(lexer);
+        mrk_lexer_emit(out, lexer, mrk_token_type_space);
       }
     } break;
-      /* case '*': */
-      /*   if (mrk_lexer_peek(lexer) == '*') { */
-      /*     mrk_lexer_advance(lexer); */
-      /*     mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
-      /*   } else { */
-      /*     // TODO match word */
-      /*   } */
-      /* default: */
-      /*   return mrk_lexer_err_unexpected_char; */
+    case '\t':
+      mrk_lexer_emit(out, lexer, mrk_token_type_tab);
+      break;
+    case '>':
+      mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
+      break;
     }
   }
 
diff --git a/test/lexer/lexer.c b/test/lexer/lexer.c
index 95d49a1..b46985d 100644
--- a/test/lexer/lexer.c
+++ b/test/lexer/lexer.c
@@ -14,14 +14,41 @@ void test_lexer_header() {
 
   mrk_token t;
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_header);
+  TEST_CHECK(t.type == mrk_token_type_pounds);
   TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
   TEST_CHECK(t.end == 4);
 
   mrk_lexer_free(lxr);
 }
 
+void test_lexer_line_break() {
+  LEXER_INIT();
+
+  const char *buf = "  \n";
+  mrk_lexer_open(lxr, buf, 0);
+
+  mrk_token t;
+  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
+  TEST_CHECK(t.type == mrk_token_type_line_break);
+
+  TEST_CHECK(mrk_lexer_done(lxr));
+
+  const char *buf2 = "  ";
+  mrk_lexer_open(lxr, buf2, 0);
+
+  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
+  TEST_CHECK(t.type == mrk_token_type_space);
+  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
+  TEST_CHECK(t.type == mrk_token_type_space);
+
+  TEST_CHECK(mrk_lexer_done(lxr));
+
+  mrk_lexer_free(lxr);
+
+}
+
 TEST_LIST = {
   { "lexer header", test_lexer_header },
+  { "lexer line break", test_lexer_line_break},
   { NULL, NULL }
 };