From ec076a56a59701c7d36d831723298276efb5471f Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Thu, 7 Mar 2024 11:02:28 +0100 Subject: [PATCH] feat(lexer): match regular words --- Makefile | 2 +- include/mrk/lexer.h | 2 ++ src/_include/mrk/lexer_internal.h | 10 ++++++++++ src/lexer/lexer.c | 33 ++++++++++++++++++++++++++++++- src/lexer/util.c | 15 ++++++++++++++ test/lexer/lexer.c | 25 ++++++++++++++++++++--- 6 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 src/lexer/util.c diff --git a/Makefile b/Makefile index e52bb1f..b91c53a 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ TARGETS_TEST := $(BINS_TEST:%=test-%) TARGETS_MEM_TEST := $(BINS_TEST:%=test-mem-%) TARGETS_EXAMPLE := $(BINS_EXAMPLE:%=example-%) -_CFLAGS := $(addprefix -I,$(INC_DIRS)) $(CFLAGS) -Wall -Wextra +_CFLAGS := $(addprefix -I,$(INC_DIRS)) $(CFLAGS) .PHONY: all all: lib diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h index 01feae7..f824749 100644 --- a/include/mrk/lexer.h +++ b/include/mrk/lexer.h @@ -32,6 +32,8 @@ typedef enum mrk_token_type { mrk_token_type_left_paren, mrk_token_type_right_paren, mrk_token_type_backslash, + mrk_token_type_dotted_number, + mrk_token_type_word, } mrk_token_type; typedef struct mrk_token { diff --git a/src/_include/mrk/lexer_internal.h b/src/_include/mrk/lexer_internal.h index 871238a..dc5f85e 100644 --- a/src/_include/mrk/lexer_internal.h +++ b/src/_include/mrk/lexer_internal.h @@ -55,6 +55,11 @@ void mrk_lexer_advance_eq(mrk_lexer *lexer, char c); */ char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n); +/** + * Advance the lexer as long as the next character is part of a regular word. + */ +void mrk_lexer_advance_word(mrk_lexer *lexer); + /** * Reset the lexer's current token context. 
*/ @@ -66,4 +71,9 @@ void mrk_lexer_reset(mrk_lexer *lexer); */ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type); +/** + * Returns whether the given character is a special character, and thus + * cannot be part of a regular word. + */ +bool mrk_is_special_char(char c); #endif diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 089c576..c4a4fab 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -78,6 +78,12 @@ char mrk_lexer_peek(mrk_lexer *lexer) { return lexer->buf.s[lexer->pos.buf_index]; } +void mrk_lexer_advance_word(mrk_lexer *lexer) { + while (!mrk_is_special_char(mrk_lexer_peek(lexer))) { + mrk_lexer_advance(lexer); + } +} + char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) { // Check whether the lexer would be done in n steps bool done_in_n = false; @@ -201,9 +207,34 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_emit(out, lexer, mrk_token_type_spaces); } } break; + default: { + // Match ordered list headers + if (isdigit(c)) { + mrk_lexer_advance(lexer); + + while (isdigit(mrk_lexer_peek(lexer))) { + mrk_lexer_advance(lexer); + } + + if (mrk_lexer_peek(lexer) == '.') { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number); + } + // Doesn't end with a dot, so it's just a word that happens to start + // with a number + else { + mrk_lexer_advance_word(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_word); + } + } + // Any other special scenarios we simply parse as a word + else { + mrk_lexer_advance_word(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_word); + } + } break; } } return lexer->token.emitted ?
mrk_lexer_err_ok : mrk_lexer_err_done; - ; } diff --git a/src/lexer/util.c b/src/lexer/util.c new file mode 100644 index 0000000..d40b448 --- /dev/null +++ b/src/lexer/util.c @@ -0,0 +1,15 @@ +#include "mrk/lexer_internal.h" + +const char special_chars[] = {'#', '`', '-', '_', '*', '=', '\t', '>', + '!', '[', ']', '(', ')', '\\', '\n', ' '}; +const size_t special_chars_len = sizeof(special_chars); + +bool mrk_is_special_char(char c) { + bool is_special = false; + + for (size_t i = 0; i < special_chars_len && !is_special; i++) { + is_special = c == special_chars[i]; + } + + return is_special; +} diff --git a/test/lexer/lexer.c b/test/lexer/lexer.c index b46985d..c16d423 100644 --- a/test/lexer/lexer.c +++ b/test/lexer/lexer.c @@ -37,18 +37,37 @@ void test_lexer_line_break() { mrk_lexer_open(lxr, buf2, 0); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_space); - TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_space); + TEST_CHECK(t.type == mrk_token_type_spaces); TEST_CHECK(mrk_lexer_done(lxr)); mrk_lexer_free(lxr); +} +void test_lexer_simple1() { + LEXER_INIT(); + + const char *buf = "### hello world\n\nthis is a paragraph"; + mrk_lexer_open(lxr, buf, 0); + + mrk_token t; + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_pounds); + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_spaces); + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_word); + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_spaces); + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_word); + TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); + TEST_CHECK(t.type == mrk_token_type_blank_line); } TEST_LIST = { { "lexer header", test_lexer_header }, { "lexer line break", 
test_lexer_line_break}, + { "lexer simple 1", test_lexer_simple1 }, { NULL, NULL } };