From 6dba1a8291f03c4155cfd4e4f390b597f49b7a42 Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Mon, 11 Mar 2024 11:11:19 +0100 Subject: [PATCH] feat(lexer): parse sequences of regular text --- include/mrk/lexer.h | 2 -- src/_include/mrk/lexer_internal.h | 17 +++++++++++ src/lexer/lexer.c | 48 ++++++++++++++++++++----------- 3 files changed, 48 insertions(+), 19 deletions(-) diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h index 525f0bb..cfd6269 100644 --- a/include/mrk/lexer.h +++ b/include/mrk/lexer.h @@ -20,12 +20,10 @@ typedef enum mrk_token_type { mrk_token_type_none = 0, mrk_token_type_backtick, mrk_token_type_triple_backtick, - mrk_token_type_dash, mrk_token_type_underscore, mrk_token_type_double_underscore, mrk_token_type_star, mrk_token_type_double_star, - mrk_token_type_equals, mrk_token_type_newline, mrk_token_type_line_break, mrk_token_type_right_angle_brackets, diff --git a/src/_include/mrk/lexer_internal.h b/src/_include/mrk/lexer_internal.h index 08cdea9..d2c9bff 100644 --- a/src/_include/mrk/lexer_internal.h +++ b/src/_include/mrk/lexer_internal.h @@ -78,4 +78,21 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type); * special character. */ bool mrk_is_special_char(char c); + +/** + * Lex a token at the start of a line (including after 1 or more indents). + */ +void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer); + +/** + * Lex a token that starts in the middle of a line. + */ +void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer); + +/** + * Advance the lexer until it encounters a character that's always parsed as a + * separate token, regardless of context (e.g. newlines, brackets). 
+ */ +void mrk_lexer_advance_text(mrk_lexer *lexer); + #endif diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 770e559..8b819a6 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -1,5 +1,6 @@ #include #include +#include #include "mrk/lexer.h" #include "mrk/lexer_internal.h" @@ -131,12 +132,17 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) { lexer->last_emitted = type; } -/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */ -/* const char */ -/* /1* while (!mrk_lexer_done(lexer)) { *1/ */ -/* /1* /2* switch () *2/ *1/ */ -/* /1* } *1/ */ -/* } */ +void mrk_lexer_advance_text(mrk_lexer *lexer) { + const char *special_chars = "*\n[]()\\"; + + while (!mrk_lexer_done(lexer)) { + if (strchr(special_chars, mrk_lexer_peek(lexer)) == NULL) { + mrk_lexer_advance(lexer); + } else { + break; + } + } +} void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { char c = mrk_lexer_advance(lexer); @@ -149,7 +155,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) { mrk_lexer_emit(out, lexer, mrk_token_type_header_start); } else { - // TODO match rest of text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } break; case '-': @@ -163,7 +170,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_peek(lexer) == '\n') { mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); } else { - // TODO match rest of text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; @@ -172,7 +180,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); } else { - // TODO match rest of text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } break; case '_': @@ -251,9 +260,8 @@ void 
mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { - /* mrk_lexer_advance_eq(lexer, ' '); */ - /* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */ - // TODO match rest of text and emir + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; case '\n': @@ -269,19 +277,22 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_advance(lexer); } - if (mrk_lexer_peek(lexer) == '.') { + // Ordered list item numbers should be followed by a dot and then a space + if (mrk_lexer_peek_str(lexer, ". ")) { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered); } // Doesn't end with a dot, so it's just a word that happens to start // with a number else { - // TODO lex text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } } // Any other special scenarios we simply parse as a word else { - // TODO lex text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; } @@ -325,17 +336,20 @@ void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) { break; case '!': mrk_lexer_emit(out, lexer, mrk_token_type_bang); + break; case ' ': { if (mrk_lexer_peek_str(lexer, " \n")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { - // TODO match rest of text and emir + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; default: - // TODO lex text and emit + mrk_lexer_advance_text(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_text); break; } }