From f5b32354556fe0d3afe2224f6d5fb2afcc5d84ff Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Sun, 10 Mar 2024 22:54:13 +0100 Subject: [PATCH] feat(lexer): restructure lexer for hopefully better results --- include/mrk/lexer.h | 31 +-- src/_include/mrk/lexer_internal.h | 1 + src/lexer/lexer.c | 322 ++++++++++++++---------------- src/parser/parser.c | 22 +- test/lexer/lexer.c | 16 +- 5 files changed, 187 insertions(+), 205 deletions(-) diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h index bb181c0..525f0bb 100644 --- a/include/mrk/lexer.h +++ b/include/mrk/lexer.h @@ -5,6 +5,9 @@ #include "mrk/common.h" +#define MRK_MAX_HEADER_LEN 6 +#define MRK_MIN_HORIZ_RULE_LEN 3 + typedef struct mrk_lexer mrk_lexer; typedef enum mrk_lexer_err { @@ -15,29 +18,31 @@ typedef enum mrk_lexer_err { typedef enum mrk_token_type { mrk_token_type_none = 0, - mrk_token_type_pounds, - mrk_token_type_backticks, - mrk_token_type_dashes, - mrk_token_type_underscores, - mrk_token_type_stars, + mrk_token_type_backtick, + mrk_token_type_triple_backtick, + mrk_token_type_dash, + mrk_token_type_underscore, + mrk_token_type_double_underscore, + mrk_token_type_star, + mrk_token_type_double_star, mrk_token_type_equals, - mrk_token_type_blank_line, mrk_token_type_newline, - mrk_token_type_spaces, mrk_token_type_line_break, mrk_token_type_right_angle_brackets, - mrk_token_type_tabs, mrk_token_type_left_bracket, mrk_token_type_right_bracket, - mrk_token_type_bang, mrk_token_type_left_paren, mrk_token_type_right_paren, + mrk_token_type_bang, mrk_token_type_backslash, - mrk_token_type_dotted_number, - mrk_token_type_word, - mrk_token_type_checked_box, - mrk_token_type_unchecked_box, mrk_token_type_text, + mrk_token_type_header_start, + mrk_token_type_horizontal_rule, + mrk_token_type_indent, + mrk_token_type_list_item_unordered, + mrk_token_type_list_item_ordered, + mrk_token_type_list_item_checked, + mrk_token_type_list_item_unchecked, } mrk_token_type; typedef struct mrk_token { diff --git a/src/_include/mrk/lexer_internal.h b/src/_include/mrk/lexer_internal.h index 0eca5f6..08cdea9 100644 --- a/src/_include/mrk/lexer_internal.h +++ b/src/_include/mrk/lexer_internal.h @@ -18,6 +18,7 @@ struct mrk_lexer { size_t end; bool emitted; } token; + mrk_token_type last_emitted; }; /** diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index 20efe96..770e559 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -26,6 +26,7 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) { lexer->token.start = 0; lexer->token.end = 0; lexer->token.emitted = false; + lexer->last_emitted = mrk_token_type_none; } bool mrk_lexer_done(const mrk_lexer *lexer) { @@ -42,7 +43,7 @@ char mrk_lexer_advance(mrk_lexer *lexer) { // A newline is still part of the previous line, so if the last character was // a newline, we now go to the next line - if (lexer->buf.s[lexer->pos.buf_index] == '\0') { + if (c == '\n') { lexer->pos.line++; lexer->pos.line_index = 0; } else { @@ -127,58 +128,96 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) { out->end = lexer->token.end; lexer->token.emitted = true; + lexer->last_emitted = type; } -void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) { +/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */ +/* const char */ +/* /1* while (!mrk_lexer_done(lexer)) { *1/ */ +/* /1* /2* switch () *2/ *1/ */ +/* /1* } *1/ */ +/* } */ + +void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { char c = mrk_lexer_advance(lexer); + switch (c) { - // All these characters have multiple meanings depending on their location - // in the file and how many there are, so the lexer can only match them as - // one or more grouped characters + // Headers case '#': mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_pounds); - break; - case '`': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_backticks); + + if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) { + mrk_lexer_emit(out, lexer, mrk_token_type_header_start); + } else { + // TODO match rest of text and emit + } break; case '-': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_dashes); + if (mrk_lexer_peek(lexer) == ' ') { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); + } else { + mrk_lexer_advance_eq(lexer, c); + + if (lexer->token.end - lexer->token.start >= MRK_MIN_HORIZ_RULE_LEN && + mrk_lexer_peek(lexer) == '\n') { + mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); + } else { + // TODO match rest of text and emit + } + } + break; + case '+': + if (mrk_lexer_peek(lexer) == ' ') { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); + } else { + // TODO match rest of text and emit + } break; case '_': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_underscores); - break; - case '*': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_stars); - break; - case '=': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_equals); - break; - case '\t': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_tabs); - break; + case '*': { + if (mrk_lexer_peek(lexer) == ' ') { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); + } else { + // We first check if the entire line consists of stars; otherwise, we + // match it as a regular single or double star + size_t i = 0; + + while (mrk_lexer_peek_n(lexer, i) == c) { + i++; + } + + if (mrk_lexer_peek_n(lexer, i) == '\n' && + (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) { + mrk_lexer_advance_n(lexer, i + 1); + mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); + } else if (mrk_lexer_peek(lexer) == c) { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, + c == '_' ? mrk_token_type_double_underscore + : mrk_token_type_double_star); + } else { + mrk_lexer_emit(out, lexer, + c == '_' ? mrk_token_type_underscore + : mrk_token_type_star); + } + } + } break; case '>': mrk_lexer_advance_eq(lexer, c); mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets); break; - case '!': - mrk_lexer_emit(out, lexer, mrk_token_type_bang); - break; case '[': // Checkboxes for lists are lexed separately to simplify the parser later // on if (mrk_lexer_peek_str(lexer, " ]")) { mrk_lexer_advance_n(lexer, 2); - mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked); } else if (mrk_lexer_peek_str(lexer, "x]")) { mrk_lexer_advance_n(lexer, 2); - mrk_lexer_emit(out, lexer, mrk_token_type_checked_box); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked); } else { mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); } @@ -193,190 +232,127 @@ void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); break; case '\\': + // TODO better handle escaped elements if (mrk_lexer_peek(lexer) == '\n') { mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { mrk_lexer_emit(out, lexer, mrk_token_type_backslash); } break; - // Two consecutive newlines constitute a blank line, otherwise they're - // ignored as whitespace - case '\n': - if (mrk_lexer_peek(lexer) == '\n') { - mrk_lexer_advance(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_blank_line); - } else { - mrk_lexer_emit(out, lexer, mrk_token_type_newline); - } - break; case ' ': { + // Indents consist of four spaces + if (mrk_lexer_peek_str(lexer, " ")) { + mrk_lexer_advance_n(lexer, 3); + mrk_lexer_emit(out, lexer, mrk_token_type_indent); + } // Either a double space or a line break - if (mrk_lexer_peek_str(lexer, " \n")) { + else if (mrk_lexer_peek_str(lexer, " \n")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { - mrk_lexer_advance_eq(lexer, ' '); - mrk_lexer_emit(out, lexer, mrk_token_type_spaces); + /* mrk_lexer_advance_eq(lexer, ' '); */ + /* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */ + // TODO match rest of text and emir } } break; + case '\n': + mrk_lexer_emit(out, lexer, mrk_token_type_newline); + break; + case '\t': + mrk_lexer_emit(out, lexer, mrk_token_type_indent); + break; default: { // Match ordered list headers if (isdigit(c)) { - mrk_lexer_advance(lexer); - while (isdigit(mrk_lexer_peek(lexer))) { mrk_lexer_advance(lexer); } if (mrk_lexer_peek(lexer) == '.') { mrk_lexer_advance(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number); + mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered); } // Doesn't end with a dot, so it's just a word that happens to start // with a number else { - mrk_lexer_advance_word(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_word); + // TODO lex text and emit } } // Any other special scenarios we simply parse as a word else { - mrk_lexer_advance_word(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_word); + // TODO lex text and emit } } break; } } +void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) { + char c = mrk_lexer_advance(lexer); + + switch (c) { + case '*': + case '_': + if (mrk_lexer_peek(lexer) == c) { + mrk_lexer_advance(lexer); + mrk_lexer_emit(out, lexer, + c == '_' ? mrk_token_type_double_underscore + : mrk_token_type_double_star); + } else { + mrk_lexer_emit(out, lexer, + c == '_' ? mrk_token_type_underscore + : mrk_token_type_star); + } + break; + case '[': + mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); + break; + case ']': + mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket); + break; + case '(': + mrk_lexer_emit(out, lexer, mrk_token_type_left_paren); + break; + case ')': + mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); + break; + case '\\': + // TODO better handle escaped characters + mrk_lexer_emit(out, lexer, mrk_token_type_backslash); + break; + case '\n': + mrk_lexer_emit(out, lexer, mrk_token_type_newline); + break; + case '!': + mrk_lexer_emit(out, lexer, mrk_token_type_bang); + case ' ': { + if (mrk_lexer_peek_str(lexer, " \n")) { + mrk_lexer_advance_n(lexer, 2); + + mrk_lexer_emit(out, lexer, mrk_token_type_line_break); + } else { + // TODO match rest of text and emir + } + } break; + default: + // TODO lex text and emit + break; + } +} + mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) { if (mrk_lexer_done(lexer)) { return mrk_lexer_err_done; } - mrk_lexer_reset(lexer); - - while (!lexer->token.emitted && !mrk_lexer_done(lexer)) { - char c = mrk_lexer_advance(lexer); - switch (c) { - // All these characters have multiple meanings depending on their location - // in the file and how many there are, so the lexer can only match them as - // one or more grouped characters - case '#': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_pounds); - break; - case '`': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_backticks); - break; - case '-': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_dashes); - break; - case '_': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_underscores); - break; - case '*': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_stars); - break; - case '=': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_equals); - break; - case '\t': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_tabs); - break; - case '>': - mrk_lexer_advance_eq(lexer, c); - mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets); - break; - case '!': - mrk_lexer_emit(out, lexer, mrk_token_type_bang); - break; - case '[': - // Checkboxes for lists are lexed separately to simplify the parser later - // on - if (mrk_lexer_peek_str(lexer, " ]")) { - mrk_lexer_advance_n(lexer, 2); - mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box); - } else if (mrk_lexer_peek_str(lexer, "x]")) { - mrk_lexer_advance_n(lexer, 2); - mrk_lexer_emit(out, lexer, mrk_token_type_checked_box); - } else { - mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); - } - break; - case ']': - mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket); - break; - case '(': - mrk_lexer_emit(out, lexer, mrk_token_type_left_paren); - break; - case ')': - mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); - break; - case '\\': - if (mrk_lexer_peek(lexer) == '\n') { - mrk_lexer_emit(out, lexer, mrk_token_type_line_break); - } else { - mrk_lexer_emit(out, lexer, mrk_token_type_backslash); - } - break; - // Two consecutive newlines constitute a blank line, otherwise they're - // ignored as whitespace - case '\n': - if (mrk_lexer_peek(lexer) == '\n') { - mrk_lexer_advance(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_blank_line); - } else { - mrk_lexer_emit(out, lexer, mrk_token_type_newline); - } - break; - case ' ': { - // Either a double space or a line break - if (mrk_lexer_peek_str(lexer, " \n")) { - mrk_lexer_advance_n(lexer, 2); - - mrk_lexer_emit(out, lexer, mrk_token_type_line_break); - } else { - mrk_lexer_advance_eq(lexer, ' '); - mrk_lexer_emit(out, lexer, mrk_token_type_spaces); - } - } break; - default: { - // Match ordered list headers - if (isdigit(c)) { - mrk_lexer_advance(lexer); - - while (isdigit(mrk_lexer_peek(lexer))) { - mrk_lexer_advance(lexer); - } - - if (mrk_lexer_peek(lexer) == '.') { - mrk_lexer_advance(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number); - } - // Doesn't end with a dot, so it's just a word that happens to start - // with a number - else { - mrk_lexer_advance_word(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_word); - } - } - // Any other special scenarios we simply parse as a word - else { - mrk_lexer_advance_word(lexer); - mrk_lexer_emit(out, lexer, mrk_token_type_word); - } - } break; - } + if (lexer->pos.line_index == 0 || + lexer->last_emitted == mrk_token_type_indent) { + mrk_lexer_lex_start_of_line(out, lexer); + } else { + mrk_lexer_lex_middle_of_line(out, lexer); } - return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done; + return mrk_lexer_err_ok; } size_t mrk_token_len(mrk_token t) { return t.end - t.start; } diff --git a/src/parser/parser.c b/src/parser/parser.c index 9d8a369..046d83c 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -31,12 +31,12 @@ mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser) { mrk_err (*parse_fn)(mrk_ast_node **, mrk_parser *) = NULL; - switch (t.type) { - case mrk_token_type_pounds: { - parse_fn = mrk_parser_parse_header; - break; - } - } + /* switch (t.type) { */ + /* case mrk_token_type_pounds: { */ + /* parse_fn = mrk_parser_parse_header; */ + /* break; */ + /* } */ + /* } */ if (parse_fn == NULL) { MRK_PARSE_ERR(parser, t, "Unexpected token."); @@ -61,10 +61,12 @@ mrk_err mrk_parser_parse_header(mrk_ast_node **out, mrk_parser *parser) { header->args[0].num = mrk_token_len(t); // Headers are blocks of their own, so they're delimited by blank lines - while (!mrk_parser_done(parser) && - (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) { - switch (t.type) { /* case */ } - } + /* while (!mrk_parser_done(parser) && */ + /* (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) { + */ + /* switch (t.type) { /1* case *1/ */ + /* } */ + /* } */ // Skip blank line mrk_parser_advance(parser); diff --git a/test/lexer/lexer.c b/test/lexer/lexer.c index c16d423..6a8e61c 100644 --- a/test/lexer/lexer.c +++ b/test/lexer/lexer.c @@ -14,7 +14,7 @@ void test_lexer_header() { mrk_token t; TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_pounds); + TEST_CHECK(t.type == mrk_token_type_header_start); TEST_CHECK_(t.start == 0, "t.start == %lu", t.start); TEST_CHECK(t.end == 4); @@ -37,7 +37,7 @@ void test_lexer_line_break() { mrk_lexer_open(lxr, buf2, 0); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_spaces); + TEST_CHECK(t.type == mrk_token_type_text); TEST_CHECK(mrk_lexer_done(lxr)); @@ -52,17 +52,15 @@ void test_lexer_simple1() { mrk_token t; TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_pounds); + TEST_CHECK(t.type == mrk_token_type_header_start); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_spaces); + TEST_CHECK(t.type == mrk_token_type_text); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_word); + TEST_CHECK(t.type == mrk_token_type_newline); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_spaces); + TEST_CHECK(t.type == mrk_token_type_newline); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_word); - TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); - TEST_CHECK(t.type == mrk_token_type_blank_line); + TEST_CHECK(t.type == mrk_token_type_text); } TEST_LIST = {