From f65942697ed47b123026340dca33d578b6a5e885 Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Mon, 18 Mar 2024 22:53:39 +0100 Subject: [PATCH] feat(parser): starting to believe in lists --- src/_include/mrk/parser_internal.h | 15 +++ src/parser/parser.c | 184 +++++++++++++++++++++-------- test/parser/specific.c | 6 +- 3 files changed, 153 insertions(+), 52 deletions(-) diff --git a/src/_include/mrk/parser_internal.h b/src/_include/mrk/parser_internal.h index 123f50c..450babd 100644 --- a/src/_include/mrk/parser_internal.h +++ b/src/_include/mrk/parser_internal.h @@ -50,6 +50,12 @@ mrk_err mrk_parser_eat(mrk_token *out, mrk_parser *parser, mrk_token_type type); */ void mrk_parser_indent(mrk_parser *parser); +/** + * Parse all elements on the parser's current indentation level. If the level is + * 0, this will parse the entire file. + */ +mrk_err mrk_parser_parse_indent_block(mrk_parser *parser, mrk_ast_node *parent); + /** * Parse an entire header block. */ @@ -69,6 +75,13 @@ mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser); */ mrk_err mrk_parser_parse_text(mrk_ast_node *out, mrk_parser *parser); +/** + * Parse any tokens that have the same meaning no matter the context and append + * them to the parent. This includes text tokens and links. Only tokens within + * the same indentation level are matched. + */ +mrk_err mrk_parser_parse_common(mrk_parser *parser, mrk_ast_node *parent); + /** * Parse a link construct */ @@ -79,4 +92,6 @@ mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser); */ mrk_err mrk_parser_parse_list(mrk_ast_node *out, mrk_parser *parser); +mrk_err mrk_parser_parse_list_item(mrk_ast_node *out, mrk_parser *parser); + #endif diff --git a/src/parser/parser.c b/src/parser/parser.c index dc8cc8e..d1a4ddb 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -17,12 +17,24 @@ void mrk_parser_open(mrk_parser *parser, mrk_lexer *lexer) { mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) { mrk_ast_node *root; MRK_RES(mrk_ast_node_init(&root)); + mrk_parser_indent(parser); while (!mrk_parser_done(parser)) { - mrk_parser_indent(parser); + MRK_RES(mrk_parser_parse_indent_block(parser, root)); + } + *out = root; + + return mrk_err_ok; +} + +mrk_err mrk_parser_parse_indent_block(mrk_parser *parser, + mrk_ast_node *parent) { + size_t indent = parser->indent; + + while (!mrk_parser_done(parser) && parser->indent == indent) { mrk_ast_node *child; - MRK_RES(mrk_ast_node_child_append(&child, root)); + MRK_RES(mrk_ast_node_child_append(&child, parent)); switch (mrk_parser_peek(parser).type) { case mrk_token_type_header_start: @@ -32,11 +44,15 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) { case mrk_token_type_left_bracket: MRK_RES(mrk_parser_parse_paragraph(child, parser)); break; + case mrk_token_type_list_item_unordered: + case mrk_token_type_list_item_checked: + case mrk_token_type_list_item_unchecked: + case mrk_token_type_list_item_ordered: + MRK_RES(mrk_parser_parse_list(child, parser)); + break; } } - *out = root; - return mrk_err_ok; } @@ -47,35 +63,11 @@ mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser) { out->type = mrk_ast_node_type_header; out->d.header.depth = mrk_token_len(header_token); - size_t header_indent = parser->indent; + // All continuation lines of the header should be one indentation deeper than + // the definition of the header + parser->indent++; - // Parse subsections of header - while (!mrk_parser_done(parser) && parser->indent == header_indent) { - mrk_ast_node *child; - - switch (mrk_parser_peek(parser).type) { - case mrk_token_type_text: - MRK_RES(mrk_ast_node_child_append(&child, out)); - MRK_RES(mrk_parser_parse_text(child, parser)); - break; - // Newlines are interpreted as spaces - case mrk_token_type_newline: - MRK_RES(mrk_ast_node_child_append(&child, out)); - child->type = mrk_ast_node_type_space; - mrk_parser_advance(parser); - mrk_parser_indent(parser); - break; - case mrk_token_type_left_bracket: - MRK_RES(mrk_ast_node_child_append(&child, out)); - MRK_RES(mrk_parser_parse_link(child, parser)); - break; - // Header definition ends at newline - case mrk_token_type_blank_line: - mrk_parser_advance(parser); - mrk_parser_indent(parser); - return mrk_err_ok; - } - } + MRK_RES(mrk_parser_parse_common(parser, out)); return mrk_err_ok; } @@ -91,6 +83,40 @@ mrk_err mrk_parser_parse_text(mrk_ast_node *out, mrk_parser *parser) { return mrk_err_ok; } +mrk_err mrk_parser_parse_common(mrk_parser *parser, mrk_ast_node *parent) { + size_t indent = parser->indent; + + while (!mrk_parser_done(parser) && parser->indent == indent) { + mrk_ast_node *child; + + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_text: + MRK_RES(mrk_ast_node_child_append(&child, parent)); + MRK_RES(mrk_parser_parse_text(child, parser)); + break; + case mrk_token_type_left_bracket: + MRK_RES(mrk_ast_node_child_append(&child, parent)); + MRK_RES(mrk_parser_parse_link(child, parser)); + break; + case mrk_token_type_newline: + MRK_RES(mrk_ast_node_child_append(&child, parent)); + child->type = mrk_ast_node_type_space; + mrk_parser_advance(parser); + mrk_parser_indent(parser); + break; + case mrk_token_type_blank_line: + mrk_parser_advance(parser); + mrk_parser_indent(parser); + return mrk_err_ok; + // Any other tokens aren't part of the common section so we just exit + default: + return mrk_err_ok; + } + } + + return mrk_err_ok; +} + mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser) { out->type = mrk_ast_node_type_link; @@ -132,29 +158,56 @@ mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser) { mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser) { out->type = mrk_ast_node_type_paragraph; + MRK_RES(mrk_parser_parse_common(parser, out)); + + return mrk_err_ok; +} + +mrk_err mrk_parser_parse_list(mrk_ast_node *out, mrk_parser *parser) { + out->type = mrk_ast_node_type_list; + size_t indent = parser->indent; - while (!mrk_parser_done(parser) && parser->indent == indent) { - mrk_ast_node *child; + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_list_item_unordered: + case mrk_token_type_list_item_checked: + case mrk_token_type_list_item_unchecked: + out->d.list.ordered = false; + break; + case mrk_token_type_list_item_ordered: + out->d.list.ordered = true; + break; + // TODO error on default + } + mrk_ast_node *child; + MRK_RES(mrk_ast_node_child_append(&child, out)); + MRK_RES(mrk_parser_parse_list_item(child, parser)); + + while (!mrk_parser_done(parser) && parser->indent == indent) { switch (mrk_parser_peek(parser).type) { - case mrk_token_type_text: + case mrk_token_type_list_item_unordered: + case mrk_token_type_list_item_checked: + case mrk_token_type_list_item_unchecked: + // Two lists of different types can follow up on each other + if (out->d.list.ordered) { + return mrk_err_ok; + } + MRK_RES(mrk_ast_node_child_append(&child, out)); - MRK_RES(mrk_parser_parse_text(child, parser)); + MRK_RES(mrk_parser_parse_list_item(child, parser)); break; - case mrk_token_type_left_bracket: + case mrk_token_type_list_item_ordered: + // Two lists of different types can follow up on each other + if (!out->d.list.ordered) { + return mrk_err_ok; + } + MRK_RES(mrk_ast_node_child_append(&child, out)); - MRK_RES(mrk_parser_parse_link(child, parser)); + MRK_RES(mrk_parser_parse_list_item(child, parser)); break; - case mrk_token_type_newline: - MRK_RES(mrk_ast_node_child_append(&child, out)); - child->type = mrk_ast_node_type_space; - mrk_parser_advance(parser); - mrk_parser_indent(parser); - break; - case mrk_token_type_blank_line: - mrk_parser_advance(parser); - mrk_parser_indent(parser); + // Other tokens imply end of list + default: return mrk_err_ok; } } @@ -162,6 +215,39 @@ mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser) { return mrk_err_ok; } -mrk_err mrk_parser_parse_list(mrk_ast_node *out, mrk_parser *parser) { - return mrk_err_invalid; +mrk_err mrk_parser_parse_list_item(mrk_ast_node *out, mrk_parser *parser) { + out->type = mrk_ast_node_type_list_item; + + size_t indent = parser->indent; + + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_list_item_unordered: + case mrk_token_type_list_item_ordered: + out->d.list_item.checkbox = false; + out->d.list_item.checked = false; + break; + case mrk_token_type_list_item_checked: + out->d.list_item.checkbox = true; + out->d.list_item.checked = true; + break; + case mrk_token_type_list_item_unchecked: + out->d.list_item.checkbox = true; + out->d.list_item.checked = false; + break; + // This path should never be taken + default:; + } + + mrk_parser_advance(parser); + + // Parse the textual part of the list + parser->indent++; + MRK_RES(mrk_parser_parse_common(parser, out)); + + // This allows lists to be arbitrarily nested + if (!mrk_parser_done(parser) && parser->indent > indent) { + MRK_RES(mrk_parser_parse_indent_block(parser, out)); + } + + return mrk_err_ok; } diff --git a/test/parser/specific.c b/test/parser/specific.c index efc697a..fa376f2 100644 --- a/test/parser/specific.c +++ b/test/parser/specific.c @@ -58,7 +58,7 @@ void test_parse_link() { } void test_parse_unordered_list() { - const char *buf = "* element one\n* element two\n paragraph in element two"; + const char *buf = "* element one\n* element two\n\n paragraph in element two"; PARSER_OPEN(buf); mrk_ast_node *list; @@ -94,8 +94,8 @@ void test_parse_unordered_list() { TEST_CHECK(subchild->children.len == 1); TEST_CHECK(subchild->children.arr[0]->type == mrk_ast_node_type_text); - TEST_CHECK(subchild->children.arr[0]->d.text.start == 31); - TEST_CHECK(subchild->children.arr[0]->d.text.end == 55); + TEST_CHECK(subchild->children.arr[0]->d.text.start == 33); + TEST_CHECK(subchild->children.arr[0]->d.text.end == 57); } TEST_LIST = {