From 812d6a0733468907c77fc99487b5d83cf2eae577 Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Fri, 15 Mar 2024 21:59:34 +0100 Subject: [PATCH] feat(parser): restructuring, hopefully start TDD, link parser --- include/mrk/ast.h | 20 +++-- src/_include/mrk/parser_internal.h | 11 +++ src/parser/parser.c | 121 +++++++++++++++++++---------- test/parser/parser.c | 38 --------- test/parser/specific.c | 58 ++++++++++++++ 5 files changed, 165 insertions(+), 83 deletions(-) delete mode 100644 test/parser/parser.c create mode 100644 test/parser/specific.c diff --git a/include/mrk/ast.h b/include/mrk/ast.h index 8d2bfb2..6d924cd 100644 --- a/include/mrk/ast.h +++ b/include/mrk/ast.h @@ -13,6 +13,8 @@ typedef enum mrk_ast_node_type { mrk_ast_node_type_header, mrk_ast_node_type_text, mrk_ast_node_type_space, + mrk_ast_node_type_link, + mrk_ast_node_type_paragraph, } mrk_ast_node_type; typedef struct mrk_ast_node { @@ -21,11 +23,19 @@ typedef struct mrk_ast_node { size_t len; } children; mrk_ast_node_type type; - struct { - void *ptr; - size_t num; - bool state; - } args[MRK_AST_NODE_ARGS]; + union { + struct { + size_t depth; + } header; + struct { + size_t start; + size_t end; + } text; + struct { + size_t url_start; + size_t url_end; + } link; + } d; } mrk_ast_node; /** diff --git a/src/_include/mrk/parser_internal.h b/src/_include/mrk/parser_internal.h index 3d1219e..8722bf5 100644 --- a/src/_include/mrk/parser_internal.h +++ b/src/_include/mrk/parser_internal.h @@ -48,8 +48,19 @@ mrk_err mrk_parser_eat(mrk_token *out, mrk_parser *parser, mrk_token_type type); mrk_err mrk_parser_parse_block(mrk_ast_node *out, mrk_parser *parser); +/** + * Parse an entire header block. + */ mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser); +/** + * Parse a paragraph. Note that a paragraph does not necessarily end with a + * blank line, but rather when it either encounters a blank line, end of file, + * or some other token that indicates a new block should start (e.g.
start of a + * list). + */ +mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser); + /** * Parse a text token, as well as any following text tokens, delimited by a * single newline which will be converted to a space. diff --git a/src/parser/parser.c b/src/parser/parser.c index 6653883..dbfeb13 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -22,9 +22,13 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) { while (!mrk_parser_done(parser)) { mrk_ast_node *child; - MRK_RES(mrk_ast_node_child_append(&child, root)); - MRK_RES(mrk_parser_parse_block(child, parser)); + + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_header_start: + MRK_RES(mrk_parser_parse_header(child, parser)); + break; + } } *out = root; @@ -32,63 +36,39 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) { return mrk_err_ok; } -mrk_err mrk_parser_parse_block(mrk_ast_node *out, mrk_parser *parser) { - mrk_token t = mrk_parser_peek(parser); - - switch (t.type) { - case mrk_token_type_header_start: - MRK_RES(mrk_parser_parse_header(out, parser)); - break; - } - - if (!mrk_parser_done(parser)) { - MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_blank_line)); - } - - return mrk_err_ok; -} - -/* mrk_err mrk_parser_parse_ */ - mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser) { mrk_token header_token; mrk_parser_eat(&header_token, parser, mrk_token_type_header_start); out->type = mrk_ast_node_type_header; - out->args[0].num = mrk_token_len(header_token); + out->d.header.depth = mrk_token_len(header_token); // Parse subsections of header - while (!mrk_parser_done(parser) && - mrk_parser_peek(parser).type != mrk_token_type_blank_line) { + while (!mrk_parser_done(parser)) { mrk_ast_node *child; - MRK_RES(mrk_ast_node_child_append(&child, out)); switch (mrk_parser_peek(parser).type) { case mrk_token_type_text: + MRK_RES(mrk_ast_node_child_append(&child, out)); MRK_RES(mrk_parser_parse_text(child, parser)); 
break; // Newlines are interpreted as spaces case mrk_token_type_newline: + MRK_RES(mrk_ast_node_child_append(&child, out)); child->type = mrk_ast_node_type_space; mrk_parser_advance(parser); break; case mrk_token_type_left_bracket: - mrk_parser_parse_link(child, parser); + MRK_RES(mrk_ast_node_child_append(&child, out)); + MRK_RES(mrk_parser_parse_link(child, parser)); break; + // Header definition ends at a blank line + case mrk_token_type_blank_line: + mrk_parser_advance(parser); + return mrk_err_ok; } } - // Headers are blocks of their own, so they're delimited by blank lines - /* while (!mrk_parser_done(parser) && */ - /* (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) { - */ - /* switch (t.type) { /1* case *1/ */ - /* } */ - /* } */ - - // Skip blank line - mrk_parser_advance(parser); - return mrk_err_ok; } @@ -97,14 +77,75 @@ mrk_err mrk_parser_parse_text(mrk_ast_node *out, mrk_parser *parser) { MRK_RES(mrk_parser_eat(&text_token, parser, mrk_token_type_text)); out->type = mrk_ast_node_type_text; - // Start in input buffer - out->args[0].num = text_token.start; - // End in input buffer - out->args[1].num = text_token.end; + out->d.text.start = text_token.start; + out->d.text.end = text_token.end; return mrk_err_ok; } mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser) { + out->type = mrk_ast_node_type_link; + MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_left_bracket)); + + if (mrk_parser_done(parser)) { + parser->error.msg = "Unclosed brackets"; + return mrk_err_invalid; + } + + mrk_ast_node *child; + MRK_RES(mrk_ast_node_child_append(&child, out)); + + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_text: + MRK_RES(mrk_parser_parse_text(child, parser)); + break; + // TODO allow other types of text (e.g.
cursive) + // TODO image links + default: + // TODO throw error + break; + } + + MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_right_bracket)); + MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_left_paren)); + + mrk_token url_token; + MRK_RES(mrk_parser_eat(&url_token, parser, mrk_token_type_text)); + + MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_right_paren)); + + out->d.link.url_start = url_token.start; + out->d.link.url_end = url_token.end; + + return mrk_err_ok; +} + +mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser) { + out->type = mrk_ast_node_type_paragraph; + + while (!mrk_parser_done(parser)) { + mrk_ast_node *child; + + switch (mrk_parser_peek(parser).type) { + case mrk_token_type_text: + MRK_RES(mrk_ast_node_child_append(&child, out)); + MRK_RES(mrk_parser_parse_text(child, parser)); + break; + case mrk_token_type_left_bracket: + MRK_RES(mrk_ast_node_child_append(&child, out)); + MRK_RES(mrk_parser_parse_link(child, parser)); + break; + case mrk_token_type_newline: + MRK_RES(mrk_ast_node_child_append(&child, out)); + child->type = mrk_ast_node_type_space; + mrk_parser_advance(parser); + break; + case mrk_token_type_blank_line: + mrk_parser_advance(parser); + return mrk_err_ok; + } + } + + return mrk_err_ok; } diff --git a/test/parser/parser.c b/test/parser/parser.c deleted file mode 100644 index cb70983..0000000 --- a/test/parser/parser.c +++ /dev/null @@ -1,38 +0,0 @@ -#include "test.h" - -#include "mrk/lexer.h" -#include "mrk/parser.h" - -#define LEXER_INIT() \ - mrk_lexer *lxr; \ - TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok) - -#define PARSER_INIT() \ - mrk_parser *psr; \ - TEST_CHECK(mrk_parser_init(&psr) == mrk_err_ok) - -#define PARSER_OPEN(buf) \ - mrk_lexer *lxr; \ - TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok); \ - mrk_parser *psr; \ - TEST_CHECK(mrk_parser_init(&psr) == mrk_err_ok); \ - mrk_lexer_open(lxr, buf, 0); \ - mrk_parser_open(psr, lxr) - - -void test_parse_header() { - const char *buf = 
"### hello world"; - PARSER_OPEN(buf); - - mrk_ast_node *root; - TEST_CHECK(mrk_parser_parse(&root, psr) == mrk_err_ok); - - mrk_ast_node *header = root->children.arr[0]; - TEST_CHECK(header->type == mrk_ast_node_type_header); - TEST_CHECK(header->children.arr[0]->type == mrk_ast_node_type_text); -} - -TEST_LIST = { - { "parser header", test_parse_header }, - { NULL, NULL } -}; diff --git a/test/parser/specific.c b/test/parser/specific.c new file mode 100644 index 0000000..001c61a --- /dev/null +++ b/test/parser/specific.c @@ -0,0 +1,58 @@ +#include "test.h" + +#include "mrk/lexer.h" +#include "mrk/parser_internal.h" + +#define LEXER_INIT() \ + mrk_lexer *lxr; \ + TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok) + +#define PARSER_INIT() \ + mrk_parser *parser; \ + TEST_CHECK(mrk_parser_init(&parser) == mrk_err_ok) + +#define PARSER_OPEN(buf) \ + mrk_lexer *lxr; \ + TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok); \ + mrk_parser *parser; \ + TEST_CHECK(mrk_parser_init(&parser) == mrk_err_ok); \ + mrk_lexer_open(lxr, buf, 0); \ + mrk_parser_open(parser, lxr) + + +void test_parse_header() { + const char *buf = "### hello world"; + PARSER_OPEN(buf); + + mrk_ast_node *root; + TEST_CHECK(mrk_parser_parse(&root, parser) == mrk_err_ok); + + mrk_ast_node *header = root->children.arr[0]; + TEST_CHECK(header->type == mrk_ast_node_type_header); + TEST_CHECK(header->children.arr[0]->type == mrk_ast_node_type_text); +} + +void test_parse_link() { + const char *buf = "[hello world](https://example.com)"; + PARSER_OPEN(buf); + + mrk_ast_node *link; + mrk_ast_node_init(&link); + TEST_CHECK(mrk_parser_parse_link(link, parser) == mrk_err_ok); + + TEST_CHECK(link->type == mrk_ast_node_type_link); + TEST_CHECK(link->d.link.url_start == 14); + TEST_CHECK(link->d.link.url_end == 33); + TEST_CHECK(link->children.len == 1); + + mrk_ast_node *link_text = link->children.arr[0]; + TEST_CHECK(link_text->type == mrk_ast_node_type_text); + TEST_CHECK(link_text->d.text.start == 1); + 
TEST_CHECK(link_text->d.text.end == 12); +} + +TEST_LIST = { + { "parser header", test_parse_header }, + { "parser link", test_parse_link }, + { NULL, NULL } +};