From c653d437bd565ffe1abe4569ad46e94a395d0e8a Mon Sep 17 00:00:00 2001 From: Jef Roosens Date: Mon, 25 Mar 2024 17:03:00 +0100 Subject: [PATCH] feat: lex and parse single-line code blocks --- include/mrk/ast.h | 1 + include/mrk/parser.h | 5 +- src/_include/mrk/parser_internal.h | 5 ++ src/lexer/lexer.c | 2 - src/parser/err.c | 4 +- src/parser/parser.c | 83 ++++++++++++++++++++++++++++-- test/lexer/lexer.c | 16 ++++++ test/parser/specific.c | 22 ++++++++ 8 files changed, 128 insertions(+), 10 deletions(-) diff --git a/include/mrk/ast.h b/include/mrk/ast.h index ccdbd1f..5da314b 100644 --- a/include/mrk/ast.h +++ b/include/mrk/ast.h @@ -15,6 +15,7 @@ typedef enum mrk_ast_node_type { mrk_ast_node_type_paragraph, mrk_ast_node_type_list, mrk_ast_node_type_list_item, + mrk_ast_node_type_code, } mrk_ast_node_type; typedef struct mrk_ast_node { diff --git a/include/mrk/parser.h b/include/mrk/parser.h index 5f21910..c9b82df 100644 --- a/include/mrk/parser.h +++ b/include/mrk/parser.h @@ -13,8 +13,9 @@ typedef enum mrk_parser_err { mrk_parser_err_ok = 0, mrk_parser_err_unexpected_eat, mrk_parser_err_unexpected_token, - mrk_parser_unclosed_brackets, - mrk_parser_unexpected_path, + mrk_parser_err_unclosed_brackets, + mrk_parser_err_unclosed_backticks, + mrk_parser_err_unexpected_path, } mrk_parser_err; /** diff --git a/src/_include/mrk/parser_internal.h b/src/_include/mrk/parser_internal.h index 247e73e..cf89ef3 100644 --- a/src/_include/mrk/parser_internal.h +++ b/src/_include/mrk/parser_internal.h @@ -92,4 +92,9 @@ mrk_err mrk_parser_parse_list(mrk_ast_node *out, mrk_parser *parser); mrk_err mrk_parser_parse_list_item(mrk_ast_node *out, mrk_parser *parser); +/** + * Parse a single-line code segment. 
+ */ +mrk_err mrk_parser_parse_code(mrk_ast_node *out, mrk_parser *parser); + #endif diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c index c737e2c..595fed8 100644 --- a/src/lexer/lexer.c +++ b/src/lexer/lexer.c @@ -286,7 +286,6 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_triple_backtick); } else { - mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_backtick); } break; @@ -373,7 +372,6 @@ void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) { } } break; case '`': - mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_backtick); break; default: diff --git a/src/parser/err.c b/src/parser/err.c index 920161a..d687ead 100644 --- a/src/parser/err.c +++ b/src/parser/err.c @@ -32,12 +32,12 @@ const char *mrk_parser_err_msg(mrk_parser *parser) { mrk_token_type_names[parser->error.token.type], parser->error.token.type); break; - case mrk_parser_unclosed_brackets: + case mrk_parser_err_unclosed_brackets: sprintf(parser->error.buf, "%lu:%lu: unclosed bracket", parser->error.token.start_line + 1, parser->error.token.start_line_index + 1); break; - case mrk_parser_unexpected_path: + case mrk_parser_err_unexpected_path: return unexpected_path_msg; } diff --git a/src/parser/parser.c b/src/parser/parser.c index 45d0599..d687655 100644 --- a/src/parser/parser.c +++ b/src/parser/parser.c @@ -1,3 +1,5 @@ +#include "mrk/parser.h" +#include "mrk/lexer.h" #include "mrk/parser_internal.h" #include @@ -93,7 +95,9 @@ mrk_err mrk_parser_parse_common(mrk_parser *parser, mrk_ast_node *parent) { while (!mrk_parser_done(parser) && parser->indent == indent) { mrk_ast_node *child; - switch (mrk_parser_peek(parser).type) { + mrk_token token = mrk_parser_peek(parser); + + switch (token.type) { case mrk_token_type_text: MRK_RES(mrk_ast_node_child_append(&child, parent)); MRK_RES(mrk_parser_parse_text(child, parser)); @@ -102,6 +106,11 @@ mrk_err 
mrk_parser_parse_common(mrk_parser *parser, mrk_ast_node *parent) { MRK_RES(mrk_ast_node_child_append(&child, parent)); MRK_RES(mrk_parser_parse_link(child, parser)); break; + case mrk_token_type_backtick: + + MRK_RES(mrk_ast_node_child_append(&child, parent)); + MRK_RES(mrk_parser_parse_code(child, parser)); + break; case mrk_token_type_newline: MRK_RES(mrk_ast_node_child_append(&child, parent)); child->type = mrk_ast_node_type_space; @@ -112,6 +121,16 @@ mrk_err mrk_parser_parse_common(mrk_parser *parser, mrk_ast_node *parent) { mrk_parser_advance(parser); mrk_parser_indent(parser); return mrk_err_ok; + // All these tokens have no special meaning in this context + case mrk_token_type_left_paren: + case mrk_token_type_right_paren: + mrk_parser_advance(parser); + + MRK_RES(mrk_ast_node_child_append(&child, parent)); + child->type = mrk_ast_node_type_text; + child->d.text.start = token.start; + child->d.text.end = token.end; + break; // Any other tokens aren't part of the common section so we just exit default: return mrk_err_ok; @@ -128,7 +147,7 @@ mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser) { MRK_RES(mrk_parser_eat(&left_bracket, parser, mrk_token_type_left_bracket)); if (mrk_parser_done(parser)) { - parser->error.code = mrk_parser_unclosed_brackets; + parser->error.code = mrk_parser_err_unclosed_brackets; parser->error.token = left_bracket; return mrk_err_invalid_md; @@ -185,7 +204,7 @@ mrk_err mrk_parser_parse_list(mrk_ast_node *out, mrk_parser *parser) { out->d.list.ordered = true; break; default: - parser->error.code = mrk_parser_unexpected_path; + parser->error.code = mrk_parser_err_unexpected_path; return mrk_err_invalid_md; } @@ -245,7 +264,7 @@ mrk_err mrk_parser_parse_list_item(mrk_ast_node *out, mrk_parser *parser) { break; // This path should never be taken default: - parser->error.code = mrk_parser_unexpected_path; + parser->error.code = mrk_parser_err_unexpected_path; return mrk_err_invalid_md; } @@ -262,3 +281,59 @@ mrk_err 
mrk_parser_parse_list_item(mrk_ast_node *out, mrk_parser *parser) {
 
   return mrk_err_ok;
 }
+
+// Parse a backtick-delimited code segment; children of `out` hold its content.
+mrk_err mrk_parser_parse_code(mrk_ast_node *out, mrk_parser *parser) {
+  out->type = mrk_ast_node_type_code;
+
+  mrk_token start_backtick;
+  MRK_RES(mrk_parser_eat(&start_backtick, parser, mrk_token_type_backtick));
+  size_t indent = parser->indent;
+
+  while (!mrk_parser_done(parser) && parser->indent == indent) {
+    mrk_ast_node *child;
+    mrk_token token = mrk_parser_peek(parser);
+
+    switch (token.type) {
+    case mrk_token_type_blank_line:
+      // A blank line was reached before the closing backtick
+      parser->error.code = mrk_parser_err_unclosed_backticks;
+      parser->error.token = start_backtick;
+      return mrk_err_invalid_md;
+    case mrk_token_type_backtick:
+      // Closing backtick: consume it and finish the code node
+      mrk_parser_advance(parser);
+      return mrk_err_ok;
+    case mrk_token_type_newline:
+      MRK_RES(mrk_ast_node_child_append(&child, out));
+      child->type = mrk_ast_node_type_space;
+      mrk_parser_advance(parser);
+      mrk_parser_indent(parser);
+      break;
+    default:
+      // Any other token is kept as plain text inside the code node
+      mrk_parser_advance(parser);
+      if (out->children.len == 0 ||
+          out->children.arr[out->children.len - 1]->type !=
+              mrk_ast_node_type_text) {
+        MRK_RES(mrk_ast_node_child_append(&child, out));
+        child->type = mrk_ast_node_type_text;
+        child->d.text.start = token.start;
+        child->d.text.end = token.end;
+      }
+      // Simply append current text to previous one
+      else {
+        child = out->children.arr[out->children.len - 1];
+        child->d.text.end = token.end;
+      }
+      break;
+    }
+  }
+
+  // The loop ended (parser done or indent changed) without a closing backtick
+  // NOTE(review): mrk_parser_err_msg has no case for this error code yet
+  parser->error.code = mrk_parser_err_unclosed_backticks;
+  parser->error.token = start_backtick;
+
+  return mrk_err_invalid_md;
+}
diff --git a/test/lexer/lexer.c b/test/lexer/lexer.c
index 4e3ca51..69285f6 100644
--- a/test/lexer/lexer.c
+++ b/test/lexer/lexer.c
@@ -66,9 +66,25 @@ void test_lexer_simple1() {
   TEST_CHECK(t.type == mrk_token_type_text);
 }
 
+void test_lexer_code() {
+  LEXER_INIT();
+
+  const char *buf = "`world [hello](link)`";
+  mrk_lexer_open(lxr, buf, 0);
+
+  mrk_token t;
+
+  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
+  TEST_CHECK(t.type == mrk_token_type_backtick);
+  TEST_CHECK(t.start == 0);
+  TEST_CHECK(t.end == 1);
+
+}
+
 TEST_LIST = {
   { "lexer header", test_lexer_header },
   { "lexer line break", test_lexer_line_break},
   { "lexer simple 1", test_lexer_simple1 },
+  { "lexer code", test_lexer_code },
   { NULL, NULL }
 };
diff --git a/test/parser/specific.c b/test/parser/specific.c
index fa376f2..de2f742 100644
--- a/test/parser/specific.c
+++ b/test/parser/specific.c
@@ -1,3 +1,4 @@
+#include "mrk/ast.h"
 #include "test.h"
 
 #include "mrk/lexer.h"
@@ -98,9 +99,30 @@ void test_parse_unordered_list() {
   TEST_CHECK(subchild->children.arr[0]->d.text.end == 57);
 }
 
+void test_parse_code() {
+  const char *buf = "`world [hello](link)`";
+  PARSER_OPEN(buf);
+
+  mrk_ast_node *code;
+  mrk_ast_node_init(&code);
+
+  TEST_CHECK(mrk_parser_parse_code(code, parser) == mrk_err_ok);
+
+  TEST_CHECK(code->type == mrk_ast_node_type_code);
+  TEST_CHECK(code->children.len == 1);
+
+  mrk_ast_node *child = code->children.arr[0];
+  TEST_CHECK(child->type == mrk_ast_node_type_text);
+  TEST_CHECK(child->d.text.start == 1);
+  TEST_MSG("start: %lu", child->d.text.start);
+  TEST_CHECK(child->d.text.end == 20);
+  TEST_MSG("end: %lu", child->d.text.end);
+}
+
 TEST_LIST = {
   { "parser header", test_parse_header },
   { "parser link", test_parse_link },
   { "parser unordered list", test_parse_unordered_list },
+  { "parser code", test_parse_code },
   { NULL, NULL }
 };