feat(parser): restructuring, hopefully start TDD, link parser

main
Jef Roosens 2024-03-15 21:59:34 +01:00
parent 184cc79a4c
commit 812d6a0733
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
5 changed files with 165 additions and 83 deletions

View File

@ -13,6 +13,8 @@ typedef enum mrk_ast_node_type {
mrk_ast_node_type_header, mrk_ast_node_type_header,
mrk_ast_node_type_text, mrk_ast_node_type_text,
mrk_ast_node_type_space, mrk_ast_node_type_space,
mrk_ast_node_type_link,
mrk_ast_node_type_paragraph,
} mrk_ast_node_type; } mrk_ast_node_type;
typedef struct mrk_ast_node { typedef struct mrk_ast_node {
@ -21,11 +23,19 @@ typedef struct mrk_ast_node {
size_t len; size_t len;
} children; } children;
mrk_ast_node_type type; mrk_ast_node_type type;
union {
struct { struct {
void *ptr; size_t depth;
size_t num; } header;
bool state; struct {
} args[MRK_AST_NODE_ARGS]; size_t start;
size_t end;
} text;
struct {
size_t url_start;
size_t url_end;
} link;
} d;
} mrk_ast_node; } mrk_ast_node;
/** /**

View File

@ -48,8 +48,19 @@ mrk_err mrk_parser_eat(mrk_token *out, mrk_parser *parser, mrk_token_type type);
mrk_err mrk_parser_parse_block(mrk_ast_node *out, mrk_parser *parser); mrk_err mrk_parser_parse_block(mrk_ast_node *out, mrk_parser *parser);
/**
* Parse an entire header block.
*/
mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser); mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser);
/**
* Parse a paragraph. Note that a paragraph does not necessarily end with a
* blank line, but rather when it either encounters a blank line, end of file,
* or some other token that indicates a new block should start (e.g. start of a
* list).
*/
mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser);
/** /**
* Parse a text token, as well as any following text tokens, delimited by a * Parse a text token, as well as any following text tokens, delimited by a
* single newline which will be converted to a space. * single newline which will be converted to a space.

View File

@ -22,9 +22,13 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) {
while (!mrk_parser_done(parser)) { while (!mrk_parser_done(parser)) {
mrk_ast_node *child; mrk_ast_node *child;
MRK_RES(mrk_ast_node_child_append(&child, root)); MRK_RES(mrk_ast_node_child_append(&child, root));
MRK_RES(mrk_parser_parse_block(child, parser));
switch (mrk_parser_peek(parser).type) {
case mrk_token_type_header_start:
MRK_RES(mrk_parser_parse_header(child, parser));
break;
}
} }
*out = root; *out = root;
@ -32,62 +36,38 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) {
return mrk_err_ok; return mrk_err_ok;
} }
/**
 * Parse a single top-level block into `out`, then consume the blank line that
 * follows it unless the input is exhausted.
 *
 * Only header blocks are recognised; for any other leading token nothing is
 * parsed before the blank-line check.
 */
mrk_err mrk_parser_parse_block(mrk_ast_node *out, mrk_parser *parser) {
  const mrk_token next = mrk_parser_peek(parser);

  if (next.type == mrk_token_type_header_start) {
    MRK_RES(mrk_parser_parse_header(out, parser));
  }

  // Nothing left to read, so there is no separator to consume.
  if (mrk_parser_done(parser)) {
    return mrk_err_ok;
  }

  // Blocks are separated by a blank line.
  MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_blank_line));

  return mrk_err_ok;
}
/* mrk_err mrk_parser_parse_ */
mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser) { mrk_err mrk_parser_parse_header(mrk_ast_node *out, mrk_parser *parser) {
mrk_token header_token; mrk_token header_token;
mrk_parser_eat(&header_token, parser, mrk_token_type_header_start); mrk_parser_eat(&header_token, parser, mrk_token_type_header_start);
out->type = mrk_ast_node_type_header; out->type = mrk_ast_node_type_header;
out->args[0].num = mrk_token_len(header_token); out->d.header.depth = mrk_token_len(header_token);
// Parse subsections of header // Parse subsections of header
while (!mrk_parser_done(parser) && while (!mrk_parser_done(parser)) {
mrk_parser_peek(parser).type != mrk_token_type_blank_line) {
mrk_ast_node *child; mrk_ast_node *child;
MRK_RES(mrk_ast_node_child_append(&child, out));
switch (mrk_parser_peek(parser).type) { switch (mrk_parser_peek(parser).type) {
case mrk_token_type_text: case mrk_token_type_text:
MRK_RES(mrk_ast_node_child_append(&child, out));
MRK_RES(mrk_parser_parse_text(child, parser)); MRK_RES(mrk_parser_parse_text(child, parser));
break; break;
// Newlines are interpreted as spaces // Newlines are interpreted as spaces
case mrk_token_type_newline: case mrk_token_type_newline:
MRK_RES(mrk_ast_node_child_append(&child, out));
child->type = mrk_ast_node_type_space; child->type = mrk_ast_node_type_space;
mrk_parser_advance(parser); mrk_parser_advance(parser);
break; break;
case mrk_token_type_left_bracket: case mrk_token_type_left_bracket:
mrk_parser_parse_link(child, parser); MRK_RES(mrk_ast_node_child_append(&child, out));
MRK_RES(mrk_parser_parse_link(child, parser));
break; break;
} // Header definition ends at newline
} case mrk_token_type_blank_line:
// Headers are blocks of their own, so they're delimited by blank lines
/* while (!mrk_parser_done(parser) && */
/* (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) {
*/
/* switch (t.type) { /1* case *1/ */
/* } */
/* } */
// Skip blank line
mrk_parser_advance(parser); mrk_parser_advance(parser);
return mrk_err_ok;
}
}
return mrk_err_ok; return mrk_err_ok;
} }
@ -97,14 +77,75 @@ mrk_err mrk_parser_parse_text(mrk_ast_node *out, mrk_parser *parser) {
MRK_RES(mrk_parser_eat(&text_token, parser, mrk_token_type_text)); MRK_RES(mrk_parser_eat(&text_token, parser, mrk_token_type_text));
out->type = mrk_ast_node_type_text; out->type = mrk_ast_node_type_text;
// Start in input buffer out->d.text.start = text_token.start;
out->args[0].num = text_token.start; out->d.text.end = text_token.end;
// End in input buffer
out->args[1].num = text_token.end;
return mrk_err_ok; return mrk_err_ok;
} }
/**
 * Parse an inline link of the form `[text](url)`.
 *
 * On success `out` is a link node with exactly one text child, and
 * `out->d.link.url_start` / `url_end` hold the start and end of the URL text
 * token.
 *
 * @param out node to fill in; its type is set to mrk_ast_node_type_link
 * @param parser parser whose next token is expected to be the left bracket
 * @return mrk_err_ok on success. mrk_err_invalid (with parser->error.msg set)
 * if the input ends right after the opening bracket or the bracket is not
 * followed by text. Other failures come from the MRK_RES-wrapped calls, which
 * are assumed to return the callee's error early.
 */
mrk_err mrk_parser_parse_link(mrk_ast_node *out, mrk_parser *parser) {
  out->type = mrk_ast_node_type_link;

  MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_left_bracket));

  if (mrk_parser_done(parser)) {
    parser->error.msg = "Unclosed brackets";

    return mrk_err_invalid;
  }

  switch (mrk_parser_peek(parser).type) {
  case mrk_token_type_text: {
    // Only append the child once we know there is text to put in it, so a
    // malformed link never leaves an uninitialised node in the tree.
    mrk_ast_node *child;
    MRK_RES(mrk_ast_node_child_append(&child, out));
    MRK_RES(mrk_parser_parse_text(child, parser));
    break;
  }
  // TODO allow other types of text (e.g. cursive)
  // TODO image links
  default:
    parser->error.msg = "Expected link text";

    return mrk_err_invalid;
  }

  MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_right_bracket));
  MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_left_paren));

  mrk_token url_token;
  MRK_RES(mrk_parser_eat(&url_token, parser, mrk_token_type_text));
  MRK_RES(mrk_parser_eat(NULL, parser, mrk_token_type_right_paren));

  out->d.link.url_start = url_token.start;
  out->d.link.url_end = url_token.end;

  return mrk_err_ok;
}
/**
 * Parse a paragraph into `out`.
 *
 * Text tokens and links become children, and a single newline becomes a space
 * node. The paragraph ends at a blank line (which is consumed), at end of
 * input, or at any token this function does not handle. Such a token is left
 * unconsumed so the caller can decide which block it starts.
 *
 * @param out node to fill in; its type is set to mrk_ast_node_type_paragraph
 * @param parser parser positioned at the first token of the paragraph
 * @return mrk_err_ok when the paragraph ended normally; otherwise the error
 * from a failing MRK_RES-wrapped call (assumed to return early)
 */
mrk_err mrk_parser_parse_paragraph(mrk_ast_node *out, mrk_parser *parser) {
  out->type = mrk_ast_node_type_paragraph;

  while (!mrk_parser_done(parser)) {
    mrk_ast_node *child;

    switch (mrk_parser_peek(parser).type) {
    case mrk_token_type_text:
      MRK_RES(mrk_ast_node_child_append(&child, out));
      MRK_RES(mrk_parser_parse_text(child, parser));
      break;
    case mrk_token_type_left_bracket:
      MRK_RES(mrk_ast_node_child_append(&child, out));
      MRK_RES(mrk_parser_parse_link(child, parser));
      break;
    // Newlines are interpreted as spaces
    case mrk_token_type_newline:
      MRK_RES(mrk_ast_node_child_append(&child, out));
      child->type = mrk_ast_node_type_space;
      mrk_parser_advance(parser);
      break;
    // A blank line closes the paragraph and is consumed here
    case mrk_token_type_blank_line:
      mrk_parser_advance(parser);
      return mrk_err_ok;
    // Any other token is not part of the paragraph. Without this branch the
    // token would never be consumed and the loop would spin forever.
    default:
      return mrk_err_ok;
    }
  }

  return mrk_err_ok;
}

View File

@ -1,38 +0,0 @@
#include "test.h"
#include "mrk/lexer.h"
#include "mrk/parser.h"
// Declares a lexer pointer named `lxr` in the current scope and checks that
// mrk_lexer_init reports mrk_err_ok. Because it expands to a declaration, it
// can be used at most once per scope.
#define LEXER_INIT() \
mrk_lexer *lxr; \
TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok)
// Declares a parser pointer named `psr` in the current scope and checks that
// mrk_parser_init reports mrk_err_ok.
#define PARSER_INIT() \
mrk_parser *psr; \
TEST_CHECK(mrk_parser_init(&psr) == mrk_err_ok)
// Declares and initialises both `lxr` and `psr`, opens the lexer on `buf` and
// attaches the parser to that lexer. It declares the same names as
// LEXER_INIT/PARSER_INIT, so it cannot be combined with them in one scope.
// NOTE(review): the meaning of the third mrk_lexer_open argument (0) is not
// visible here -- confirm against the lexer API.
#define PARSER_OPEN(buf) \
mrk_lexer *lxr; \
TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok); \
mrk_parser *psr; \
TEST_CHECK(mrk_parser_init(&psr) == mrk_err_ok); \
mrk_lexer_open(lxr, buf, 0); \
mrk_parser_open(psr, lxr)
void test_parse_header() {
const char *buf = "### hello world";
PARSER_OPEN(buf);
mrk_ast_node *root;
TEST_CHECK(mrk_parser_parse(&root, psr) == mrk_err_ok);
mrk_ast_node *header = root->children.arr[0];
TEST_CHECK(header->type == mrk_ast_node_type_header);
TEST_CHECK(header->children.arr[0]->type == mrk_ast_node_type_text);
}
// Table of tests: each entry pairs a display name with the function to run.
// The trailing { NULL, NULL } entry is presumably the end-of-list sentinel
// expected by the harness that defines TEST_LIST -- confirm against test.h.
TEST_LIST = {
{ "parser header", test_parse_header },
{ NULL, NULL }
};

View File

@ -0,0 +1,58 @@
#include "test.h"
#include "mrk/lexer.h"
#include "mrk/parser_internal.h"
// Declares a lexer pointer named `lxr` in the current scope and checks that
// mrk_lexer_init reports mrk_err_ok. Because it expands to a declaration, it
// can be used at most once per scope.
#define LEXER_INIT() \
mrk_lexer *lxr; \
TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok)
// Declares a parser pointer named `parser` in the current scope and checks
// that mrk_parser_init reports mrk_err_ok.
#define PARSER_INIT() \
mrk_parser *parser; \
TEST_CHECK(mrk_parser_init(&parser) == mrk_err_ok)
// Declares and initialises both `lxr` and `parser`, opens the lexer on `buf`
// and attaches the parser to that lexer. It declares the same names as
// LEXER_INIT/PARSER_INIT, so it cannot be combined with them in one scope.
// NOTE(review): the meaning of the third mrk_lexer_open argument (0) is not
// visible here -- confirm against the lexer API.
#define PARSER_OPEN(buf) \
mrk_lexer *lxr; \
TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok); \
mrk_parser *parser; \
TEST_CHECK(mrk_parser_init(&parser) == mrk_err_ok); \
mrk_lexer_open(lxr, buf, 0); \
mrk_parser_open(parser, lxr)
void test_parse_header() {
const char *buf = "### hello world";
PARSER_OPEN(buf);
mrk_ast_node *root;
TEST_CHECK(mrk_parser_parse(&root, parser) == mrk_err_ok);
mrk_ast_node *header = root->children.arr[0];
TEST_CHECK(header->type == mrk_ast_node_type_header);
TEST_CHECK(header->children.arr[0]->type == mrk_ast_node_type_text);
}
void test_parse_link() {
const char *buf = "[hello world](https://example.com)";
PARSER_OPEN(buf);
mrk_ast_node *link;
mrk_ast_node_init(&link);
TEST_CHECK(mrk_parser_parse_link(link, parser) == mrk_err_ok);
TEST_CHECK(link->type == mrk_ast_node_type_link);
TEST_CHECK(link->d.link.url_start == 14);
TEST_CHECK(link->d.link.url_end == 33);
TEST_CHECK(link->children.len == 1);
mrk_ast_node *link_text = link->children.arr[0];
TEST_CHECK(link_text->type == mrk_ast_node_type_text);
TEST_CHECK(link_text->d.text.start == 1);
TEST_CHECK(link_text->d.text.end == 12);
}
// Table of tests: each entry pairs a display name with the function to run.
// The trailing { NULL, NULL } entry is presumably the end-of-list sentinel
// expected by the harness that defines TEST_LIST -- confirm against test.h.
TEST_LIST = {
{ "parser header", test_parse_header },
{ "parser link", test_parse_link },
{ NULL, NULL }
};