feat(lexer): restructure lexer for hopefully better results

main
Jef Roosens 2024-03-10 22:54:13 +01:00
parent 8c0105639f
commit f5b3235455
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
5 changed files with 187 additions and 205 deletions

View File

@ -5,6 +5,9 @@
#include "mrk/common.h" #include "mrk/common.h"
#define MRK_MAX_HEADER_LEN 6
#define MRK_MIN_HORIZ_RULE_LEN 3
typedef struct mrk_lexer mrk_lexer; typedef struct mrk_lexer mrk_lexer;
typedef enum mrk_lexer_err { typedef enum mrk_lexer_err {
@ -15,29 +18,31 @@ typedef enum mrk_lexer_err {
typedef enum mrk_token_type { typedef enum mrk_token_type {
mrk_token_type_none = 0, mrk_token_type_none = 0,
mrk_token_type_pounds, mrk_token_type_backtick,
mrk_token_type_backticks, mrk_token_type_triple_backtick,
mrk_token_type_dashes, mrk_token_type_dash,
mrk_token_type_underscores, mrk_token_type_underscore,
mrk_token_type_stars, mrk_token_type_double_underscore,
mrk_token_type_star,
mrk_token_type_double_star,
mrk_token_type_equals, mrk_token_type_equals,
mrk_token_type_blank_line,
mrk_token_type_newline, mrk_token_type_newline,
mrk_token_type_spaces,
mrk_token_type_line_break, mrk_token_type_line_break,
mrk_token_type_right_angle_brackets, mrk_token_type_right_angle_brackets,
mrk_token_type_tabs,
mrk_token_type_left_bracket, mrk_token_type_left_bracket,
mrk_token_type_right_bracket, mrk_token_type_right_bracket,
mrk_token_type_bang,
mrk_token_type_left_paren, mrk_token_type_left_paren,
mrk_token_type_right_paren, mrk_token_type_right_paren,
mrk_token_type_bang,
mrk_token_type_backslash, mrk_token_type_backslash,
mrk_token_type_dotted_number,
mrk_token_type_word,
mrk_token_type_checked_box,
mrk_token_type_unchecked_box,
mrk_token_type_text, mrk_token_type_text,
mrk_token_type_header_start,
mrk_token_type_horizontal_rule,
mrk_token_type_indent,
mrk_token_type_list_item_unordered,
mrk_token_type_list_item_ordered,
mrk_token_type_list_item_checked,
mrk_token_type_list_item_unchecked,
} mrk_token_type; } mrk_token_type;
typedef struct mrk_token { typedef struct mrk_token {

View File

@ -18,6 +18,7 @@ struct mrk_lexer {
size_t end; size_t end;
bool emitted; bool emitted;
} token; } token;
mrk_token_type last_emitted;
}; };
/** /**

View File

@ -26,6 +26,7 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
lexer->token.start = 0; lexer->token.start = 0;
lexer->token.end = 0; lexer->token.end = 0;
lexer->token.emitted = false; lexer->token.emitted = false;
lexer->last_emitted = mrk_token_type_none;
} }
bool mrk_lexer_done(const mrk_lexer *lexer) { bool mrk_lexer_done(const mrk_lexer *lexer) {
@ -42,7 +43,7 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
// A newline is still part of the previous line, so if the last character was // A newline is still part of the previous line, so if the last character was
// a newline, we now go to the next line // a newline, we now go to the next line
if (lexer->buf.s[lexer->pos.buf_index] == '\0') { if (c == '\n') {
lexer->pos.line++; lexer->pos.line++;
lexer->pos.line_index = 0; lexer->pos.line_index = 0;
} else { } else {
@ -127,58 +128,96 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
out->end = lexer->token.end; out->end = lexer->token.end;
lexer->token.emitted = true; lexer->token.emitted = true;
lexer->last_emitted = type;
} }
void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) { /* void mrk_lexer_advance_text(mrk_lexer *lexer) { */
/* const char */
/* /1* while (!mrk_lexer_done(lexer)) { *1/ */
/* /1* /2* switch () *2/ *1/ */
/* /1* } *1/ */
/* } */
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
char c = mrk_lexer_advance(lexer); char c = mrk_lexer_advance(lexer);
switch (c) { switch (c) {
// All these characters have multiple meanings depending on their location // Headers
// in the file and how many there are, so the lexer can only match them as
// one or more grouped characters
case '#': case '#':
mrk_lexer_advance_eq(lexer, c); mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
break; if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
case '`': mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
mrk_lexer_advance_eq(lexer, c); } else {
mrk_lexer_emit(out, lexer, mrk_token_type_backticks); // TODO match rest of text and emit
}
break; break;
case '-': case '-':
if (mrk_lexer_peek(lexer) == ' ') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
} else {
mrk_lexer_advance_eq(lexer, c); mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
if (lexer->token.end - lexer->token.start >= MRK_MIN_HORIZ_RULE_LEN &&
mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
} else {
// TODO match rest of text and emit
}
}
break;
case '+':
if (mrk_lexer_peek(lexer) == ' ') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
} else {
// TODO match rest of text and emit
}
break; break;
case '_': case '_':
mrk_lexer_advance_eq(lexer, c); case '*': {
mrk_lexer_emit(out, lexer, mrk_token_type_underscores); if (mrk_lexer_peek(lexer) == ' ') {
break; mrk_lexer_advance(lexer);
case '*': mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
mrk_lexer_advance_eq(lexer, c); } else {
mrk_lexer_emit(out, lexer, mrk_token_type_stars); // We first check if the entire line consists of stars; otherwise, we
break; // match it as a regular single or double star
case '=': size_t i = 0;
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_equals); while (mrk_lexer_peek_n(lexer, i) == c) {
break; i++;
case '\t': }
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_tabs); if (mrk_lexer_peek_n(lexer, i) == '\n' &&
break; (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) {
mrk_lexer_advance_n(lexer, i + 1);
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
} else if (mrk_lexer_peek(lexer) == c) {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer,
c == '_' ? mrk_token_type_double_underscore
: mrk_token_type_double_star);
} else {
mrk_lexer_emit(out, lexer,
c == '_' ? mrk_token_type_underscore
: mrk_token_type_star);
}
}
} break;
case '>': case '>':
mrk_lexer_advance_eq(lexer, c); mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets); mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
break; break;
case '!':
mrk_lexer_emit(out, lexer, mrk_token_type_bang);
break;
case '[': case '[':
// Checkboxes for lists are lexed separately to simplify the parser later // Checkboxes for lists are lexed separately to simplify the parser later
// on // on
if (mrk_lexer_peek_str(lexer, " ]")) { if (mrk_lexer_peek_str(lexer, " ]")) {
mrk_lexer_advance_n(lexer, 2); mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked);
} else if (mrk_lexer_peek_str(lexer, "x]")) { } else if (mrk_lexer_peek_str(lexer, "x]")) {
mrk_lexer_advance_n(lexer, 2); mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_checked_box); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked);
} else { } else {
mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
} }
@ -193,190 +232,127 @@ void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
break; break;
case '\\': case '\\':
// TODO better handle escaped elements
if (mrk_lexer_peek(lexer) == '\n') { if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_emit(out, lexer, mrk_token_type_line_break); mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else { } else {
mrk_lexer_emit(out, lexer, mrk_token_type_backslash); mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
} }
break; break;
// Two consecutive newlines constitute a blank line, otherwise they're
// ignored as whitespace
case '\n':
if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
} else {
mrk_lexer_emit(out, lexer, mrk_token_type_newline);
}
break;
case ' ': { case ' ': {
// Indents consist of four spaces
if (mrk_lexer_peek_str(lexer, " ")) {
mrk_lexer_advance_n(lexer, 3);
mrk_lexer_emit(out, lexer, mrk_token_type_indent);
}
// Either a double space or a line break // Either a double space or a line break
if (mrk_lexer_peek_str(lexer, " \n")) { else if (mrk_lexer_peek_str(lexer, " \n")) {
mrk_lexer_advance_n(lexer, 2); mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break); mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else { } else {
mrk_lexer_advance_eq(lexer, ' '); /* mrk_lexer_advance_eq(lexer, ' '); */
mrk_lexer_emit(out, lexer, mrk_token_type_spaces); /* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */
// TODO match rest of text and emit
} }
} break; } break;
case '\n':
mrk_lexer_emit(out, lexer, mrk_token_type_newline);
break;
case '\t':
mrk_lexer_emit(out, lexer, mrk_token_type_indent);
break;
default: { default: {
// Match ordered list headers // Match ordered list headers
if (isdigit(c)) { if (isdigit(c)) {
mrk_lexer_advance(lexer);
while (isdigit(mrk_lexer_peek(lexer))) { while (isdigit(mrk_lexer_peek(lexer))) {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
} }
if (mrk_lexer_peek(lexer) == '.') { if (mrk_lexer_peek(lexer) == '.') {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
} }
// Doesn't end with a dot, so it's just a word that happens to start // Doesn't end with a dot, so it's just a word that happens to start
// with a number // with a number
else { else {
mrk_lexer_advance_word(lexer); // TODO lex text and emit
mrk_lexer_emit(out, lexer, mrk_token_type_word);
} }
} }
// Any other special scenarios we simply parse as a word // Any other special scenarios we simply parse as a word
else { else {
mrk_lexer_advance_word(lexer); // TODO lex text and emit
mrk_lexer_emit(out, lexer, mrk_token_type_word);
} }
} break; } break;
} }
} }
/**
 * Lex one token that appears after the start of a line.
 *
 * Consumes a single character with mrk_lexer_advance() and, for the inline
 * markers handled below, emits the matching token through mrk_lexer_emit().
 * Inputs that are not handled yet (a space that does not begin a line break,
 * and any other character) are consumed without emitting a token; see the
 * TODOs.
 *
 * @param out token handed to mrk_lexer_emit() when a token is recognised
 * @param lexer lexer whose position is advanced
 */
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
  char c = mrk_lexer_advance(lexer);

  switch (c) {
  // Emphasis markers: a repeated marker is emitted as one "double" token
  case '*':
  case '_':
    if (mrk_lexer_peek(lexer) == c) {
      mrk_lexer_advance(lexer);
      mrk_lexer_emit(out, lexer,
                     c == '_' ? mrk_token_type_double_underscore
                              : mrk_token_type_double_star);
    } else {
      mrk_lexer_emit(out, lexer,
                     c == '_' ? mrk_token_type_underscore
                              : mrk_token_type_star);
    }
    break;
  case '[':
    mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
    break;
  case ']':
    mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
    break;
  case '(':
    mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
    break;
  case ')':
    mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
    break;
  case '\\':
    // TODO better handle escaped characters
    mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
    break;
  case '\n':
    mrk_lexer_emit(out, lexer, mrk_token_type_newline);
    break;
  case '!':
    mrk_lexer_emit(out, lexer, mrk_token_type_bang);
    // Without this break, control fell through into the ' ' case, which could
    // consume two more characters and emit a line break over the bang token.
    break;
  case ' ': {
    // The consumed space followed by two spaces and a newline is a line break
    // (presumably mrk_lexer_peek_str() compares the upcoming characters
    // against the given string -- confirm against its definition)
    if (mrk_lexer_peek_str(lexer, "  \n")) {
      mrk_lexer_advance_n(lexer, 2);
      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
    } else {
      // TODO match rest of text and emit
    }
  } break;
  default:
    // TODO lex text and emit
    break;
  }
}
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) { mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
if (mrk_lexer_done(lexer)) { if (mrk_lexer_done(lexer)) {
return mrk_lexer_err_done; return mrk_lexer_err_done;
} }
mrk_lexer_reset(lexer); if (lexer->pos.line_index == 0 ||
lexer->last_emitted == mrk_token_type_indent) {
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) { mrk_lexer_lex_start_of_line(out, lexer);
char c = mrk_lexer_advance(lexer);
switch (c) {
// All these characters have multiple meanings depending on their location
// in the file and how many there are, so the lexer can only match them as
// one or more grouped characters
case '#':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
break;
case '`':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
break;
case '-':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
break;
case '_':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
break;
case '*':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_stars);
break;
case '=':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_equals);
break;
case '\t':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_tabs);
break;
case '>':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
break;
case '!':
mrk_lexer_emit(out, lexer, mrk_token_type_bang);
break;
case '[':
// Checkboxes for lists are lexed separately to simplify the parser later
// on
if (mrk_lexer_peek_str(lexer, " ]")) {
mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box);
} else if (mrk_lexer_peek_str(lexer, "x]")) {
mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_checked_box);
} else { } else {
mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); mrk_lexer_lex_middle_of_line(out, lexer);
}
break;
case ']':
mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
break;
case '(':
mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
break;
case ')':
mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
break;
case '\\':
if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
}
break;
// Two consecutive newlines constitute a blank line, otherwise they're
// ignored as whitespace
case '\n':
if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
} else {
mrk_lexer_emit(out, lexer, mrk_token_type_newline);
}
break;
case ' ': {
// Either a double space or a line break
if (mrk_lexer_peek_str(lexer, " \n")) {
mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
mrk_lexer_advance_eq(lexer, ' ');
mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
}
} break;
default: {
// Match ordered list headers
if (isdigit(c)) {
mrk_lexer_advance(lexer);
while (isdigit(mrk_lexer_peek(lexer))) {
mrk_lexer_advance(lexer);
} }
if (mrk_lexer_peek(lexer) == '.') { return mrk_lexer_err_ok;
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number);
}
// Doesn't end with a dot, so it's just a word that happens to start
// with a number
else {
mrk_lexer_advance_word(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_word);
}
}
// Any other special scenarios we simply parse as a word
else {
mrk_lexer_advance_word(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_word);
}
} break;
}
}
return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
} }
size_t mrk_token_len(mrk_token t) { return t.end - t.start; } size_t mrk_token_len(mrk_token t) { return t.end - t.start; }

View File

@ -31,12 +31,12 @@ mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser) {
mrk_err (*parse_fn)(mrk_ast_node **, mrk_parser *) = NULL; mrk_err (*parse_fn)(mrk_ast_node **, mrk_parser *) = NULL;
switch (t.type) { /* switch (t.type) { */
case mrk_token_type_pounds: { /* case mrk_token_type_pounds: { */
parse_fn = mrk_parser_parse_header; /* parse_fn = mrk_parser_parse_header; */
break; /* break; */
} /* } */
} /* } */
if (parse_fn == NULL) { if (parse_fn == NULL) {
MRK_PARSE_ERR(parser, t, "Unexpected token."); MRK_PARSE_ERR(parser, t, "Unexpected token.");
@ -61,10 +61,12 @@ mrk_err mrk_parser_parse_header(mrk_ast_node **out, mrk_parser *parser) {
header->args[0].num = mrk_token_len(t); header->args[0].num = mrk_token_len(t);
// Headers are blocks of their own, so they're delimited by blank lines // Headers are blocks of their own, so they're delimited by blank lines
while (!mrk_parser_done(parser) && /* while (!mrk_parser_done(parser) && */
(t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) { /* (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) {
switch (t.type) { /* case */ } */
} /* switch (t.type) { /1* case *1/ */
/* } */
/* } */
// Skip blank line // Skip blank line
mrk_parser_advance(parser); mrk_parser_advance(parser);

View File

@ -14,7 +14,7 @@ void test_lexer_header() {
mrk_token t; mrk_token t;
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_pounds); TEST_CHECK(t.type == mrk_token_type_header_start);
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start); TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
TEST_CHECK(t.end == 4); TEST_CHECK(t.end == 4);
@ -37,7 +37,7 @@ void test_lexer_line_break() {
mrk_lexer_open(lxr, buf2, 0); mrk_lexer_open(lxr, buf2, 0);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_spaces); TEST_CHECK(t.type == mrk_token_type_text);
TEST_CHECK(mrk_lexer_done(lxr)); TEST_CHECK(mrk_lexer_done(lxr));
@ -52,17 +52,15 @@ void test_lexer_simple1() {
mrk_token t; mrk_token t;
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_pounds); TEST_CHECK(t.type == mrk_token_type_header_start);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_spaces); TEST_CHECK(t.type == mrk_token_type_text);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_word); TEST_CHECK(t.type == mrk_token_type_newline);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_spaces); TEST_CHECK(t.type == mrk_token_type_newline);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_word); TEST_CHECK(t.type == mrk_token_type_text);
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_blank_line);
} }
TEST_LIST = { TEST_LIST = {