feat(lexer): restructure lexer for hopefully better results
parent 8c0105639f
commit f5b3235455
@@ -5,6 +5,9 @@
 
 #include "mrk/common.h"
 
+#define MRK_MAX_HEADER_LEN 6
+#define MRK_MIN_HORIZ_RULE_LEN 3
+
 typedef struct mrk_lexer mrk_lexer;
 
 typedef enum mrk_lexer_err {
@@ -15,29 +18,31 @@ typedef enum mrk_lexer_err {
 
 typedef enum mrk_token_type {
   mrk_token_type_none = 0,
-  mrk_token_type_pounds,
-  mrk_token_type_backticks,
-  mrk_token_type_dashes,
-  mrk_token_type_underscores,
-  mrk_token_type_stars,
+  mrk_token_type_backtick,
+  mrk_token_type_triple_backtick,
+  mrk_token_type_dash,
+  mrk_token_type_underscore,
+  mrk_token_type_double_underscore,
+  mrk_token_type_star,
+  mrk_token_type_double_star,
   mrk_token_type_equals,
-  mrk_token_type_blank_line,
   mrk_token_type_newline,
-  mrk_token_type_spaces,
   mrk_token_type_line_break,
   mrk_token_type_right_angle_brackets,
-  mrk_token_type_tabs,
   mrk_token_type_left_bracket,
   mrk_token_type_right_bracket,
-  mrk_token_type_bang,
   mrk_token_type_left_paren,
   mrk_token_type_right_paren,
+  mrk_token_type_bang,
   mrk_token_type_backslash,
-  mrk_token_type_dotted_number,
-  mrk_token_type_word,
-  mrk_token_type_checked_box,
-  mrk_token_type_unchecked_box,
   mrk_token_type_text,
+  mrk_token_type_header_start,
+  mrk_token_type_horizontal_rule,
+  mrk_token_type_indent,
+  mrk_token_type_list_item_unordered,
+  mrk_token_type_list_item_ordered,
+  mrk_token_type_list_item_checked,
+  mrk_token_type_list_item_unchecked,
 } mrk_token_type;
 
 typedef struct mrk_token {
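The token enum above now names tokens by their Markdown role instead of by raw character runs. As a quick reference while reading the hunks below, here is a hypothetical debug helper (not part of this commit; it only uses the type names and semantics introduced in this diff) that spells out what each new line-level token stands for:

/* Hypothetical helper, not part of this commit: maps the new semantic token
 * types to printable names for debugging. Assumes mrk_token_type is visible
 * from the project's lexer header. */
static const char *mrk_token_type_name(mrk_token_type type) {
  switch (type) {
  case mrk_token_type_header_start:        return "header_start";         /* run of '#', at most MRK_MAX_HEADER_LEN long */
  case mrk_token_type_horizontal_rule:     return "horizontal_rule";      /* full line of '-', '_' or '*' */
  case mrk_token_type_indent:              return "indent";               /* a tab, or four leading spaces */
  case mrk_token_type_list_item_unordered: return "list_item_unordered";  /* "- ", "+ " or "* " at line start */
  case mrk_token_type_list_item_ordered:   return "list_item_ordered";    /* digits followed by '.' */
  case mrk_token_type_list_item_checked:   return "list_item_checked";    /* "[x]" */
  case mrk_token_type_list_item_unchecked: return "list_item_unchecked";  /* "[ ]" */
  default:                                 return "other";
  }
}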
@@ -18,6 +18,7 @@ struct mrk_lexer {
     size_t end;
     bool emitted;
   } token;
+  mrk_token_type last_emitted;
 };
 
 /**
@@ -26,6 +26,7 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
   lexer->token.start = 0;
   lexer->token.end = 0;
   lexer->token.emitted = false;
+  lexer->last_emitted = mrk_token_type_none;
 }
 
 bool mrk_lexer_done(const mrk_lexer *lexer) {
@@ -42,7 +43,7 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
 
   // A newline is still part of the previous line, so if the last character was
   // a newline, we now go to the next line
-  if (lexer->buf.s[lexer->pos.buf_index] == '\0') {
+  if (c == '\n') {
     lexer->pos.line++;
     lexer->pos.line_index = 0;
   } else {
@@ -127,58 +128,96 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
   out->end = lexer->token.end;
 
   lexer->token.emitted = true;
+  lexer->last_emitted = type;
 }
 
-void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) {
+/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */
+/*   const char */
+/*   /1* while (!mrk_lexer_done(lexer)) { *1/ */
+/*   /1* /2* switch () *2/ *1/ */
+/*   /1* } *1/ */
+/* } */
+
+void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
   char c = mrk_lexer_advance(lexer);
 
   switch (c) {
-  // All these characters have multiple meanings depending on their location
-  // in the file and how many there are, so the lexer can only match them as
-  // one or more grouped characters
+  // Headers
   case '#':
     mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
-    break;
-  case '`':
-    mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
+    if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
+      mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
+    } else {
+      // TODO match rest of text and emit
+    }
     break;
   case '-':
+    if (mrk_lexer_peek(lexer) == ' ') {
+      mrk_lexer_advance(lexer);
+      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
+    } else {
       mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
+      if (lexer->token.end - lexer->token.start >= MRK_MIN_HORIZ_RULE_LEN &&
+          mrk_lexer_peek(lexer) == '\n') {
+        mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
+      } else {
+        // TODO match rest of text and emit
+      }
+    }
+    break;
+  case '+':
+    if (mrk_lexer_peek(lexer) == ' ') {
+      mrk_lexer_advance(lexer);
+      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
+    } else {
+      // TODO match rest of text and emit
+    }
     break;
   case '_':
-    mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
-    break;
-  case '*':
-    mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_stars);
-    break;
-  case '=':
-    mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_equals);
-    break;
-  case '\t':
-    mrk_lexer_advance_eq(lexer, c);
-    mrk_lexer_emit(out, lexer, mrk_token_type_tabs);
-    break;
+  case '*': {
+    if (mrk_lexer_peek(lexer) == ' ') {
+      mrk_lexer_advance(lexer);
+      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
+    } else {
+      // We first check if the entire line consists of stars; otherwise, we
+      // match it as a regular single or double star
+      size_t i = 0;
+
+      while (mrk_lexer_peek_n(lexer, i) == c) {
+        i++;
+      }
+
+      if (mrk_lexer_peek_n(lexer, i) == '\n' &&
+          (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) {
+        mrk_lexer_advance_n(lexer, i + 1);
+        mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
+      } else if (mrk_lexer_peek(lexer) == c) {
+        mrk_lexer_advance(lexer);
+        mrk_lexer_emit(out, lexer,
+                       c == '_' ? mrk_token_type_double_underscore
+                                : mrk_token_type_double_star);
+      } else {
+        mrk_lexer_emit(out, lexer,
+                       c == '_' ? mrk_token_type_underscore
+                                : mrk_token_type_star);
+      }
+    }
+  } break;
   case '>':
     mrk_lexer_advance_eq(lexer, c);
     mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
     break;
-  case '!':
-    mrk_lexer_emit(out, lexer, mrk_token_type_bang);
-    break;
   case '[':
     // Checkboxes for lists are lexed separately to simplify the parser later
    // on
     if (mrk_lexer_peek_str(lexer, " ]")) {
       mrk_lexer_advance_n(lexer, 2);
-      mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box);
+      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked);
     } else if (mrk_lexer_peek_str(lexer, "x]")) {
       mrk_lexer_advance_n(lexer, 2);
-      mrk_lexer_emit(out, lexer, mrk_token_type_checked_box);
+      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked);
     } else {
       mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
     }
@@ -193,190 +232,127 @@ void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) {
     mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
     break;
   case '\\':
+    // TODO better handle escaped elements
     if (mrk_lexer_peek(lexer) == '\n') {
       mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
     } else {
       mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
     }
     break;
-  // Two consecutive newlines constitute a blank line, otherwise they're
-  // ignored as whitespace
-  case '\n':
-    if (mrk_lexer_peek(lexer) == '\n') {
-      mrk_lexer_advance(lexer);
-      mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
-    } else {
-      mrk_lexer_emit(out, lexer, mrk_token_type_newline);
-    }
-    break;
   case ' ': {
+    // Indents consist of four spaces
+    if (mrk_lexer_peek_str(lexer, "   ")) {
+      mrk_lexer_advance_n(lexer, 3);
+      mrk_lexer_emit(out, lexer, mrk_token_type_indent);
+    }
     // Either a double space or a line break
-    if (mrk_lexer_peek_str(lexer, " \n")) {
+    else if (mrk_lexer_peek_str(lexer, " \n")) {
       mrk_lexer_advance_n(lexer, 2);
 
       mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
     } else {
-      mrk_lexer_advance_eq(lexer, ' ');
-      mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
+      /* mrk_lexer_advance_eq(lexer, ' '); */
+      /* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */
+      // TODO match rest of text and emit
     }
   } break;
+  case '\n':
+    mrk_lexer_emit(out, lexer, mrk_token_type_newline);
+    break;
+  case '\t':
+    mrk_lexer_emit(out, lexer, mrk_token_type_indent);
+    break;
   default: {
     // Match ordered list headers
     if (isdigit(c)) {
-      mrk_lexer_advance(lexer);
 
       while (isdigit(mrk_lexer_peek(lexer))) {
         mrk_lexer_advance(lexer);
       }
 
       if (mrk_lexer_peek(lexer) == '.') {
         mrk_lexer_advance(lexer);
-        mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number);
+        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
       }
       // Doesn't end with a dot, so it's just a word that happens to start
       // with a number
       else {
-        mrk_lexer_advance_word(lexer);
-        mrk_lexer_emit(out, lexer, mrk_token_type_word);
+        // TODO lex text and emit
       }
     }
     // Any other special scenarios we simply parse as a word
     else {
-      mrk_lexer_advance_word(lexer);
-      mrk_lexer_emit(out, lexer, mrk_token_type_word);
+      // TODO lex text and emit
     }
   } break;
   }
 }
 
+void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
+  char c = mrk_lexer_advance(lexer);
+
+  switch (c) {
+  case '*':
+  case '_':
+    if (mrk_lexer_peek(lexer) == c) {
+      mrk_lexer_advance(lexer);
+      mrk_lexer_emit(out, lexer,
+                     c == '_' ? mrk_token_type_double_underscore
+                              : mrk_token_type_double_star);
+    } else {
+      mrk_lexer_emit(out, lexer,
+                     c == '_' ? mrk_token_type_underscore
+                              : mrk_token_type_star);
+    }
+    break;
+  case '[':
+    mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
+    break;
+  case ']':
+    mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
+    break;
+  case '(':
+    mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
+    break;
+  case ')':
+    mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
+    break;
+  case '\\':
+    // TODO better handle escaped characters
+    mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
+    break;
+  case '\n':
+    mrk_lexer_emit(out, lexer, mrk_token_type_newline);
+    break;
+  case '!':
+    mrk_lexer_emit(out, lexer, mrk_token_type_bang);
+  case ' ': {
+    if (mrk_lexer_peek_str(lexer, " \n")) {
+      mrk_lexer_advance_n(lexer, 2);
+
+      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
+    } else {
+      // TODO match rest of text and emit
+    }
+  } break;
+  default:
+    // TODO lex text and emit
+    break;
+  }
+}
+
 mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
   if (mrk_lexer_done(lexer)) {
     return mrk_lexer_err_done;
   }
 
-  mrk_lexer_reset(lexer);
-  while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
-    char c = mrk_lexer_advance(lexer);
-    switch (c) {
-    // All these characters have multiple meanings depending on their location
-    // in the file and how many there are, so the lexer can only match them as
-    // one or more grouped characters
-    case '#':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
-      break;
-    case '`':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
-      break;
-    case '-':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
-      break;
-    case '_':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
-      break;
-    case '*':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_stars);
-      break;
-    case '=':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_equals);
-      break;
-    case '\t':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_tabs);
-      break;
-    case '>':
-      mrk_lexer_advance_eq(lexer, c);
-      mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
-      break;
-    case '!':
-      mrk_lexer_emit(out, lexer, mrk_token_type_bang);
-      break;
-    case '[':
-      // Checkboxes for lists are lexed separately to simplify the parser later
-      // on
-      if (mrk_lexer_peek_str(lexer, " ]")) {
-        mrk_lexer_advance_n(lexer, 2);
-        mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box);
-      } else if (mrk_lexer_peek_str(lexer, "x]")) {
-        mrk_lexer_advance_n(lexer, 2);
-        mrk_lexer_emit(out, lexer, mrk_token_type_checked_box);
+  if (lexer->pos.line_index == 0 ||
+      lexer->last_emitted == mrk_token_type_indent) {
+    mrk_lexer_lex_start_of_line(out, lexer);
   } else {
-        mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
+    mrk_lexer_lex_middle_of_line(out, lexer);
-      }
-      break;
-    case ']':
-      mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
-      break;
-    case '(':
-      mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
-      break;
-    case ')':
-      mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
-      break;
-    case '\\':
-      if (mrk_lexer_peek(lexer) == '\n') {
-        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
-      } else {
-        mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
-      }
-      break;
-    // Two consecutive newlines constitute a blank line, otherwise they're
-    // ignored as whitespace
-    case '\n':
-      if (mrk_lexer_peek(lexer) == '\n') {
-        mrk_lexer_advance(lexer);
-        mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
-      } else {
-        mrk_lexer_emit(out, lexer, mrk_token_type_newline);
-      }
-      break;
-    case ' ': {
-      // Either a double space or a line break
-      if (mrk_lexer_peek_str(lexer, " \n")) {
-        mrk_lexer_advance_n(lexer, 2);
-
-        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
-      } else {
-        mrk_lexer_advance_eq(lexer, ' ');
-        mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
-      }
-    } break;
-    default: {
-      // Match ordered list headers
-      if (isdigit(c)) {
-        mrk_lexer_advance(lexer);
-
-        while (isdigit(mrk_lexer_peek(lexer))) {
-          mrk_lexer_advance(lexer);
   }
 
-        if (mrk_lexer_peek(lexer) == '.') {
+  return mrk_lexer_err_ok;
-          mrk_lexer_advance(lexer);
-          mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number);
-        }
-        // Doesn't end with a dot, so it's just a word that happens to start
-        // with a number
-        else {
-          mrk_lexer_advance_word(lexer);
-          mrk_lexer_emit(out, lexer, mrk_token_type_word);
-        }
-      }
-      // Any other special scenarios we simply parse as a word
-      else {
-        mrk_lexer_advance_word(lexer);
-        mrk_lexer_emit(out, lexer, mrk_token_type_word);
-      }
-    } break;
-    }
-  }
-
-  return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
 }
 
 size_t mrk_token_len(mrk_token t) { return t.end - t.start; }
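With the hunk above, mrk_lexer_next() only dispatches between the start-of-line and middle-of-line lexers and keeps returning mrk_lexer_err_ok until the buffer is exhausted. A minimal caller sketch, assuming the lexer was already created and opened the way the tests further down do; the include path is a guess, since this diff only ever shows "mrk/common.h":

#include <stdio.h>
#include "mrk/lexer.h" /* assumed header path for mrk_lexer / mrk_token */

/* Drain every token from an already-opened lexer and print its type and span.
 * mrk_lexer_next() returns mrk_lexer_err_ok until the buffer is done, then
 * mrk_lexer_err_done, so the loop terminates on its own. */
static void mrk_dump_tokens(mrk_lexer *lxr) {
  mrk_token t;
  while (mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok) {
    printf("type=%d span=[%zu, %zu)\n", (int)t.type, t.start, t.end);
  }
}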
@@ -31,12 +31,12 @@ mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser) {
 
   mrk_err (*parse_fn)(mrk_ast_node **, mrk_parser *) = NULL;
 
-  switch (t.type) {
-  case mrk_token_type_pounds: {
-    parse_fn = mrk_parser_parse_header;
-    break;
-  }
-  }
+  /* switch (t.type) { */
+  /* case mrk_token_type_pounds: { */
+  /*   parse_fn = mrk_parser_parse_header; */
+  /*   break; */
+  /* } */
+  /* } */
 
   if (parse_fn == NULL) {
     MRK_PARSE_ERR(parser, t, "Unexpected token.");
@@ -61,10 +61,12 @@ mrk_err mrk_parser_parse_header(mrk_ast_node **out, mrk_parser *parser) {
   header->args[0].num = mrk_token_len(t);
 
   // Headers are blocks of their own, so they're delimited by blank lines
-  while (!mrk_parser_done(parser) &&
-         (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) {
-    switch (t.type) { /* case */ }
-  }
+  /* while (!mrk_parser_done(parser) && */
+  /*        (t = mrk_parser_peek(parser)).type != mrk_token_type_blank_line) {
+   */
+  /* switch (t.type) { /1* case *1/ */
+  /* } */
+  /* } */
 
   // Skip blank line
   mrk_parser_advance(parser);
@@ -14,7 +14,7 @@ void test_lexer_header() {
 
   mrk_token t;
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_pounds);
+  TEST_CHECK(t.type == mrk_token_type_header_start);
   TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
   TEST_CHECK(t.end == 4);
 
@@ -37,7 +37,7 @@ void test_lexer_line_break() {
   mrk_lexer_open(lxr, buf2, 0);
 
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_spaces);
+  TEST_CHECK(t.type == mrk_token_type_text);
 
   TEST_CHECK(mrk_lexer_done(lxr));
 
@@ -52,17 +52,15 @@ void test_lexer_simple1() {
 
   mrk_token t;
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_pounds);
+  TEST_CHECK(t.type == mrk_token_type_header_start);
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_spaces);
+  TEST_CHECK(t.type == mrk_token_type_text);
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_word);
+  TEST_CHECK(t.type == mrk_token_type_newline);
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_spaces);
+  TEST_CHECK(t.type == mrk_token_type_newline);
   TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_word);
-  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
-  TEST_CHECK(t.type == mrk_token_type_blank_line);
+  TEST_CHECK(t.type == mrk_token_type_text);
 }
 
 TEST_LIST = {