feat(lexer): parse sequences of regular text

main
Jef Roosens 2024-03-11 11:11:19 +01:00
parent f5b3235455
commit 6dba1a8291
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
3 changed files with 48 additions and 19 deletions

View File

@ -20,12 +20,10 @@ typedef enum mrk_token_type {
mrk_token_type_none = 0, mrk_token_type_none = 0,
mrk_token_type_backtick, mrk_token_type_backtick,
mrk_token_type_triple_backtick, mrk_token_type_triple_backtick,
mrk_token_type_dash,
mrk_token_type_underscore, mrk_token_type_underscore,
mrk_token_type_double_underscore, mrk_token_type_double_underscore,
mrk_token_type_star, mrk_token_type_star,
mrk_token_type_double_star, mrk_token_type_double_star,
mrk_token_type_equals,
mrk_token_type_newline, mrk_token_type_newline,
mrk_token_type_line_break, mrk_token_type_line_break,
mrk_token_type_right_angle_brackets, mrk_token_type_right_angle_brackets,

View File

@ -78,4 +78,21 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
* special character. * special character.
*/ */
bool mrk_is_special_char(char c); bool mrk_is_special_char(char c);
/**
* Lex a token at the start of a line (including after 1 or more indents).
*/
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer);
/**
* Lex a token that starts in the middle of a line.
*/
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer);
/**
* Advance the lexer until it encounters a character that's always parsed as a
* separate token, regardless of context (e.g. newlines, brackets).
*/
void mrk_lexer_advance_text(mrk_lexer *lexer);
#endif #endif

View File

@ -1,5 +1,6 @@
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <string.h>
#include "mrk/lexer.h" #include "mrk/lexer.h"
#include "mrk/lexer_internal.h" #include "mrk/lexer_internal.h"
@ -131,12 +132,17 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
lexer->last_emitted = type; lexer->last_emitted = type;
} }
/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */ void mrk_lexer_advance_text(mrk_lexer *lexer) {
/* const char */ const char *special_chars = "*\n[]()\\";
/* /1* while (!mrk_lexer_done(lexer)) { *1/ */
/* /1* /2* switch () *2/ *1/ */ while (!mrk_lexer_done(lexer)) {
/* /1* } *1/ */ if (strchr(special_chars, mrk_lexer_peek(lexer)) == NULL) {
/* } */ mrk_lexer_advance(lexer);
} else {
break;
}
}
}
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
char c = mrk_lexer_advance(lexer); char c = mrk_lexer_advance(lexer);
@ -149,7 +155,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) { if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
mrk_lexer_emit(out, lexer, mrk_token_type_header_start); mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
} else { } else {
// TODO match rest of text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
break; break;
case '-': case '-':
@ -163,7 +170,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_peek(lexer) == '\n') { mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
} else { } else {
// TODO match rest of text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
} }
break; break;
@ -172,7 +180,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
} else { } else {
// TODO match rest of text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
break; break;
case '_': case '_':
@ -251,9 +260,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_emit(out, lexer, mrk_token_type_line_break); mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else { } else {
/* mrk_lexer_advance_eq(lexer, ' '); */ mrk_lexer_advance_text(lexer);
/* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */ mrk_lexer_emit(out, lexer, mrk_token_type_text);
// TODO match rest of text and emir
} }
} break; } break;
case '\n': case '\n':
@ -269,19 +277,22 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
} }
if (mrk_lexer_peek(lexer) == '.') { // Ordered list item numbers should be followed by a dot and then a space
if (mrk_lexer_peek_str(lexer, ". ")) {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
} }
// Doesn't end with a dot, so it's just a word that happens to start // Doesn't end with a dot, so it's just a word that happens to start
// with a number // with a number
else { else {
// TODO lex text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
} }
// Any other special scenarios we simply parse as a word // Any other special scenarios we simply parse as a word
else { else {
// TODO lex text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
} break; } break;
} }
@ -325,17 +336,20 @@ void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
break; break;
case '!': case '!':
mrk_lexer_emit(out, lexer, mrk_token_type_bang); mrk_lexer_emit(out, lexer, mrk_token_type_bang);
break;
case ' ': { case ' ': {
if (mrk_lexer_peek_str(lexer, " \n")) { if (mrk_lexer_peek_str(lexer, " \n")) {
mrk_lexer_advance_n(lexer, 2); mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break); mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else { } else {
// TODO match rest of text and emir mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
} }
} break; } break;
default: default:
// TODO lex text and emit mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
break; break;
} }
} }