feat(lexer): parse sequences of regular text

main
Jef Roosens 2024-03-11 11:11:19 +01:00
parent f5b3235455
commit 6dba1a8291
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
3 changed files with 48 additions and 19 deletions

View File

@ -20,12 +20,10 @@ typedef enum mrk_token_type {
mrk_token_type_none = 0,
mrk_token_type_backtick,
mrk_token_type_triple_backtick,
mrk_token_type_dash,
mrk_token_type_underscore,
mrk_token_type_double_underscore,
mrk_token_type_star,
mrk_token_type_double_star,
mrk_token_type_equals,
mrk_token_type_newline,
mrk_token_type_line_break,
mrk_token_type_right_angle_brackets,

View File

@ -78,4 +78,21 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
* special character.
*/
bool mrk_is_special_char(char c);
/**
 * Lex a token at the start of a line (including after 1 or more indents).
 *
 * @param out token handed to mrk_lexer_emit for the lexed result
 * @param lexer lexer to read from; at least one character is consumed
 */
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer);
/**
 * Lex a token that starts in the middle of a line.
 *
 * @param out token handed to mrk_lexer_emit for the lexed result
 * @param lexer lexer to read from
 */
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer);
/**
 * Advance the lexer until it encounters a character that's always parsed as a
 * separate token, regardless of context (e.g. newlines, brackets), or until
 * the lexer is done. The character that stops the scan is not consumed.
 *
 * @param lexer lexer to advance
 */
void mrk_lexer_advance_text(mrk_lexer *lexer);
#endif

View File

@ -1,5 +1,6 @@
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include "mrk/lexer.h"
#include "mrk/lexer_internal.h"
@ -131,12 +132,17 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
lexer->last_emitted = type;
}
/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */
/* const char */
/* /1* while (!mrk_lexer_done(lexer)) { *1/ */
/* /1* /2* switch () *2/ *1/ */
/* /1* } *1/ */
/* } */
/*
 * Skip over a run of regular text.
 *
 * Characters are consumed one at a time until the lexer is done or the next
 * character belongs to the stop set: '*', newline, '[', ']', '(', ')' or
 * backslash. The stopping character is only peeked at, never consumed.
 *
 * strchr() also matches the set's terminating NUL, so a peeked '\0' ends the
 * run as well.
 */
void mrk_lexer_advance_text(mrk_lexer *lexer) {
  static const char stop_set[] = "*\n[]()\\";

  for (;;) {
    if (mrk_lexer_done(lexer)) {
      return;
    }

    if (strchr(stop_set, mrk_lexer_peek(lexer)) != NULL) {
      return;
    }

    mrk_lexer_advance(lexer);
  }
}
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
char c = mrk_lexer_advance(lexer);
@ -149,7 +155,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
} else {
// TODO match rest of text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
break;
case '-':
@ -163,7 +170,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
} else {
// TODO match rest of text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
}
break;
@ -172,7 +180,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
} else {
// TODO match rest of text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
break;
case '_':
@ -251,9 +260,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
/* mrk_lexer_advance_eq(lexer, ' '); */
/* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */
// TODO match rest of text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
} break;
case '\n':
@ -269,19 +277,22 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_advance(lexer);
}
if (mrk_lexer_peek(lexer) == '.') {
// Ordered list item numbers should be followed by a dot and then a space
if (mrk_lexer_peek_str(lexer, ". ")) {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
}
// Doesn't end with a dot, so it's just a word that happens to start
// with a number
else {
// TODO lex text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
}
// Any other special scenarios we simply parse as a word
else {
// TODO lex text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
} break;
}
@ -325,17 +336,20 @@ void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
break;
case '!':
mrk_lexer_emit(out, lexer, mrk_token_type_bang);
break;
case ' ': {
if (mrk_lexer_peek_str(lexer, " \n")) {
mrk_lexer_advance_n(lexer, 2);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
// TODO match rest of text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
}
} break;
default:
// TODO lex text and emit
mrk_lexer_advance_text(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_text);
break;
}
}