feat(lexer): parse sequences of regular text
parent
f5b3235455
commit
6dba1a8291
|
@ -20,12 +20,10 @@ typedef enum mrk_token_type {
|
||||||
mrk_token_type_none = 0,
|
mrk_token_type_none = 0,
|
||||||
mrk_token_type_backtick,
|
mrk_token_type_backtick,
|
||||||
mrk_token_type_triple_backtick,
|
mrk_token_type_triple_backtick,
|
||||||
mrk_token_type_dash,
|
|
||||||
mrk_token_type_underscore,
|
mrk_token_type_underscore,
|
||||||
mrk_token_type_double_underscore,
|
mrk_token_type_double_underscore,
|
||||||
mrk_token_type_star,
|
mrk_token_type_star,
|
||||||
mrk_token_type_double_star,
|
mrk_token_type_double_star,
|
||||||
mrk_token_type_equals,
|
|
||||||
mrk_token_type_newline,
|
mrk_token_type_newline,
|
||||||
mrk_token_type_line_break,
|
mrk_token_type_line_break,
|
||||||
mrk_token_type_right_angle_brackets,
|
mrk_token_type_right_angle_brackets,
|
||||||
|
|
|
@ -78,4 +78,21 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
|
||||||
* special character.
|
* special character.
|
||||||
*/
|
*/
|
||||||
bool mrk_is_special_char(char c);
|
bool mrk_is_special_char(char c);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lex a token at the start of a line (including after 1 or more indents).
|
||||||
|
*/
|
||||||
|
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Lex a token that starts in the middle of a line.
|
||||||
|
*/
|
||||||
|
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the lexer until it encounters a character that's always parsed as a
|
||||||
|
* separate token, regardless of context (e.g. newlines, brackets).
|
||||||
|
*/
|
||||||
|
void mrk_lexer_advance_text(mrk_lexer *lexer);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
#include "mrk/lexer.h"
|
#include "mrk/lexer.h"
|
||||||
#include "mrk/lexer_internal.h"
|
#include "mrk/lexer_internal.h"
|
||||||
|
@ -131,12 +132,17 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
|
||||||
lexer->last_emitted = type;
|
lexer->last_emitted = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* void mrk_lexer_advance_text(mrk_lexer *lexer) { */
|
void mrk_lexer_advance_text(mrk_lexer *lexer) {
|
||||||
/* const char */
|
const char *special_chars = "*\n[]()\\";
|
||||||
/* /1* while (!mrk_lexer_done(lexer)) { *1/ */
|
|
||||||
/* /1* /2* switch () *2/ *1/ */
|
while (!mrk_lexer_done(lexer)) {
|
||||||
/* /1* } *1/ */
|
if (strchr(special_chars, mrk_lexer_peek(lexer)) == NULL) {
|
||||||
/* } */
|
mrk_lexer_advance(lexer);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
char c = mrk_lexer_advance(lexer);
|
char c = mrk_lexer_advance(lexer);
|
||||||
|
@ -149,7 +155,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
|
if (lexer->token.end - lexer->token.start <= MRK_MAX_HEADER_LEN) {
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
|
mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
|
||||||
} else {
|
} else {
|
||||||
// TODO match rest of text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '-':
|
case '-':
|
||||||
|
@ -163,7 +170,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
mrk_lexer_peek(lexer) == '\n') {
|
mrk_lexer_peek(lexer) == '\n') {
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
|
mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
|
||||||
} else {
|
} else {
|
||||||
// TODO match rest of text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -172,7 +180,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_advance(lexer);
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
|
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
|
||||||
} else {
|
} else {
|
||||||
// TODO match rest of text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '_':
|
case '_':
|
||||||
|
@ -251,9 +260,8 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
|
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||||
} else {
|
} else {
|
||||||
/* mrk_lexer_advance_eq(lexer, ' '); */
|
mrk_lexer_advance_text(lexer);
|
||||||
/* mrk_lexer_emit(out, lexer, mrk_token_type_spaces); */
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
// TODO match rest of text and emir
|
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case '\n':
|
case '\n':
|
||||||
|
@ -269,19 +277,22 @@ void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_advance(lexer);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mrk_lexer_peek(lexer) == '.') {
|
// Ordered list item numbers should be followed by a dot and then a space
|
||||||
|
if (mrk_lexer_peek_str(lexer, ". ")) {
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_advance(lexer);
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
|
mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
|
||||||
}
|
}
|
||||||
// Doesn't end with a dot, so it's just a word that happens to start
|
// Doesn't end with a dot, so it's just a word that happens to start
|
||||||
// with a number
|
// with a number
|
||||||
else {
|
else {
|
||||||
// TODO lex text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Any other special scenarios we simply parse as a word
|
// Any other special scenarios we simply parse as a word
|
||||||
else {
|
else {
|
||||||
// TODO lex text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
}
|
}
|
||||||
|
@ -325,17 +336,20 @@ void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
|
||||||
break;
|
break;
|
||||||
case '!':
|
case '!':
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_bang);
|
mrk_lexer_emit(out, lexer, mrk_token_type_bang);
|
||||||
|
break;
|
||||||
case ' ': {
|
case ' ': {
|
||||||
if (mrk_lexer_peek_str(lexer, " \n")) {
|
if (mrk_lexer_peek_str(lexer, " \n")) {
|
||||||
mrk_lexer_advance_n(lexer, 2);
|
mrk_lexer_advance_n(lexer, 2);
|
||||||
|
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||||
} else {
|
} else {
|
||||||
// TODO match rest of text and emir
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
// TODO lex text and emit
|
mrk_lexer_advance_text(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_text);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue