408 lines
11 KiB
C
408 lines
11 KiB
C
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#include "mrk/lexer.h"
|
|
#include "mrk/lexer_internal.h"
|
|
|
|
// Allocate a new lexer and hand it back through `out`.
//
// The allocation is delegated to the project macro MRK_CALLOC, whose
// definition is not visible in this file.
// NOTE(review): presumably it stores a zero-initialised allocation in *out and
// returns an error code from this function on failure -- confirm against the
// macro definition.
//
// Returns mrk_err_ok when execution reaches the end of the function. The lexer
// should later be released with mrk_lexer_free().
mrk_err mrk_lexer_init(mrk_lexer **out) {
  // `out` is a mrk_lexer **, so **out has exactly the size of one mrk_lexer
  MRK_CALLOC(out, 1, sizeof(**out));

  return mrk_err_ok;
}
|
|
|
|
// Release a lexer object.
//
// Only the lexer struct itself is freed; the input buffer handed to
// mrk_lexer_open() is never freed here. Passing NULL is allowed and does
// nothing, because free(NULL) is defined to be a no-op.
void mrk_lexer_free(mrk_lexer *lexer) {
  free(lexer);
}
|
|
|
|
// Point the lexer at a new input buffer and rewind all of its state.
//
// lexer: lexer to (re)initialise
// buf:   input text; the pointer is stored, the contents are not copied
// len:   length of the input. mrk_lexer_done() only applies the length bound
//        when len is greater than zero; with a length of zero the input is
//        scanned until a NUL byte.
void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
  // Attach the input
  lexer->buf.len = len;
  lexer->buf.s = buf;

  // Rewind the cursor to the very first character
  lexer->pos.buf_index = 0;
  lexer->pos.line_index = 0;
  lexer->pos.line = 0;

  // Start out with an empty token of type "none"
  lexer->cur_token.start_line_index = 0;
  lexer->cur_token.start_line = 0;
  lexer->cur_token.end = 0;
  lexer->cur_token.start = 0;
  lexer->cur_token.type = mrk_token_type_none;

  // Nothing has been emitted yet, so the last emitted token is that same
  // empty token
  lexer->last_emitted = lexer->cur_token;
}
|
|
|
|
bool mrk_lexer_done(const mrk_lexer *lexer) {
|
|
return (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) ||
|
|
(lexer->buf.s[lexer->pos.buf_index] == '\0');
|
|
}
|
|
|
|
// Consume one character of input and return it.
//
// Moves the buffer cursor and the end of the current token forward by one and
// keeps the line/column bookkeeping in sync. Returns '\0' without changing any
// state if the input is already exhausted.
char mrk_lexer_advance(mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return '\0';
  }

  const char consumed = lexer->buf.s[lexer->pos.buf_index];

  lexer->pos.buf_index++;
  lexer->cur_token.end++;

  // The newline itself still belongs to the line it terminates; only after
  // consuming it does the cursor sit at column 0 of the following line.
  if (consumed != '\n') {
    lexer->pos.line_index++;
  } else {
    lexer->pos.line++;
    lexer->pos.line_index = 0;
  }

  return consumed;
}
|
|
|
|
// Consume up to n characters and return the last one consumed.
//
// Each step goes through mrk_lexer_advance(), so the result is '\0' if the
// input runs out before or at the final step. Note that the final advance is
// unconditional: a call with n == 0 still consumes one character, exactly like
// n == 1.
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  // Discard the first n - 1 characters...
  for (; n > 1; n--) {
    mrk_lexer_advance(lexer);
  }

  // ...and report the final one
  return mrk_lexer_advance(lexer);
}
|
|
|
|
// Consume the run of consecutive characters equal to `c` that starts at the
// cursor. Nothing is consumed if the next character differs from `c`.
//
// mrk_lexer_peek() reports '\0' once the input is exhausted, and
// mrk_lexer_advance() makes no progress in that state. The loop therefore
// stops explicitly at end of input so that a call with c == '\0' terminates
// (consuming nothing) instead of spinning forever.
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  for (;;) {
    const char next = mrk_lexer_peek(lexer);

    if (next == '\0' || next != c) {
      break;
    }

    mrk_lexer_advance(lexer);
  }
}
|
|
|
|
// Return the next unconsumed character without consuming it, or '\0' when the
// input is exhausted. The lexer state is not modified.
char mrk_lexer_peek(mrk_lexer *lexer) {
  return mrk_lexer_done(lexer) ? '\0' : lexer->buf.s[lexer->pos.buf_index];
}
|
|
|
|
// Consume characters until the next one is classified as special by
// mrk_is_special_char() (defined outside this file) or the input ends.
//
// The end-of-input test is explicit: mrk_lexer_peek() returns '\0' once the
// input is exhausted and mrk_lexer_advance() no longer makes progress, so
// relying on mrk_is_special_char('\0') being true would risk an endless loop.
void mrk_lexer_advance_word(mrk_lexer *lexer) {
  for (;;) {
    const char next = mrk_lexer_peek(lexer);

    if (next == '\0' || mrk_is_special_char(next)) {
      break;
    }

    mrk_lexer_advance(lexer);
  }
}
|
|
|
|
// Check whether the unconsumed input starts with the NUL-terminated string
// `s`, without consuming anything.
//
// Returns true only if every character of `s` matches the input at the
// corresponding offset from the cursor. Returns false as soon as a character
// differs or the input ends before all of `s` has been matched, so no byte
// beyond the end of the input is ever read. An empty `s` always matches.
bool mrk_lexer_peek_str(mrk_lexer *lexer, const char *s) {
  for (size_t i = 0; s[i] != '\0'; i++) {
    const size_t at = lexer->pos.buf_index + i;

    // Same end-of-input rule as mrk_lexer_done(), applied at offset i. The
    // length test comes first so buf.s[buf.len] is not read.
    const bool at_end = (lexer->buf.len > 0 && at == lexer->buf.len) ||
                        (lexer->buf.s[at] == '\0');

    if (at_end || lexer->buf.s[at] != s[i]) {
      return false;
    }
  }

  return true;
}
|
|
|
|
// Return the character `n` positions past the cursor without consuming
// anything; n == 0 is the next unconsumed character.
//
// Returns '\0' if the input ends at or before that position. Every offset from
// 0 up to and including n is checked against the end of the input, so the
// returned character is never read from beyond the buffer.
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  for (size_t i = 0; i <= n; i++) {
    const size_t at = lexer->pos.buf_index + i;

    // Same end-of-input rule as mrk_lexer_done(), applied at offset i
    if ((lexer->buf.len > 0 && at == lexer->buf.len) ||
        lexer->buf.s[at] == '\0') {
      return '\0';
    }
  }

  return lexer->buf.s[lexer->pos.buf_index + n];
}
|
|
|
|
// Begin a fresh, empty token at the current cursor position.
//
// Start and end are both set to the cursor's buffer index, and the cursor's
// line and column are recorded as the token's starting location. The token
// type is left untouched; mrk_lexer_emit() sets it.
void mrk_lexer_reset(mrk_lexer *lexer) {
  mrk_token *tok = &lexer->cur_token;

  // Where the token starts in terms of line and column
  tok->start_line = lexer->pos.line;
  tok->start_line_index = lexer->pos.line_index;

  // Zero-length range in the buffer
  tok->start = lexer->pos.buf_index;
  tok->end = lexer->pos.buf_index;
}
|
|
|
|
// Finish the current token: stamp it with `type`, copy it to `*out` and
// remember it as the most recently emitted token.
//
// The cursor is not moved and the current token is not reset; starting the
// next token is done by mrk_lexer_reset().
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
  lexer->cur_token.type = type;

  // mrk_lexer_next() consults last_emitted to detect a preceding indent
  lexer->last_emitted = lexer->cur_token;

  *out = lexer->cur_token;
}
|
|
|
|
// Consume a run of plain text.
//
// Characters are consumed until the input ends or the next character is one
// of: '*', newline, '[', ']', '(', ')', backslash or backtick. The stopping
// character itself is not consumed. Other characters, including '_', '!' and
// spaces, are treated as part of the text run.
void mrk_lexer_advance_text(mrk_lexer *lexer) {
  static const char stop_set[] = "*\n[]()\\`";

  // The end-of-input test comes first, so strchr() is never asked to look up
  // the '\0' that mrk_lexer_peek() returns at the end of the input
  while (!mrk_lexer_done(lexer) &&
         strchr(stop_set, mrk_lexer_peek(lexer)) == NULL) {
    mrk_lexer_advance(lexer);
  }
}
|
|
|
|
// Lex one token at the start of a line (mrk_lexer_next() also uses this right
// after an indent token) and emit it through `out`.
//
// Besides the inline tokens, this recognises constructs that are only looked
// for in this position: headers, list item markers, checkboxes, horizontal
// rules, block quote markers, indents and triple backticks. Anything that does
// not form one of those is emitted as text.
//
// Exactly one token is emitted per call. The caller is expected to have called
// mrk_lexer_reset() first, as mrk_lexer_next() does, because several branches
// measure the current token via cur_token.end - cur_token.start.
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
  char c = mrk_lexer_advance(lexer);

  switch (c) {
    // Headers: a run of '#' no longer than MRK_MAX_HEADER_LEN
    case '#':
      mrk_lexer_advance_eq(lexer, c);

      if (lexer->cur_token.end - lexer->cur_token.start <=
          MRK_MAX_HEADER_LEN) {
        mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
      } else {
        // Too many '#' for a header, so the run is part of regular text
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
      break;

    // "- " is a list item; a long enough run of dashes followed by a newline
    // is a horizontal rule
    case '-':
      if (mrk_lexer_peek(lexer) == ' ') {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
      } else {
        mrk_lexer_advance_eq(lexer, c);

        if (lexer->cur_token.end - lexer->cur_token.start >=
                MRK_MIN_HORIZ_RULE_LEN &&
            mrk_lexer_peek(lexer) == '\n') {
          mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
        } else {
          mrk_lexer_advance_text(lexer);
          mrk_lexer_emit(out, lexer, mrk_token_type_text);
        }
      }
      break;

    // "+ " is a list item; anything else starting with '+' is text
    case '+':
      if (mrk_lexer_peek(lexer) == ' ') {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
      } else {
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
      break;

    case '_':
    case '*': {
      if (mrk_lexer_peek(lexer) == ' ') {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
      } else {
        // We first check if the rest of the line consists solely of the same
        // character; otherwise, we match it as a regular single or double
        // star/underscore
        size_t i = 0;

        while (mrk_lexer_peek_n(lexer, i) == c) {
          i++;
        }

        // i counts the matching characters after the one already consumed, so
        // the whole run is i + 1 characters long
        if (mrk_lexer_peek_n(lexer, i) == '\n' &&
            (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) {
          // NOTE(review): one character of the run was already consumed, so
          // advancing i + 1 also consumes the newline that ends the rule --
          // unlike the '-' rule above, which leaves it in place. Confirm this
          // is intended.
          mrk_lexer_advance_n(lexer, i + 1);
          mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
        } else if (mrk_lexer_peek(lexer) == c) {
          mrk_lexer_advance(lexer);
          mrk_lexer_emit(out, lexer,
                         c == '_' ? mrk_token_type_double_underscore
                                  : mrk_token_type_double_star);
        } else {
          mrk_lexer_emit(out, lexer,
                         c == '_' ? mrk_token_type_underscore
                                  : mrk_token_type_star);
        }
      }
    } break;

    // A run of one or more '>' forms a single token
    case '>':
      mrk_lexer_advance_eq(lexer, c);
      mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
      break;

    case '[':
      // Checkboxes for lists are lexed separately to simplify the parser
      // later on
      if (mrk_lexer_peek_str(lexer, " ]")) {
        mrk_lexer_advance_n(lexer, 2);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked);
      } else if (mrk_lexer_peek_str(lexer, "x]")) {
        mrk_lexer_advance_n(lexer, 2);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked);
      } else {
        mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
      }
      break;

    case ']':
      mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
      break;

    case '(':
      mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
      break;

    case ')':
      mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
      break;

    case '\\':
      // TODO better handle escaped elements
      // A backslash directly before a newline is a line break; the newline
      // itself is not consumed here
      if (mrk_lexer_peek(lexer) == '\n') {
        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
      } else {
        mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
      }
      break;

    case ' ': {
      // Indents consist of four spaces: the one already consumed plus three
      // more. All three must really be spaces, because all three are consumed
      // as part of the indent token.
      if (mrk_lexer_peek_str(lexer, "   ")) {
        mrk_lexer_advance_n(lexer, 3);
        mrk_lexer_emit(out, lexer, mrk_token_type_indent);
      }
      // Two spaces followed by a newline form a line break
      else if (mrk_lexer_peek_str(lexer, " \n")) {
        mrk_lexer_advance_n(lexer, 2);

        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
      } else {
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
    } break;

    // Two newlines in a row are a blank line
    case '\n':
      if (mrk_lexer_peek(lexer) == '\n') {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
      } else {
        mrk_lexer_emit(out, lexer, mrk_token_type_newline);
      }
      break;

    // A single tab counts as one indent
    case '\t':
      mrk_lexer_emit(out, lexer, mrk_token_type_indent);
      break;

    case '`':
      if (mrk_lexer_peek_str(lexer, "``")) {
        mrk_lexer_advance_n(lexer, 2);
        mrk_lexer_emit(out, lexer, mrk_token_type_triple_backtick);
      } else {
        mrk_lexer_emit(out, lexer, mrk_token_type_backtick);
      }
      break;

    default: {
      // Match ordered list headers. The <ctype.h> functions require a value
      // representable as unsigned char, and plain char may be negative (e.g.
      // for bytes of a UTF-8 sequence), hence the casts.
      if (isdigit((unsigned char)c)) {
        while (isdigit((unsigned char)mrk_lexer_peek(lexer))) {
          mrk_lexer_advance(lexer);
        }

        // Ordered list item numbers should be followed by a dot and then a
        // space. Only the dot is consumed; the space is left in the input.
        if (mrk_lexer_peek_str(lexer, ". ")) {
          mrk_lexer_advance(lexer);
          mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
        }
        // Not followed by ". ", so it's just text that happens to start with
        // a number
        else {
          mrk_lexer_advance_text(lexer);
          mrk_lexer_emit(out, lexer, mrk_token_type_text);
        }
      }
      // Any other special scenarios we simply lex as text
      else {
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
    } break;
  }
}
|
|
|
|
// Lex one token that does not start a line and emit it through `out`.
//
// Only inline constructs are recognised here: emphasis markers, brackets and
// parentheses, backslash, newline / blank line, '!', the two-spaces line break
// and the single backtick. Everything else becomes a text token.
//
// Each branch consumes whatever belongs to its token and selects the token
// type; the token is then emitted exactly once at the end of the function.
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
  const char first = mrk_lexer_advance(lexer);
  mrk_token_type kind = mrk_token_type_text;

  switch (first) {
    case '_':
    case '*': {
      // A second identical character turns the single marker into a double one
      const bool doubled = mrk_lexer_peek(lexer) == first;

      if (doubled) {
        mrk_lexer_advance(lexer);
      }

      if (first == '_') {
        kind = doubled ? mrk_token_type_double_underscore
                       : mrk_token_type_underscore;
      } else {
        kind = doubled ? mrk_token_type_double_star : mrk_token_type_star;
      }
    } break;

    case '(':
      kind = mrk_token_type_left_paren;
      break;

    case ')':
      kind = mrk_token_type_right_paren;
      break;

    case '[':
      kind = mrk_token_type_left_bracket;
      break;

    case ']':
      kind = mrk_token_type_right_bracket;
      break;

    case '!':
      kind = mrk_token_type_bang;
      break;

    case '`':
      kind = mrk_token_type_backtick;
      break;

    case '\\':
      // TODO better handle escaped characters
      kind = mrk_token_type_backslash;
      break;

    case '\n':
      // Two newlines in a row are a blank line
      if (mrk_lexer_peek(lexer) == '\n') {
        mrk_lexer_advance(lexer);
        kind = mrk_token_type_blank_line;
      } else {
        kind = mrk_token_type_newline;
      }
      break;

    case ' ':
      // The consumed space plus another space and a newline is a line break
      if (mrk_lexer_peek_str(lexer, " \n")) {
        mrk_lexer_advance_n(lexer, 2);
        kind = mrk_token_type_line_break;
      } else {
        mrk_lexer_advance_text(lexer);
        kind = mrk_token_type_text;
      }
      break;

    default:
      mrk_lexer_advance_text(lexer);
      kind = mrk_token_type_text;
      break;
  }

  mrk_lexer_emit(out, lexer, kind);
}
|
|
|
|
// Produce the next token from the input.
//
// Returns mrk_lexer_err_done, leaving `*out` untouched, once the input is
// exhausted. Otherwise a new token is started at the cursor, lexed, written to
// `*out`, and mrk_lexer_err_ok is returned.
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return mrk_lexer_err_done;
  }

  mrk_lexer_reset(lexer);

  // Line-start rules apply at column 0 and also directly after an indent
  // token, so that indented content is lexed like the start of a line
  const bool at_line_start =
      lexer->pos.line_index == 0 ||
      lexer->last_emitted.type == mrk_token_type_indent;

  if (!at_line_start) {
    mrk_lexer_lex_middle_of_line(out, lexer);
  } else {
    mrk_lexer_lex_start_of_line(out, lexer);
  }

  return mrk_lexer_err_ok;
}
|
|
|
|
// Return the length of a token in characters, i.e. the distance between its
// start and end offsets.
size_t mrk_token_len(mrk_token t) {
  return t.end - t.start;
}
|
|
|
|
// Look up the text of a token inside the lexer's input buffer.
//
// `*out` is set to point at the token's first character within the buffer; no
// copy is made and the text is not NUL-terminated at the token's end, so it
// must be used together with the returned length.
//
// Returns the length of the token in characters.
size_t mrk_token_lexeme(const char **out, mrk_lexer *lexer, mrk_token t) {
  *out = &lexer->buf.s[t.start];

  return t.end - t.start;
}
|