mrk/src/lexer/lexer.c

408 lines
11 KiB
C

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include "mrk/lexer.h"
#include "mrk/lexer_internal.h"
// Creates a new lexer for the caller.
//
// Invokes MRK_CALLOC with `out`, a count of 1 and sizeof(mrk_lexer), then
// reports mrk_err_ok.
//
// NOTE(review): MRK_CALLOC is defined elsewhere; presumably it stores a zeroed
// allocation in *out and deals with allocation failure itself (e.g. by
// returning early) — confirm against the macro definition.
mrk_err mrk_lexer_init(mrk_lexer **out) {
MRK_CALLOC(out, 1, sizeof(mrk_lexer));
return mrk_err_ok;
}
// Releases the memory held by `lexer`. Passing NULL is allowed and does
// nothing. Only the lexer object itself is freed; the input buffer it points
// at is not touched.
void mrk_lexer_free(mrk_lexer *lexer) {
  // free() is defined to be a no-op for NULL, so no explicit guard is needed.
  free(lexer);
}
// Points the lexer at a new input buffer and rewinds all lexing state.
//
// Only the pointer `buf` is stored, not a copy of its contents. `len` is the
// buffer length; mrk_lexer_done() treats a length of 0 as "stop at the first
// NUL byte".
void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
  // Input buffer
  lexer->buf.len = len;
  lexer->buf.s = buf;

  // Cursor goes back to the very first character
  lexer->pos.buf_index = 0;
  lexer->pos.line_index = 0;
  lexer->pos.line = 0;

  // Start with an empty token of type "none"
  mrk_token *tok = &lexer->cur_token;
  tok->type = mrk_token_type_none;
  tok->start = tok->end = 0;
  tok->start_line = tok->start_line_index = 0;

  // Nothing has been emitted yet, so the last emitted token is that same
  // empty token
  lexer->last_emitted = *tok;
}
// Reports whether the lexer has run out of input.
//
// Input ends either when the cursor sits exactly at buf.len (only checked for
// a non-zero length) or when the character under the cursor is a NUL byte.
bool mrk_lexer_done(const mrk_lexer *lexer) {
  if (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) {
    return true;
  }

  return lexer->buf.s[lexer->pos.buf_index] == '\0';
}
// Consumes one character and returns it, or returns '\0' without moving when
// the lexer is already done.
//
// Besides the buffer index, this also extends the end of the current token and
// keeps the line / line_index bookkeeping up to date.
char mrk_lexer_advance(mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return '\0';
  }

  const char consumed = lexer->buf.s[lexer->pos.buf_index++];
  lexer->cur_token.end++;

  // The newline itself still belongs to the line it terminates; only after
  // consuming it does the cursor move to the start of the next line.
  if (consumed == '\n') {
    lexer->pos.line++;
    lexer->pos.line_index = 0;
  } else {
    lexer->pos.line_index++;
  }

  return consumed;
}
// Consumes `n` characters and returns the last one consumed.
//
// Note that at least one advance is always performed, so n == 0 behaves the
// same as n == 1. If the input ends early, the remaining advances are no-ops
// and '\0' is returned.
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  // Skip over the first n - 1 characters...
  for (size_t skipped = 1; skipped < n; skipped++) {
    mrk_lexer_advance(lexer);
  }

  // ...and hand back the final one.
  return mrk_lexer_advance(lexer);
}
// Consumes characters for as long as the next character equals `c`.
//
// Stops at the first character that differs from `c`, or at the end of the
// input.
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  // The explicit done check matters for c == '\0': at the end of the input
  // peek() yields '\0' and advance() no longer moves the cursor, so without
  // the guard the loop would never terminate.
  while (!mrk_lexer_done(lexer) && mrk_lexer_peek(lexer) == c) {
    mrk_lexer_advance(lexer);
  }
}
// Returns the character under the cursor without consuming it, or '\0' when
// the lexer has no input left.
char mrk_lexer_peek(mrk_lexer *lexer) {
  return mrk_lexer_done(lexer) ? '\0' : lexer->buf.s[lexer->pos.buf_index];
}
// Consumes characters until the next character is one that
// mrk_is_special_char() reports as special, or until the input ends.
void mrk_lexer_advance_word(mrk_lexer *lexer) {
  // At the end of the input peek() yields '\0' and advance() no longer moves
  // the cursor. The done check guarantees termination regardless of how
  // mrk_is_special_char() classifies '\0'.
  while (!mrk_lexer_done(lexer) &&
         !mrk_is_special_char(mrk_lexer_peek(lexer))) {
    mrk_lexer_advance(lexer);
  }
}
// Checks whether the upcoming input starts with the NUL-terminated string `s`,
// without consuming anything.
//
// Returns true only if every character of `s` matches the input at the
// corresponding offset from the cursor. Returns false as soon as a character
// differs or the input ends before `s` does. An empty `s` always matches.
bool mrk_lexer_peek_str(mrk_lexer *lexer, const char *s) {
  for (size_t i = 0; s[i] != '\0'; i++) {
    const size_t idx = lexer->pos.buf_index + i;

    // Ran off the end of a length-delimited buffer before matching all of s
    if (lexer->buf.len > 0 && idx >= lexer->buf.len) {
      return false;
    }

    // s[i] is never '\0' here, so this comparison also fails on a NUL
    // terminator in the input; bailing out immediately keeps later iterations
    // from reading past it.
    if (lexer->buf.s[idx] != s[i]) {
      return false;
    }
  }

  return true;
}
// Returns the character `n` positions ahead of the cursor without consuming
// anything; n == 0 is the character under the cursor.
//
// Returns '\0' if the input ends at or before that position.
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  // Walk every offset up to and including n. Offset n itself must be checked
  // as well, otherwise the final read could land exactly on buf.len, one past
  // the end of a length-delimited buffer.
  for (size_t i = 0; i <= n; i++) {
    const size_t idx = lexer->pos.buf_index + i;

    if ((lexer->buf.len > 0 && idx == lexer->buf.len) ||
        lexer->buf.s[idx] == '\0') {
      return '\0';
    }
  }

  return lexer->buf.s[lexer->pos.buf_index + n];
}
// Starts a fresh, empty token at the current cursor position: both ends of the
// token are set to the buffer index, and its starting line and line index are
// taken from the cursor. The token type is left as it was.
void mrk_lexer_reset(mrk_lexer *lexer) {
  mrk_token *tok = &lexer->cur_token;

  tok->start_line = lexer->pos.line;
  tok->start_line_index = lexer->pos.line_index;
  tok->start = lexer->pos.buf_index;
  tok->end = lexer->pos.buf_index;
}
// Finalises the current token with the given type, copies it to *out and
// remembers it as the most recently emitted token.
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
  mrk_token *tok = &lexer->cur_token;

  tok->type = type;
  *out = *tok;
  lexer->last_emitted = *tok;
}
// Consumes a run of plain text: advances until the next character is one of
// the stop characters below or the input ends.
//
// NOTE(review): '_' and '!' are not stop characters here even though
// mrk_lexer_lex_middle_of_line has dedicated cases for them — confirm that is
// intended.
void mrk_lexer_advance_text(mrk_lexer *lexer) {
  static const char stop_chars[] = "*\n[]()\\`";

  while (!mrk_lexer_done(lexer) &&
         strchr(stop_chars, mrk_lexer_peek(lexer)) == NULL) {
    mrk_lexer_advance(lexer);
  }
}
// Lexes a single token at the start of a line (or directly after an indent
// token) and writes it to *out.
//
// In addition to the inline tokens, this recognises the constructs that are
// only handled here: headers, list items, checkboxes, horizontal rules, block
// quote markers, indents and triple backticks. Anything that does not match a
// special construct is emitted as text.
void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) {
  char c = mrk_lexer_advance(lexer);

  switch (c) {
  // Headers
  case '#':
    mrk_lexer_advance_eq(lexer, c);

    // Too many '#' characters means this isn't a header, just text
    if (lexer->cur_token.end - lexer->cur_token.start <= MRK_MAX_HEADER_LEN) {
      mrk_lexer_emit(out, lexer, mrk_token_type_header_start);
    } else {
      mrk_lexer_advance_text(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_text);
    }
    break;
  case '-':
    if (mrk_lexer_peek(lexer) == ' ') {
      mrk_lexer_advance(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
    } else {
      // A line consisting solely of enough dashes is a horizontal rule
      mrk_lexer_advance_eq(lexer, c);

      if (lexer->cur_token.end - lexer->cur_token.start >=
              MRK_MIN_HORIZ_RULE_LEN &&
          mrk_lexer_peek(lexer) == '\n') {
        mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
      } else {
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
    }
    break;
  case '+':
    if (mrk_lexer_peek(lexer) == ' ') {
      mrk_lexer_advance(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
    } else {
      mrk_lexer_advance_text(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_text);
    }
    break;
  case '_':
  case '*': {
    if (mrk_lexer_peek(lexer) == ' ') {
      mrk_lexer_advance(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered);
    } else {
      // We first check if the entire line consists of stars; otherwise, we
      // match it as a regular single or double star
      size_t i = 0;
      while (mrk_lexer_peek_n(lexer, i) == c) {
        i++;
      }

      // i counts the repeats still ahead of the cursor; the +1 accounts for
      // the character that was already consumed
      if (mrk_lexer_peek_n(lexer, i) == '\n' &&
          (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) {
        mrk_lexer_advance_n(lexer, i + 1);
        mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule);
      } else if (mrk_lexer_peek(lexer) == c) {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer,
                       c == '_' ? mrk_token_type_double_underscore
                                : mrk_token_type_double_star);
      } else {
        mrk_lexer_emit(out, lexer,
                       c == '_' ? mrk_token_type_underscore
                                : mrk_token_type_star);
      }
    }
  } break;
  case '>':
    mrk_lexer_advance_eq(lexer, c);
    mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
    break;
  case '[':
    // Checkboxes for lists are lexed separately to simplify the parser later
    // on
    if (mrk_lexer_peek_str(lexer, " ]")) {
      mrk_lexer_advance_n(lexer, 2);
      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked);
    } else if (mrk_lexer_peek_str(lexer, "x]")) {
      mrk_lexer_advance_n(lexer, 2);
      mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked);
    } else {
      mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
    }
    break;
  case ']':
    mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
    break;
  case '(':
    mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
    break;
  case ')':
    mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
    break;
  case '\\':
    // TODO better handle escaped elements
    // A backslash right before a newline is a line break; the newline itself
    // is not consumed here
    if (mrk_lexer_peek(lexer) == '\n') {
      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
    } else {
      mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
    }
    break;
  case ' ': {
    // Indents consist of four spaces: the one already consumed plus the three
    // checked for (and then consumed) here
    if (mrk_lexer_peek_str(lexer, "   ")) {
      mrk_lexer_advance_n(lexer, 3);
      mrk_lexer_emit(out, lexer, mrk_token_type_indent);
    }
    // Either a double space or a line break
    else if (mrk_lexer_peek_str(lexer, " \n")) {
      mrk_lexer_advance_n(lexer, 2);
      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
    } else {
      mrk_lexer_advance_text(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_text);
    }
  } break;
  case '\n':
    // Two consecutive newlines form a blank line
    if (mrk_lexer_peek(lexer) == '\n') {
      mrk_lexer_advance(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
    } else {
      mrk_lexer_emit(out, lexer, mrk_token_type_newline);
    }
    break;
  case '\t':
    mrk_lexer_emit(out, lexer, mrk_token_type_indent);
    break;
  case '`':
    if (mrk_lexer_peek_str(lexer, "``")) {
      mrk_lexer_advance_n(lexer, 2);
      mrk_lexer_emit(out, lexer, mrk_token_type_triple_backtick);
    } else {
      mrk_lexer_emit(out, lexer, mrk_token_type_backtick);
    }
    break;
  default: {
    // Match ordered list headers.
    // The <ctype.h> functions require a value representable as unsigned char;
    // plain char may be negative (e.g. for UTF-8 bytes), hence the casts.
    if (isdigit((unsigned char)c)) {
      while (isdigit((unsigned char)mrk_lexer_peek(lexer))) {
        mrk_lexer_advance(lexer);
      }

      // Ordered list item numbers should be followed by a dot and then a space.
      // Only the dot is consumed; the space is left in the input.
      if (mrk_lexer_peek_str(lexer, ". ")) {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered);
      }
      // Doesn't end with a dot, so it's just a word that happens to start
      // with a number
      else {
        mrk_lexer_advance_text(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_text);
      }
    }
    // Any other special scenarios we simply parse as a word
    else {
      mrk_lexer_advance_text(lexer);
      mrk_lexer_emit(out, lexer, mrk_token_type_text);
    }
  } break;
  }
}
// Lexes a single token somewhere inside a line and writes it to *out.
//
// Only inline constructs are recognised here: emphasis markers, brackets,
// parentheses, backslashes, bangs, backticks, newlines / blank lines and
// trailing-space line breaks. Everything else is emitted as text.
void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) {
  const char first = mrk_lexer_advance(lexer);
  mrk_token_type type;

  // Each branch consumes whatever extra input belongs to the token and picks
  // its type; the token is emitted once at the end.
  switch (first) {
  case '_':
  case '*': {
    // A repeated marker forms the double variant
    const bool doubled = mrk_lexer_peek(lexer) == first;
    if (doubled) {
      mrk_lexer_advance(lexer);
    }

    if (first == '_') {
      type = doubled ? mrk_token_type_double_underscore
                     : mrk_token_type_underscore;
    } else {
      type = doubled ? mrk_token_type_double_star : mrk_token_type_star;
    }
  } break;
  case '[':
    type = mrk_token_type_left_bracket;
    break;
  case ']':
    type = mrk_token_type_right_bracket;
    break;
  case '(':
    type = mrk_token_type_left_paren;
    break;
  case ')':
    type = mrk_token_type_right_paren;
    break;
  case '\\':
    // TODO better handle escaped characters
    type = mrk_token_type_backslash;
    break;
  case '!':
    type = mrk_token_type_bang;
    break;
  case '`':
    type = mrk_token_type_backtick;
    break;
  case '\n':
    // Two newlines in a row make up a blank line
    if (mrk_lexer_peek(lexer) == '\n') {
      mrk_lexer_advance(lexer);
      type = mrk_token_type_blank_line;
    } else {
      type = mrk_token_type_newline;
    }
    break;
  case ' ':
    // Two spaces followed by a newline are a line break; any other space is
    // simply the start of a text run
    if (mrk_lexer_peek_str(lexer, " \n")) {
      mrk_lexer_advance_n(lexer, 2);
      type = mrk_token_type_line_break;
    } else {
      mrk_lexer_advance_text(lexer);
      type = mrk_token_type_text;
    }
    break;
  default:
    mrk_lexer_advance_text(lexer);
    type = mrk_token_type_text;
    break;
  }

  mrk_lexer_emit(out, lexer, type);
}
// Produces the next token from the input.
//
// Returns mrk_lexer_err_done (leaving *out untouched) when no input remains;
// otherwise writes the token to *out and returns mrk_lexer_err_ok.
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return mrk_lexer_err_done;
  }

  // Begin a new token at the cursor
  mrk_lexer_reset(lexer);

  // Start-of-line rules apply at column zero, and also right after an indent
  // token
  const bool in_middle = lexer->pos.line_index != 0 &&
                         lexer->last_emitted.type != mrk_token_type_indent;

  if (in_middle) {
    mrk_lexer_lex_middle_of_line(out, lexer);
  } else {
    mrk_lexer_lex_start_of_line(out, lexer);
  }

  return mrk_lexer_err_ok;
}
// Returns the length of a token, computed as the distance between its start
// and end offsets.
size_t mrk_token_len(mrk_token t) {
  return t.end - t.start;
}
// Gives access to the text of a token.
//
// Sets *out to the token's first character inside the lexer's input buffer
// (a pointer into that buffer, not a copy, and not NUL-terminated at the
// token's end) and returns the token's length.
size_t mrk_token_lexeme(const char **out, mrk_lexer *lexer, mrk_token t) {
  *out = &lexer->buf.s[t.start];

  return t.end - t.start;
}