#include #include #include #include "mrk/lexer.h" #include "mrk/lexer_internal.h" mrk_err mrk_lexer_init(mrk_lexer **out) { MRK_CALLOC(out, 1, sizeof(mrk_lexer)); return mrk_err_ok; } void mrk_lexer_free(mrk_lexer *lexer) { if (lexer == NULL) { return; } free(lexer); } void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) { lexer->buf.s = buf; lexer->buf.len = len; lexer->pos.line = 0; lexer->pos.line_index = 0; lexer->pos.buf_index = 0; lexer->cur_token.type = mrk_token_type_none; lexer->cur_token.start = 0; lexer->cur_token.end = 0; lexer->cur_token.start_line = 0; lexer->cur_token.start_line_index = 0; lexer->last_emitted = lexer->cur_token; } bool mrk_lexer_done(const mrk_lexer *lexer) { return (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) || (lexer->buf.s[lexer->pos.buf_index] == '\0'); } char mrk_lexer_advance(mrk_lexer *lexer) { if (mrk_lexer_done(lexer)) { return '\0'; } char c = lexer->buf.s[lexer->pos.buf_index]; // A newline is still part of the previous line, so if the last character was // a newline, we now go to the next line if (c == '\n') { lexer->pos.line++; lexer->pos.line_index = 0; } else { lexer->pos.line_index++; } lexer->pos.buf_index++; lexer->cur_token.end++; return c; } char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) { while (n > 1) { mrk_lexer_advance(lexer); n--; } return mrk_lexer_advance(lexer); } void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) { while (mrk_lexer_peek(lexer) == c) { mrk_lexer_advance(lexer); } } char mrk_lexer_peek(mrk_lexer *lexer) { if (mrk_lexer_done(lexer)) { return '\0'; } return lexer->buf.s[lexer->pos.buf_index]; } void mrk_lexer_advance_word(mrk_lexer *lexer) { while (!mrk_is_special_char(mrk_lexer_peek(lexer))) { mrk_lexer_advance(lexer); } } bool mrk_lexer_peek_str(mrk_lexer *lexer, const char *s) { bool match = true; size_t i = 0; while (*s != '\0') { // Check whether the lexer would be done before matching the entire string bool done_in_n = (lexer->buf.len > 0 && lexer->pos.buf_index + i == lexer->buf.len) || (lexer->buf.s[lexer->pos.buf_index + i] == '\0'); match = !done_in_n && (lexer->buf.s[lexer->pos.buf_index + i] == *s); i++; s++; } return match; } char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) { // Check whether the lexer would be done in n steps bool done_in_n = false; for (size_t i = 0; i < n && !done_in_n; i++) { done_in_n = (lexer->buf.len > 0 && lexer->pos.buf_index + i == lexer->buf.len) || (lexer->buf.s[lexer->pos.buf_index + i] == '\0'); } return done_in_n ? '\0' : lexer->buf.s[lexer->pos.buf_index + n]; } void mrk_lexer_reset(mrk_lexer *lexer) { lexer->cur_token.start = lexer->pos.buf_index; lexer->cur_token.end = lexer->pos.buf_index; lexer->cur_token.start_line = lexer->pos.line; lexer->cur_token.start_line_index = lexer->pos.line_index; } void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) { lexer->cur_token.type = type; *out = lexer->cur_token; lexer->last_emitted = lexer->cur_token; } void mrk_lexer_advance_text(mrk_lexer *lexer) { const char *special_chars = "*\n[]()\\`"; while (!mrk_lexer_done(lexer)) { if (strchr(special_chars, mrk_lexer_peek(lexer)) == NULL) { mrk_lexer_advance(lexer); } else { break; } } } void mrk_lexer_lex_start_of_line(mrk_token *out, mrk_lexer *lexer) { char c = mrk_lexer_advance(lexer); switch (c) { // Headers case '#': mrk_lexer_advance_eq(lexer, c); if (lexer->cur_token.end - lexer->cur_token.start <= MRK_MAX_HEADER_LEN) { mrk_lexer_emit(out, lexer, mrk_token_type_header_start); } else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } break; case '-': if (mrk_lexer_peek(lexer) == ' ') { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); } else { mrk_lexer_advance_eq(lexer, c); if (lexer->cur_token.end - lexer->cur_token.start >= MRK_MIN_HORIZ_RULE_LEN && mrk_lexer_peek(lexer) == '\n') { mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); } else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; case '+': if (mrk_lexer_peek(lexer) == ' ') { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); } else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } break; case '_': case '*': { if (mrk_lexer_peek(lexer) == ' ') { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unordered); } else { // We first check if the entire line consists of stars; otherwise, we // match it as a regular single or double star size_t i = 0; while (mrk_lexer_peek_n(lexer, i) == c) { i++; } if (mrk_lexer_peek_n(lexer, i) == '\n' && (i + 1) >= MRK_MIN_HORIZ_RULE_LEN) { mrk_lexer_advance_n(lexer, i + 1); mrk_lexer_emit(out, lexer, mrk_token_type_horizontal_rule); } else if (mrk_lexer_peek(lexer) == c) { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, c == '_' ? mrk_token_type_double_underscore : mrk_token_type_double_star); } else { mrk_lexer_emit(out, lexer, c == '_' ? mrk_token_type_underscore : mrk_token_type_star); } } } break; case '>': mrk_lexer_advance_eq(lexer, c); mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets); break; case '[': // Checkboxes for lists are lexed separately to simplify the parser later // on if (mrk_lexer_peek_str(lexer, " ]")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_unchecked); } else if (mrk_lexer_peek_str(lexer, "x]")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_checked); } else { mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); } break; case ']': mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket); break; case '(': mrk_lexer_emit(out, lexer, mrk_token_type_left_paren); break; case ')': mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); break; case '\\': // TODO better handle escaped elements if (mrk_lexer_peek(lexer) == '\n') { mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { mrk_lexer_emit(out, lexer, mrk_token_type_backslash); } break; case ' ': { // Indents consist of four spaces if (mrk_lexer_peek_str(lexer, " ")) { mrk_lexer_advance_n(lexer, 3); mrk_lexer_emit(out, lexer, mrk_token_type_indent); } // Either a double space or a line break else if (mrk_lexer_peek_str(lexer, " \n")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; case '\n': if (mrk_lexer_peek(lexer) == '\n') { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_blank_line); } else { mrk_lexer_emit(out, lexer, mrk_token_type_newline); } break; case '\t': mrk_lexer_emit(out, lexer, mrk_token_type_indent); break; case '`': if (mrk_lexer_peek_str(lexer, "``")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_triple_backtick); } else { mrk_lexer_emit(out, lexer, mrk_token_type_backtick); } break; default: { // Match ordered list headers if (isdigit(c)) { while (isdigit(mrk_lexer_peek(lexer))) { mrk_lexer_advance(lexer); } // Ordered list item numbers should be followed by a dot and then a space if (mrk_lexer_peek_str(lexer, ". ")) { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_list_item_ordered); } // Doesn't end with a dot, so it's just a word that happens to start // with a number else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } } // Any other special scenarios we simply parse as a word else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; } } void mrk_lexer_lex_middle_of_line(mrk_token *out, mrk_lexer *lexer) { char c = mrk_lexer_advance(lexer); switch (c) { case '*': case '_': if (mrk_lexer_peek(lexer) == c) { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, c == '_' ? mrk_token_type_double_underscore : mrk_token_type_double_star); } else { mrk_lexer_emit(out, lexer, c == '_' ? mrk_token_type_underscore : mrk_token_type_star); } break; case '[': mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket); break; case ']': mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket); break; case '(': mrk_lexer_emit(out, lexer, mrk_token_type_left_paren); break; case ')': mrk_lexer_emit(out, lexer, mrk_token_type_right_paren); break; case '\\': // TODO better handle escaped characters mrk_lexer_emit(out, lexer, mrk_token_type_backslash); break; case '\n': if (mrk_lexer_peek(lexer) == '\n') { mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_blank_line); } else { mrk_lexer_emit(out, lexer, mrk_token_type_newline); } break; case '!': mrk_lexer_emit(out, lexer, mrk_token_type_bang); break; case ' ': { if (mrk_lexer_peek_str(lexer, " \n")) { mrk_lexer_advance_n(lexer, 2); mrk_lexer_emit(out, lexer, mrk_token_type_line_break); } else { mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); } } break; case '`': mrk_lexer_emit(out, lexer, mrk_token_type_backtick); break; default: mrk_lexer_advance_text(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_text); break; } } mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) { if (mrk_lexer_done(lexer)) { return mrk_lexer_err_done; } mrk_lexer_reset(lexer); if (lexer->pos.line_index == 0 || lexer->last_emitted.type == mrk_token_type_indent) { mrk_lexer_lex_start_of_line(out, lexer); } else { mrk_lexer_lex_middle_of_line(out, lexer); } return mrk_lexer_err_ok; } size_t mrk_token_len(mrk_token t) { return t.end - t.start; } size_t mrk_token_lexeme(const char **out, mrk_lexer *lexer, mrk_token t) { *out = lexer->buf.s + t.start; return t.end - t.start; }