diff --git a/include/mrk/ast.h b/include/mrk/ast.h
index 0ca008f..1e3adc2 100644
--- a/include/mrk/ast.h
+++ b/include/mrk/ast.h
@@ -8,7 +8,9 @@
 #define MRK_AST_NODE_ARGS 1
 
 typedef enum mrk_ast_node_type {
-  mrk_ast_node_type_header = 0,
+  mrk_ast_node_type_none = 0,
+  mrk_ast_node_type_header,
+  mrk_ast_node_type_sentence,
 } mrk_ast_node_type;
 
 typedef struct mrk_ast_node {
@@ -17,7 +19,10 @@ typedef struct mrk_ast_node {
     size_t len;
   } children;
   mrk_ast_node_type type;
-  void *args[MRK_AST_NODE_ARGS];
+  struct {
+    void *ptr;
+    size_t num;
+  } args[MRK_AST_NODE_ARGS];
 } mrk_ast_node;
 
 /**
diff --git a/include/mrk/common.h b/include/mrk/common.h
index f621949..3a8094c 100644
--- a/include/mrk/common.h
+++ b/include/mrk/common.h
@@ -30,6 +30,8 @@
 typedef enum mrk_err {
   mrk_err_ok = 0,
   mrk_err_failed_alloc,
+  mrk_err_unexpected_token,
+  mrk_err_invalid,
 } mrk_err;
 
 #endif
diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h
index 49c7af9..bb181c0 100644
--- a/include/mrk/lexer.h
+++ b/include/mrk/lexer.h
@@ -37,6 +37,7 @@ typedef enum mrk_token_type {
   mrk_token_type_word,
   mrk_token_type_checked_box,
   mrk_token_type_unchecked_box,
+  mrk_token_type_text,
 } mrk_token_type;
 
 typedef struct mrk_token {
@@ -45,6 +46,17 @@ typedef struct mrk_token {
   size_t end;
 } mrk_token;
 
+/**
+ * Return the length of the given token.
+ */
+size_t mrk_token_len(mrk_token t);
+
+/**
+ * Store a pointer to the start of the token's lexeme (inside the lexer's
+ * input buffer) in out, and return the length of the lexeme.
+ */
+size_t mrk_token_lexeme(const char **out, mrk_lexer *lexer, mrk_token t);
+
 /**
  * Initialize a new lexer struct.
  */
diff --git a/include/mrk/parser.h b/include/mrk/parser.h
index ace979c..8bd0c33 100644
--- a/include/mrk/parser.h
+++ b/include/mrk/parser.h
@@ -5,6 +5,8 @@
 #include "mrk/common.h"
 #include "mrk/lexer.h"
 
+#define MRK_MAX_HEADER_LEN 6
+
 typedef struct mrk_parser mrk_parser;
 
 /**
diff --git a/src/_include/mrk/parser_internal.h b/src/_include/mrk/parser_internal.h
index 9a567c1..86b1953 100644
--- a/src/_include/mrk/parser_internal.h
+++ b/src/_include/mrk/parser_internal.h
@@ -5,6 +5,17 @@
 #include "mrk/parser.h"
 
 #define MRK_PARSER_LOOKAHEAD_BUF_SIZE 4
+
+/**
+ * Record the offending token and a message in the parser's error state, then
+ * return mrk_err_invalid from the calling function.
+ */
+#define MRK_PARSE_ERR(p, t, m)                                                 \
+  do {                                                                         \
+    (p)->error.token = (t);                                                    \
+    (p)->error.msg = (m);                                                      \
+    return mrk_err_invalid;                                                    \
+  } while (0)
 
 struct mrk_parser {
   mrk_lexer *lexer;
@@ -12,6 +23,10 @@ struct mrk_parser {
     mrk_token buf[MRK_PARSER_LOOKAHEAD_BUF_SIZE];
     size_t index;
   } lookahead;
+  struct {
+    mrk_token token;
+    const char *msg;
+  } error;
 };
 
 /**
@@ -34,4 +49,9 @@ void mrk_parser_advance(mrk_parser *parser);
 
 mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser);
 
+/**
+ * Parse a header block starting at the current pounds token.
+ */
+mrk_err mrk_parser_parse_header(mrk_ast_node **out, mrk_parser *parser);
+
 #endif
diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c
index eed61ff..20efe96 100644
--- a/src/lexer/lexer.c
+++ b/src/lexer/lexer.c
@@ -129,6 +129,129 @@ void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
   lexer->token.emitted = true;
 }
 
+/**
+ * Lex a single token starting at a character with special meaning and emit it
+ * through out. Any other input is lexed as a word.
+ */
+void mrk_lexer_lex_special(mrk_token *out, mrk_lexer *lexer) {
+  char c = mrk_lexer_advance(lexer);
+  switch (c) {
+  // All these characters have multiple meanings depending on their location
+  // in the file and how many there are, so the lexer can only match them as
+  // one or more grouped characters
+  case '#':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
+    break;
+  case '`':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
+    break;
+  case '-':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
+    break;
+  case '_':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
+    break;
+  case '*':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_stars);
+    break;
+  case '=':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_equals);
+    break;
+  case '\t':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_tabs);
+    break;
+  case '>':
+    mrk_lexer_advance_eq(lexer, c);
+    mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_brackets);
+    break;
+  case '!':
+    mrk_lexer_emit(out, lexer, mrk_token_type_bang);
+    break;
+  case '[':
+    // Checkboxes for lists are lexed separately to simplify the parser later
+    // on
+    if (mrk_lexer_peek_str(lexer, " ]")) {
+      mrk_lexer_advance_n(lexer, 2);
+      mrk_lexer_emit(out, lexer, mrk_token_type_unchecked_box);
+    } else if (mrk_lexer_peek_str(lexer, "x]")) {
+      mrk_lexer_advance_n(lexer, 2);
+      mrk_lexer_emit(out, lexer, mrk_token_type_checked_box);
+    } else {
+      mrk_lexer_emit(out, lexer, mrk_token_type_left_bracket);
+    }
+    break;
+  case ']':
+    mrk_lexer_emit(out, lexer, mrk_token_type_right_bracket);
+    break;
+  case '(':
+    mrk_lexer_emit(out, lexer, mrk_token_type_left_paren);
+    break;
+  case ')':
+    mrk_lexer_emit(out, lexer, mrk_token_type_right_paren);
+    break;
+  case '\\':
+    if (mrk_lexer_peek(lexer) == '\n') {
+      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
+    } else {
+      mrk_lexer_emit(out, lexer, mrk_token_type_backslash);
+    }
+    break;
+  // Two consecutive newlines constitute a blank line, otherwise they're
+  // ignored as whitespace
+  case '\n':
+    if (mrk_lexer_peek(lexer) == '\n') {
+      mrk_lexer_advance(lexer);
+      mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
+    } else {
+      mrk_lexer_emit(out, lexer, mrk_token_type_newline);
+    }
+    break;
+  case ' ': {
+    // Either a double space or a line break
+    if (mrk_lexer_peek_str(lexer, " \n")) {
+      mrk_lexer_advance_n(lexer, 2);
+
+      mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
+    } else {
+      mrk_lexer_advance_eq(lexer, ' ');
+      mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
+    }
+  } break;
+  default: {
+    // Match ordered list headers
+    if (isdigit((unsigned char)c)) {
+      // The first digit has already been consumed into c
+      while (isdigit((unsigned char)mrk_lexer_peek(lexer))) {
+        mrk_lexer_advance(lexer);
+      }
+
+      if (mrk_lexer_peek(lexer) == '.') {
+        mrk_lexer_advance(lexer);
+        mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number);
+      }
+      // Doesn't end with a dot, so it's just a word that happens to start
+      // with a number
+      else {
+        mrk_lexer_advance_word(lexer);
+        mrk_lexer_emit(out, lexer, mrk_token_type_word);
+      }
+    }
+    // Any other special scenarios we simply parse as a word
+    else {
+      mrk_lexer_advance_word(lexer);
+      mrk_lexer_emit(out, lexer, mrk_token_type_word);
+    }
+  } break;
+  }
+}
+
 mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
   if (mrk_lexer_done(lexer)) {
     return mrk_lexer_err_done;
@@ -258,3 +381,11 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
 
   return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
 }
+
+size_t mrk_token_len(mrk_token t) { return t.end - t.start; }
+
+size_t mrk_token_lexeme(const char **out, mrk_lexer *lexer, mrk_token t) {
+  *out = lexer->buf.s + t.start;
+
+  return mrk_token_len(t);
+}
diff --git a/src/parser/parser.c b/src/parser/parser.c
index 671db05..9d8a369 100644
--- a/src/parser/parser.c
+++ b/src/parser/parser.c
@@ -26,4 +26,62 @@ mrk_err mrk_parser_parse(mrk_ast_node **out, mrk_parser *parser) {
   return mrk_err_ok;
 }
 
-mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser) {}
+/**
+ * Parse the next block by dispatching on the type of the upcoming token.
+ * A token that cannot start a block is recorded as a parse error.
+ */
+mrk_err mrk_parser_parse_block(mrk_ast_node **out, mrk_parser *parser) {
+  mrk_token t = mrk_parser_peek(parser);
+
+  mrk_err (*parse_fn)(mrk_ast_node **, mrk_parser *) = NULL;
+
+  switch (t.type) {
+  case mrk_token_type_pounds: {
+    parse_fn = mrk_parser_parse_header;
+    break;
+  }
+  default:
+    break;
+  }
+
+  if (parse_fn == NULL) {
+    MRK_PARSE_ERR(parser, t, "Unexpected token.");
+  }
+
+  return parse_fn(out, parser);
+}
+
+/**
+ * Parse a header block. The length of the leading pounds token is stored as
+ * the node's first numeric argument.
+ */
+mrk_err mrk_parser_parse_header(mrk_ast_node **out, mrk_parser *parser) {
+  mrk_token t = mrk_parser_peek(parser);
+
+  if (mrk_token_len(t) > MRK_MAX_HEADER_LEN) {
+    MRK_PARSE_ERR(parser, t, "Headers can be at most 6 levels deep.");
+  }
+
+  mrk_parser_advance(parser);
+
+  mrk_ast_node *header;
+  MRK_RES(mrk_ast_node_init(&header));
+  header->type = mrk_ast_node_type_header;
+  header->args[0].num = mrk_token_len(t);
+
+  // Headers are blocks of their own, so they're delimited by blank lines
+  while (!mrk_parser_done(parser) &&
+         mrk_parser_peek(parser).type != mrk_token_type_blank_line) {
+    // TODO: turn the header's content into child nodes. Every iteration has
+    // to consume a token, otherwise this loop never terminates.
+    mrk_parser_advance(parser);
+  }
+
+  // Skip the blank line, unless the input ended before one was found
+  if (!mrk_parser_done(parser)) {
+    mrk_parser_advance(parser);
+  }
+
+  *out = header;
+  return mrk_err_ok;
+}