feat(lexer): matching more things
parent
f003e3555b
commit
f6e034097d
|
@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
|
|||
} mrk_lexer_err;
|
||||
|
||||
typedef enum mrk_token_type {
|
||||
mrk_token_type_header = 0,
|
||||
mrk_token_type_pounds = 0,
|
||||
mrk_token_type_backticks,
|
||||
mrk_token_type_dashes,
|
||||
mrk_token_type_underscores,
|
||||
mrk_token_type_stars,
|
||||
mrk_token_type_blank_line,
|
||||
mrk_token_type_star_star,
|
||||
mrk_token_type_space_space,
|
||||
mrk_token_type_space,
|
||||
mrk_token_type_line_break,
|
||||
mrk_token_type_right_angle_bracket,
|
||||
mrk_token_type_tab,
|
||||
} mrk_token_type;
|
||||
|
||||
typedef struct mrk_token {
|
||||
|
|
|
@ -26,6 +26,12 @@ struct mrk_lexer {
|
|||
*/
|
||||
char mrk_lexer_peek(mrk_lexer *lexer);
|
||||
|
||||
/**
|
||||
* Return the n'th next character that would be consumed. If `n` is zero, this
|
||||
* function is equivalent to calling peek.
|
||||
*/
|
||||
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
|
||||
|
||||
/**
|
||||
* Returns true if the nul-terminated string s is equal to the next characters
|
||||
* in the token stream.
|
||||
|
@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
|
|||
*/
|
||||
char mrk_lexer_advance(mrk_lexer *lexer);
|
||||
|
||||
/**
|
||||
* Advance until the next element to peek is not equal to c.
|
||||
*/
|
||||
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
|
||||
|
||||
/**
|
||||
* Advance `n` positions; equivalent to running advance `n` times and returning
|
||||
* the last call's result.
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "mrk/lexer.h"
|
||||
#include "mrk/lexer_internal.h"
|
||||
|
||||
|
@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
|
|||
// Advance the lexer repeatedly and hand back the character produced by the
// final advance. All but the last step discard their result; note that a
// single advance is always performed, even when n is zero or one.
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  for (size_t remaining = n; remaining > 1; remaining--) {
    mrk_lexer_advance(lexer);
  }

  char last = mrk_lexer_advance(lexer);
  return last;
}
|
||||
|
||||
// Consume characters for as long as the next character to peek equals c.
//
// The explicit done check matters because mrk_lexer_peek reports '\0' once the
// lexer is done: without it, a call with c == '\0' at the end of the input
// could keep looping, as peek would keep matching.
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  while (!mrk_lexer_done(lexer) && mrk_lexer_peek(lexer) == c) {
    mrk_lexer_advance(lexer);
  }
}
|
||||
|
||||
char mrk_lexer_peek(mrk_lexer *lexer) {
|
||||
if (mrk_lexer_done(lexer)) {
|
||||
return '\0';
|
||||
|
@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
|
|||
return lexer->buf.s[lexer->pos.buf_index];
|
||||
}
|
||||
|
||||
// Return the character n positions past the current one without consuming
// anything, or '\0' if the end of the input is reached at or before that
// position. With n == 0 the current position itself is checked, so the result
// is '\0' when the lexer sits at the end of its input.
//
// The end of the input is either the first nul byte or, when buf.len is
// non-zero, the position equal to buf.len (a zero length presumably means the
// buffer is nul-terminated -- confirm against mrk_lexer_open).
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  size_t offset = 0;

  // Every position from the current one up to and including the target has to
  // be checked: stopping one short would read past the end of a
  // length-bounded buffer when the target lands exactly on buf.len.
  for (;;) {
    size_t index = lexer->pos.buf_index + offset;

    if (lexer->buf.len > 0 && index >= lexer->buf.len) {
      return '\0';
    }

    if (lexer->buf.s[index] == '\0') {
      return '\0';
    }

    if (offset == n) {
      return lexer->buf.s[index];
    }

    offset++;
  }
}
|
||||
|
||||
/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
|
||||
/* size_t s_len = strlen(s); */
|
||||
/* if (mrk_lexer_done(lexer) && s[0] != '\0') { */
|
||||
/* return false; */
|
||||
/* } */
|
||||
|
||||
/* } */
|
||||
|
||||
void mrk_lexer_reset(mrk_lexer *lexer) {
|
||||
lexer->token.start = lexer->pos.buf_index;
|
||||
lexer->token.end = lexer->pos.buf_index;
|
||||
|
@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
|||
mrk_lexer_reset(lexer);
|
||||
|
||||
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
|
||||
switch (mrk_lexer_advance(lexer)) {
|
||||
// Match one or more hashtags as a single header definition
|
||||
char c = mrk_lexer_advance(lexer);
|
||||
switch (c) {
|
||||
// All these characters have multiple meanings depending on their location
|
||||
// in the file and how many there are
|
||||
case '#':
|
||||
while (mrk_lexer_peek(lexer) == '#') {
|
||||
mrk_lexer_advance(lexer);
|
||||
}
|
||||
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_header);
|
||||
mrk_lexer_advance_eq(lexer, c);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
|
||||
break;
|
||||
case '`':
|
||||
mrk_lexer_advance_eq(lexer, c);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
|
||||
break;
|
||||
case '-':
|
||||
mrk_lexer_advance_eq(lexer, c);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
|
||||
break;
|
||||
case '_':
|
||||
mrk_lexer_advance_eq(lexer, c);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
|
||||
break;
|
||||
case '*':
|
||||
mrk_lexer_advance_eq(lexer, c);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_stars);
|
||||
break;
|
||||
// Two consecutive newlines constitute a blank line, otherwise they're
|
||||
// ignored as whitespace
|
||||
|
@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
|||
}
|
||||
break;
|
||||
case ' ': {
|
||||
/* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
|
||||
// Either a double space or a line break
|
||||
if (mrk_lexer_peek(lexer) == ' ') {
|
||||
mrk_lexer_advance(lexer);
|
||||
if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
|
||||
mrk_lexer_advance_n(lexer, 2);
|
||||
|
||||
if (mrk_lexer_peek(lexer) == '\n') {
|
||||
mrk_lexer_advance(lexer);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||
} else {
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
|
||||
}
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||
} else {
|
||||
mrk_lexer_reset(lexer);
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_space);
|
||||
}
|
||||
} break;
|
||||
/* case '*': */
|
||||
/* if (mrk_lexer_peek(lexer) == '*') { */
|
||||
/* mrk_lexer_advance(lexer); */
|
||||
/* mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
|
||||
/* } else { */
|
||||
/* // TODO match word */
|
||||
/* } */
|
||||
/* default: */
|
||||
/* return mrk_lexer_err_unexpected_char; */
|
||||
case '\t':
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_tab);
|
||||
break;
|
||||
case '>':
|
||||
mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -14,14 +14,41 @@ void test_lexer_header() {
|
|||
|
||||
mrk_token t;
|
||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||
TEST_CHECK(t.type == mrk_token_type_header);
|
||||
TEST_CHECK(t.type == mrk_token_type_pounds);
|
||||
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
|
||||
TEST_CHECK(t.end == 4);
|
||||
|
||||
mrk_lexer_free(lxr);
|
||||
}
|
||||
|
||||
void test_lexer_line_break() {
|
||||
LEXER_INIT();
|
||||
|
||||
const char *buf = " \n";
|
||||
mrk_lexer_open(lxr, buf, 0);
|
||||
|
||||
mrk_token t;
|
||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||
TEST_CHECK(t.type == mrk_token_type_line_break);
|
||||
|
||||
TEST_CHECK(mrk_lexer_done(lxr));
|
||||
|
||||
const char *buf2 = " ";
|
||||
mrk_lexer_open(lxr, buf2, 0);
|
||||
|
||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||
TEST_CHECK(t.type == mrk_token_type_space);
|
||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||
TEST_CHECK(t.type == mrk_token_type_space);
|
||||
|
||||
TEST_CHECK(mrk_lexer_done(lxr));
|
||||
|
||||
mrk_lexer_free(lxr);
|
||||
|
||||
}
|
||||
|
||||
TEST_LIST = {
|
||||
{ "lexer header", test_lexer_header },
|
||||
{ "lexer line break", test_lexer_line_break},
|
||||
{ NULL, NULL }
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue