feat(lexer): matching more things
parent
f003e3555b
commit
f6e034097d
|
@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
|
||||||
} mrk_lexer_err;
|
} mrk_lexer_err;
|
||||||
|
|
||||||
typedef enum mrk_token_type {
|
typedef enum mrk_token_type {
|
||||||
mrk_token_type_header = 0,
|
mrk_token_type_pounds = 0,
|
||||||
|
mrk_token_type_backticks,
|
||||||
|
mrk_token_type_dashes,
|
||||||
|
mrk_token_type_underscores,
|
||||||
|
mrk_token_type_stars,
|
||||||
mrk_token_type_blank_line,
|
mrk_token_type_blank_line,
|
||||||
mrk_token_type_star_star,
|
mrk_token_type_space,
|
||||||
mrk_token_type_space_space,
|
|
||||||
mrk_token_type_line_break,
|
mrk_token_type_line_break,
|
||||||
|
mrk_token_type_right_angle_bracket,
|
||||||
|
mrk_token_type_tab,
|
||||||
} mrk_token_type;
|
} mrk_token_type;
|
||||||
|
|
||||||
typedef struct mrk_token {
|
typedef struct mrk_token {
|
||||||
|
|
|
@ -26,6 +26,12 @@ struct mrk_lexer {
|
||||||
*/
|
*/
|
||||||
char mrk_lexer_peek(mrk_lexer *lexer);
|
char mrk_lexer_peek(mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the n'th next character that would be consumed. If `n` is zero, this
|
||||||
|
* function is equivalent to calling peek.
|
||||||
|
*/
|
||||||
|
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns true if the nul-terminated string s is equal to the next characters
|
* Returns true if the nul-terminated string s is equal to the next characters
|
||||||
* in the token stream.
|
* in the token stream.
|
||||||
|
@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
|
||||||
*/
|
*/
|
||||||
char mrk_lexer_advance(mrk_lexer *lexer);
|
char mrk_lexer_advance(mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
 * Consume consecutive characters equal to c, stopping as soon as peek would
 * return a different character.
|
||||||
|
*/
|
||||||
|
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Advance `n` positions; equivalent to running advance `n` times and returning
|
* Advance `n` positions; equivalent to running advance `n` times and returning
|
||||||
* the last call's result.
|
* the last call's result.
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
#include <ctype.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "mrk/lexer.h"
|
#include "mrk/lexer.h"
|
||||||
#include "mrk/lexer_internal.h"
|
#include "mrk/lexer_internal.h"
|
||||||
|
|
||||||
|
@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
|
||||||
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
|
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
|
||||||
while (n > 1) {
|
while (n > 1) {
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_advance(lexer);
|
||||||
|
n--;
|
||||||
}
|
}
|
||||||
|
|
||||||
return mrk_lexer_advance(lexer);
|
return mrk_lexer_advance(lexer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Consume a run of characters equal to `c`.
 *
 * Keeps calling advance for as long as peek reports `c` as the next character;
 * returns with the lexer positioned on the first character that differs.
 *
 * NOTE(review): peek returns '\0' once the lexer is done, so passing
 * c == '\0' keeps the condition true at end of input; whether the loop then
 * terminates depends on what advance does there -- confirm before relying on
 * it.
 */
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  for (;;) {
    if (mrk_lexer_peek(lexer) != c) {
      return;
    }

    mrk_lexer_advance(lexer);
  }
}
|
||||||
|
|
||||||
char mrk_lexer_peek(mrk_lexer *lexer) {
|
char mrk_lexer_peek(mrk_lexer *lexer) {
|
||||||
if (mrk_lexer_done(lexer)) {
|
if (mrk_lexer_done(lexer)) {
|
||||||
return '\0';
|
return '\0';
|
||||||
|
@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
|
||||||
return lexer->buf.s[lexer->pos.buf_index];
|
return lexer->buf.s[lexer->pos.buf_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Look ahead `n` characters past the current position without consuming
 * anything. With `n == 0` this inspects the same character as peek.
 *
 * Returns '\0' if the end of the input lies at or before the requested
 * position. The end of input is either the index `buf.len` (only when
 * `buf.len` is non-zero) or the first nul byte, whichever comes first.
 *
 * NOTE(review): a zero `buf.len` appears to mean "nul-terminated buffer of
 * unknown length" -- confirm against mrk_lexer_open.
 */
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  const size_t start = lexer->pos.buf_index;

  // Every offset from 0 through n inclusive has to be validated: stopping one
  // short would return (or read past) the end of the buffer when the target
  // position itself is the end of input.
  for (size_t i = 0;; i++) {
    const size_t idx = start + i;

    // The length check comes first so an explicitly sized buffer is never
    // indexed at buf.len
    if (lexer->buf.len > 0 && idx == lexer->buf.len) {
      return '\0';
    }

    if (lexer->buf.s[idx] == '\0') {
      return '\0';
    }

    if (i == n) {
      return lexer->buf.s[idx];
    }
  }
}
|
||||||
|
|
||||||
|
/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
|
||||||
|
/* size_t s_len = strlen(s); */
|
||||||
|
/* if (mrk_lexer_done(lexer) && s[0] != '\0') { */
|
||||||
|
/* return false; */
|
||||||
|
/* } */
|
||||||
|
|
||||||
|
/* } */
|
||||||
|
|
||||||
void mrk_lexer_reset(mrk_lexer *lexer) {
|
void mrk_lexer_reset(mrk_lexer *lexer) {
|
||||||
lexer->token.start = lexer->pos.buf_index;
|
lexer->token.start = lexer->pos.buf_index;
|
||||||
lexer->token.end = lexer->pos.buf_index;
|
lexer->token.end = lexer->pos.buf_index;
|
||||||
|
@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
||||||
mrk_lexer_reset(lexer);
|
mrk_lexer_reset(lexer);
|
||||||
|
|
||||||
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
|
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
|
||||||
switch (mrk_lexer_advance(lexer)) {
|
char c = mrk_lexer_advance(lexer);
|
||||||
// Match one or more hashtags as a single header definition
|
switch (c) {
|
||||||
|
// All these characters have multiple meanings depending on their location
|
||||||
|
// in the file and how many there are
|
||||||
case '#':
|
case '#':
|
||||||
while (mrk_lexer_peek(lexer) == '#') {
|
mrk_lexer_advance_eq(lexer, c);
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
|
||||||
}
|
break;
|
||||||
|
case '`':
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_header);
|
mrk_lexer_advance_eq(lexer, c);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
|
||||||
|
break;
|
||||||
|
case '-':
|
||||||
|
mrk_lexer_advance_eq(lexer, c);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
|
||||||
|
break;
|
||||||
|
case '_':
|
||||||
|
mrk_lexer_advance_eq(lexer, c);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
|
||||||
|
break;
|
||||||
|
case '*':
|
||||||
|
mrk_lexer_advance_eq(lexer, c);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_stars);
|
||||||
break;
|
break;
|
||||||
// Two consecutive newlines constitute a blank line, otherwise they're
|
// Two consecutive newlines constitute a blank line, otherwise they're
|
||||||
// ignored as whitespace
|
// ignored as whitespace
|
||||||
|
@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case ' ': {
|
case ' ': {
|
||||||
|
/* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
|
||||||
// Either a double space or a line break
|
// Either a double space or a line break
|
||||||
if (mrk_lexer_peek(lexer) == ' ') {
|
if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
|
||||||
mrk_lexer_advance(lexer);
|
mrk_lexer_advance_n(lexer, 2);
|
||||||
|
|
||||||
if (mrk_lexer_peek(lexer) == '\n') {
|
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||||
mrk_lexer_advance(lexer);
|
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
|
||||||
} else {
|
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
mrk_lexer_reset(lexer);
|
mrk_lexer_emit(out, lexer, mrk_token_type_space);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
/* case '*': */
|
case '\t':
|
||||||
/* if (mrk_lexer_peek(lexer) == '*') { */
|
mrk_lexer_emit(out, lexer, mrk_token_type_tab);
|
||||||
/* mrk_lexer_advance(lexer); */
|
break;
|
||||||
/* mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
|
case '>':
|
||||||
/* } else { */
|
mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
|
||||||
/* // TODO match word */
|
break;
|
||||||
/* } */
|
|
||||||
/* default: */
|
|
||||||
/* return mrk_lexer_err_unexpected_char; */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,14 +14,41 @@ void test_lexer_header() {
|
||||||
|
|
||||||
mrk_token t;
|
mrk_token t;
|
||||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
TEST_CHECK(t.type == mrk_token_type_header);
|
TEST_CHECK(t.type == mrk_token_type_pounds);
|
||||||
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
|
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
|
||||||
TEST_CHECK(t.end == 4);
|
TEST_CHECK(t.end == 4);
|
||||||
|
|
||||||
mrk_lexer_free(lxr);
|
mrk_lexer_free(lxr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void test_lexer_line_break() {
|
||||||
|
LEXER_INIT();
|
||||||
|
|
||||||
|
const char *buf = " \n";
|
||||||
|
mrk_lexer_open(lxr, buf, 0);
|
||||||
|
|
||||||
|
mrk_token t;
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_line_break);
|
||||||
|
|
||||||
|
TEST_CHECK(mrk_lexer_done(lxr));
|
||||||
|
|
||||||
|
const char *buf2 = " ";
|
||||||
|
mrk_lexer_open(lxr, buf2, 0);
|
||||||
|
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_space);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_space);
|
||||||
|
|
||||||
|
TEST_CHECK(mrk_lexer_done(lxr));
|
||||||
|
|
||||||
|
mrk_lexer_free(lxr);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
TEST_LIST = {
|
TEST_LIST = {
|
||||||
{ "lexer header", test_lexer_header },
|
{ "lexer header", test_lexer_header },
|
||||||
|
{ "lexer line break", test_lexer_line_break},
|
||||||
{ NULL, NULL }
|
{ NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue