feat(lexer): matching more things

main
Jef Roosens 2024-03-04 14:50:00 +01:00
parent f003e3555b
commit f6e034097d
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
4 changed files with 111 additions and 29 deletions

View File

@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
} mrk_lexer_err; } mrk_lexer_err;
typedef enum mrk_token_type { typedef enum mrk_token_type {
mrk_token_type_header = 0, mrk_token_type_pounds = 0,
mrk_token_type_backticks,
mrk_token_type_dashes,
mrk_token_type_underscores,
mrk_token_type_stars,
mrk_token_type_blank_line, mrk_token_type_blank_line,
mrk_token_type_star_star, mrk_token_type_space,
mrk_token_type_space_space,
mrk_token_type_line_break, mrk_token_type_line_break,
mrk_token_type_right_angle_bracket,
mrk_token_type_tab,
} mrk_token_type; } mrk_token_type;
typedef struct mrk_token { typedef struct mrk_token {

View File

@ -26,6 +26,12 @@ struct mrk_lexer {
*/ */
char mrk_lexer_peek(mrk_lexer *lexer); char mrk_lexer_peek(mrk_lexer *lexer);
/**
 * Look ahead in the input without consuming anything: return the character
 * located `n` positions past the next character to be consumed. If `n` is
 * zero, this function is equivalent to calling peek.
 *
 * Returns '\0' if the end of the input is reached before that position.
 */
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
/** /**
* Returns true if the nul-terminated string s is equal to the next characters * Returns true if the nul-terminated string s is equal to the next characters
* in the token stream. * in the token stream.
@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
*/ */
char mrk_lexer_advance(mrk_lexer *lexer); char mrk_lexer_advance(mrk_lexer *lexer);
/**
 * Consume a run of identical characters: keep advancing for as long as the
 * next character to be consumed is equal to `c`. Does nothing if the next
 * character already differs from `c`.
 */
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
/** /**
* Advance `n` positions; equivalent to running advance `n` times and returning * Advance `n` positions; equivalent to running advance `n` times and returning
* the last call's result. * the last call's result.

View File

@ -1,3 +1,6 @@
#include <ctype.h>
#include <stdio.h>
#include "mrk/lexer.h" #include "mrk/lexer.h"
#include "mrk/lexer_internal.h" #include "mrk/lexer_internal.h"
@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) { char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
while (n > 1) { while (n > 1) {
mrk_lexer_advance(lexer); mrk_lexer_advance(lexer);
n--;
} }
return mrk_lexer_advance(lexer); return mrk_lexer_advance(lexer);
} }
/**
 * Consume a run of identical characters: keep advancing for as long as the
 * next character to be consumed is equal to `c`.
 *
 * The explicit done-check matters when `c` is '\0': peek reports '\0' once the
 * lexer is done, so without the guard the loop would never terminate at the
 * end of the input.
 */
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  while (!mrk_lexer_done(lexer) && mrk_lexer_peek(lexer) == c) {
    mrk_lexer_advance(lexer);
  }
}
char mrk_lexer_peek(mrk_lexer *lexer) { char mrk_lexer_peek(mrk_lexer *lexer) {
if (mrk_lexer_done(lexer)) { if (mrk_lexer_done(lexer)) {
return '\0'; return '\0';
@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
return lexer->buf.s[lexer->pos.buf_index]; return lexer->buf.s[lexer->pos.buf_index];
} }
/**
 * Look ahead without consuming: return the character `n` positions past the
 * next character to be consumed. With `n` equal to zero this matches peek.
 *
 * Returns '\0' if the input ends at or before the requested position, i.e. if
 * any offset in [0, n] hits the end of a length-bounded buffer or a nul byte.
 */
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  // Every offset up to AND including n must be validated: the character at
  // offset n is the one being returned, so stopping at n - 1 would read one
  // past the end of a length-bounded buffer (and would skip the check
  // entirely for n == 0).
  for (size_t i = 0; i <= n; i++) {
    size_t index = lexer->pos.buf_index + i;

    // A buffer length of zero means the buffer is treated as nul-terminated
    // only; the bound check must come first so `s[index]` is never read when
    // index is already at the end of a length-bounded buffer.
    if ((lexer->buf.len > 0 && index == lexer->buf.len) ||
        lexer->buf.s[index] == '\0') {
      return '\0';
    }
  }

  return lexer->buf.s[lexer->pos.buf_index + n];
}
/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
/* size_t s_len = strlen(s); */
/* if (mrk_lexer_done(lexer) && s[0] != '\0') { */
/* return false; */
/* } */
/* } */
void mrk_lexer_reset(mrk_lexer *lexer) { void mrk_lexer_reset(mrk_lexer *lexer) {
lexer->token.start = lexer->pos.buf_index; lexer->token.start = lexer->pos.buf_index;
lexer->token.end = lexer->pos.buf_index; lexer->token.end = lexer->pos.buf_index;
@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_reset(lexer); mrk_lexer_reset(lexer);
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) { while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
switch (mrk_lexer_advance(lexer)) { char c = mrk_lexer_advance(lexer);
// Match one or more hashtags as a single header definition switch (c) {
// All these characters have multiple meanings depending on their location
// in the file and how many there are
case '#': case '#':
while (mrk_lexer_peek(lexer) == '#') { mrk_lexer_advance_eq(lexer, c);
mrk_lexer_advance(lexer); mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
} break;
case '`':
mrk_lexer_emit(out, lexer, mrk_token_type_header); mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
break;
case '-':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
break;
case '_':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
break;
case '*':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_stars);
break; break;
// Two consecutive newlines constitute a blank line, otherwise they're // Two consecutive newlines constitute a blank line, otherwise they're
// ignored as whitespace // ignored as whitespace
@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
} }
break; break;
case ' ': { case ' ': {
/* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
// Either a double space or a line break // Either a double space or a line break
if (mrk_lexer_peek(lexer) == ' ') { if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
mrk_lexer_advance(lexer); mrk_lexer_advance_n(lexer, 2);
if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break); mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else { } else {
mrk_lexer_emit(out, lexer, mrk_token_type_space_space); mrk_lexer_emit(out, lexer, mrk_token_type_space);
}
} else {
mrk_lexer_reset(lexer);
} }
} break; } break;
/* case '*': */ case '\t':
/* if (mrk_lexer_peek(lexer) == '*') { */ mrk_lexer_emit(out, lexer, mrk_token_type_tab);
/* mrk_lexer_advance(lexer); */ break;
/* mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */ case '>':
/* } else { */ mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
/* // TODO match word */ break;
/* } */
/* default: */
/* return mrk_lexer_err_unexpected_char; */
} }
} }

View File

@ -14,14 +14,41 @@ void test_lexer_header() {
mrk_token t; mrk_token t;
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok); TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_header); TEST_CHECK(t.type == mrk_token_type_pounds);
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start); TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
TEST_CHECK(t.end == 4); TEST_CHECK(t.end == 4);
mrk_lexer_free(lxr); mrk_lexer_free(lxr);
} }
/**
 * Checks how the lexer tokenises runs of spaces: two spaces followed by a
 * newline form a single line-break token, while two spaces without a trailing
 * newline are emitted as two separate space tokens.
 */
void test_lexer_line_break() {
  LEXER_INIT();

  // Two spaces directly before the newline are required for a line break; a
  // single space followed by a newline would only produce a space token.
  const char *buf = "  \n";
  mrk_lexer_open(lxr, buf, 0);

  mrk_token t;
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_line_break);
  TEST_CHECK(mrk_lexer_done(lxr));

  // Without the trailing newline, each of the two spaces is its own token.
  const char *buf2 = "  ";
  mrk_lexer_open(lxr, buf2, 0);

  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_space);
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_space);
  TEST_CHECK(mrk_lexer_done(lxr));

  mrk_lexer_free(lxr);
}
TEST_LIST = { TEST_LIST = {
{ "lexer header", test_lexer_header }, { "lexer header", test_lexer_header },
{ "lexer line break", test_lexer_line_break},
{ NULL, NULL } { NULL, NULL }
}; };