feat(lexer): matching more things

main
Jef Roosens 2024-03-04 14:50:00 +01:00
parent f003e3555b
commit f6e034097d
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
4 changed files with 111 additions and 29 deletions

View File

@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
} mrk_lexer_err;
/**
 * Kinds of token the lexer can emit.
 */
typedef enum mrk_token_type {
// A run of one or more '#' characters.
mrk_token_type_header = 0,
// A run of one or more '#' characters.
// NOTE(review): shares the value 0 with mrk_token_type_header, so the two
// names are indistinguishable at runtime - confirm which one should remain.
mrk_token_type_pounds = 0,
// A run of one or more '`' characters.
mrk_token_type_backticks,
// A run of one or more '-' characters.
mrk_token_type_dashes,
// A run of one or more '_' characters.
mrk_token_type_underscores,
// A run of one or more '*' characters.
mrk_token_type_stars,
// Presumably two consecutive newlines; the emitting code is not fully
// visible here - TODO confirm.
mrk_token_type_blank_line,
// NOTE(review): only referenced from commented-out lexer code ("**");
// confirm whether this is still needed.
mrk_token_type_star_star,
// Two spaces that are not followed by a newline.
mrk_token_type_space_space,
// A single space character.
mrk_token_type_space,
// Two spaces directly followed by a newline.
mrk_token_type_line_break,
// A single '>' character.
mrk_token_type_right_angle_bracket,
// A single tab character.
mrk_token_type_tab,
} mrk_token_type;
typedef struct mrk_token {

View File

@ -26,6 +26,12 @@ struct mrk_lexer {
*/
char mrk_lexer_peek(mrk_lexer *lexer);
/**
* Return the character `n` positions past the one peek would return, without
* consuming anything. If `n` is zero, this function is equivalent to calling
* peek.
*
* Returns '\0' when the end of the input lies before that position.
*/
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
/**
* Returns true if the nul-terminated string s is equal to the next characters
* in the token stream.
@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
*/
char mrk_lexer_advance(mrk_lexer *lexer);
/**
* Consume characters for as long as the next character to peek is equal to
* `c`. Nothing is consumed if the next character already differs from `c`.
*/
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
/**
* Advance `n` positions; equivalent to running advance `n` times and returning
* the last call's result.

View File

@ -1,3 +1,6 @@
#include <ctype.h>
#include <stdio.h>
#include "mrk/lexer.h"
#include "mrk/lexer_internal.h"
@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
/**
 * Advance the lexer `n` positions; equivalent to calling mrk_lexer_advance
 * `n` times.
 *
 * Returns the result of the last advance call, or '\0' without consuming
 * anything when `n` is zero.
 */
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  // Zero advances must not consume input; there is no "last call" to report.
  if (n == 0) {
    return '\0';
  }

  // Perform the first n - 1 advances, discarding their results.
  for (size_t i = 1; i < n; i++) {
    mrk_lexer_advance(lexer);
  }

  // The final advance provides the return value.
  return mrk_lexer_advance(lexer);
}
/**
 * Consume characters for as long as the next character to peek equals `c`.
 *
 * Stops at the end of the input regardless of `c`: peek reports '\0' once the
 * lexer is done, so without the explicit done check a `c` of '\0' would keep
 * the loop condition true forever.
 */
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  while (!mrk_lexer_done(lexer) && mrk_lexer_peek(lexer) == c) {
    mrk_lexer_advance(lexer);
  }
}
char mrk_lexer_peek(mrk_lexer *lexer) {
if (mrk_lexer_done(lexer)) {
return '\0';
@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
return lexer->buf.s[lexer->pos.buf_index];
}
/**
 * Return the character `n` positions past the current one without consuming
 * anything. With `n` equal to zero this inspects the current position.
 *
 * Returns '\0' if the end of the input is reached at any offset from 0 up to
 * and including `n`. The end is either the explicit buffer length (when
 * `buf.len` is non-zero) or a nul terminator.
 */
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  // The scan is inclusive of offset n: the character that gets returned must
  // itself be inside the buffer, otherwise a length-bounded buffer would be
  // read one past its end.
  for (size_t i = 0; i <= n; i++) {
    const size_t index = lexer->pos.buf_index + i;

    // The length check comes first so that buf.s[buf.len] is never read.
    if ((lexer->buf.len > 0 && index == lexer->buf.len) ||
        lexer->buf.s[index] == '\0') {
      return '\0';
    }
  }

  return lexer->buf.s[lexer->pos.buf_index + n];
}
/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
/* size_t s_len = strlen(s); */
/* if (mrk_lexer_done(lexer) && s[0] != '\0') { */
/* return false; */
/* } */
/* } */
void mrk_lexer_reset(mrk_lexer *lexer) {
lexer->token.start = lexer->pos.buf_index;
lexer->token.end = lexer->pos.buf_index;
@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
mrk_lexer_reset(lexer);
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
switch (mrk_lexer_advance(lexer)) {
// Match one or more hashtags as a single header definition
char c = mrk_lexer_advance(lexer);
switch (c) {
// All these characters have multiple meanings depending on their location
// in the file and how many there are
case '#':
while (mrk_lexer_peek(lexer) == '#') {
mrk_lexer_advance(lexer);
}
mrk_lexer_emit(out, lexer, mrk_token_type_header);
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
break;
case '`':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
break;
case '-':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
break;
case '_':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
break;
case '*':
mrk_lexer_advance_eq(lexer, c);
mrk_lexer_emit(out, lexer, mrk_token_type_stars);
break;
// Two consecutive newlines constitute a blank line, otherwise they're
// ignored as whitespace
@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
}
break;
case ' ': {
/* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
// Either a double space or a line break
if (mrk_lexer_peek(lexer) == ' ') {
mrk_lexer_advance(lexer);
if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
mrk_lexer_advance_n(lexer, 2);
if (mrk_lexer_peek(lexer) == '\n') {
mrk_lexer_advance(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
}
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
} else {
mrk_lexer_reset(lexer);
mrk_lexer_emit(out, lexer, mrk_token_type_space);
}
} break;
/* case '*': */
/* if (mrk_lexer_peek(lexer) == '*') { */
/* mrk_lexer_advance(lexer); */
/* mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
/* } else { */
/* // TODO match word */
/* } */
/* default: */
/* return mrk_lexer_err_unexpected_char; */
case '\t':
mrk_lexer_emit(out, lexer, mrk_token_type_tab);
break;
case '>':
mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
break;
}
}

View File

@ -14,14 +14,41 @@ void test_lexer_header() {
mrk_token t;
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
TEST_CHECK(t.type == mrk_token_type_header);
TEST_CHECK(t.type == mrk_token_type_pounds);
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
TEST_CHECK(t.end == 4);
mrk_lexer_free(lxr);
}
/**
 * Checks that two spaces directly followed by a newline are lexed as a single
 * line break token, and that two spaces without a trailing newline are lexed
 * as two separate space tokens.
 */
void test_lexer_line_break() {
  LEXER_INIT();

  // A line break needs two spaces in front of the newline; a single space
  // followed by a newline is lexed as a plain space token.
  const char *buf = "  \n";
  mrk_lexer_open(lxr, buf, 0);

  mrk_token t;
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_line_break);
  // The whole input must have been consumed by that one token.
  TEST_CHECK(mrk_lexer_done(lxr));

  // Two spaces with no newline after them: one space token per character.
  const char *buf2 = "  ";
  mrk_lexer_open(lxr, buf2, 0);

  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_space);
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_space);
  TEST_CHECK(mrk_lexer_done(lxr));

  mrk_lexer_free(lxr);
}
// Table of test cases, each pairing a display name with its test function.
// NOTE(review): TEST_LIST is presumably provided by the test framework header
// and the { NULL, NULL } entry is presumably the terminator it expects -
// confirm against the framework's documentation.
TEST_LIST = {
{ "lexer header", test_lexer_header },
{ "lexer line break", test_lexer_line_break},
{ NULL, NULL }
};