feat(lexer): header, line breaks, paragraph breaks

main
Jef Roosens 2024-03-04 14:06:17 +01:00
parent a6c17eff5f
commit f003e3555b
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
5 changed files with 1974 additions and 13 deletions

View File

@ -14,7 +14,11 @@ typedef enum mrk_lexer_err {
} mrk_lexer_err;
/**
 * Kinds of tokens the lexer can produce.
 */
typedef enum mrk_token_type {
  /* One or more consecutive '#' characters. */
  mrk_token_type_header = 0,
  /* Same numeric value as mrk_token_type_header.
   * NOTE(review): looks like the pre-rename name of the header token;
   * confirm whether it can be dropped. */
  mrk_token_type_pound = mrk_token_type_header,
  /* Two consecutive newline characters. */
  mrk_token_type_blank_line = 1,
  /* NOTE(review): presumably "**"; no lexer rule emits this yet. */
  mrk_token_type_star_star = 2,
  /* Two consecutive spaces that are not followed by a newline. */
  mrk_token_type_space_space = 3,
  /* Two consecutive spaces immediately followed by a newline. */
  mrk_token_type_line_break = 4,
} mrk_token_type;
typedef struct mrk_token {
@ -28,6 +32,11 @@ typedef struct mrk_token {
*/
mrk_err mrk_lexer_init(mrk_lexer **out);
/**
* Deallocate the lexer object.
*/
void mrk_lexer_free(mrk_lexer *lexer);
/**
* Open the buffer with the given lexer struct. `buf` is expected to live for
* the duration of the lexing.
@ -41,7 +50,7 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len);
/**
* Returns whether the lexer is done.
*/
bool mrk_lexer_at_end(const mrk_lexer *lexer);
bool mrk_lexer_done(const mrk_lexer *lexer);
/**
* Output the next lexed token for the given input.

View File

@ -16,6 +16,7 @@ struct mrk_lexer {
struct {
size_t start;
size_t end;
bool emitted;
} token;
};
@ -25,12 +26,29 @@ struct mrk_lexer {
*/
char mrk_lexer_peek(mrk_lexer *lexer);
/**
* Returns true if the nul-terminated string s is equal to the next characters
* in the token stream.
*/
bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
/**
* Advance the current position by one character, adding the new character to
* the current token's context and returning it.
*/
char mrk_lexer_advance(mrk_lexer *lexer);
/**
* Advance `n` positions; equivalent to running advance `n` times and returning
* the last call's result.
*/
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n);
/**
* Reset the lexer's current token context.
*/
void mrk_lexer_reset(mrk_lexer *lexer);
/**
* Output the currently matched token to the token struct with the given type,
* and reset the lexer's tracked token.

View File

@ -1,3 +1,4 @@
#include "mrk/lexer.h"
#include "mrk/lexer_internal.h"
mrk_err mrk_lexer_init(mrk_lexer **out) {
@ -6,6 +7,14 @@ mrk_err mrk_lexer_init(mrk_lexer **out) {
return mrk_err_ok;
}
/**
 * Deallocate the lexer object.
 *
 * Passing NULL is allowed and does nothing.
 */
void mrk_lexer_free(mrk_lexer *lexer) {
  // free(NULL) is a no-op, so no explicit NULL guard is required.
  free(lexer);
}
void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
lexer->buf.s = buf;
lexer->buf.len = len;
@ -13,15 +22,16 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
lexer->pos.buf_index = 0;
lexer->token.start = 0;
lexer->token.end = 0;
lexer->token.emitted = false;
}
/**
 * Returns whether the lexer is done.
 *
 * The lexer is done when its position equals the buffer length (only checked
 * when that length is non-zero) or when the character at the current position
 * is a NUL byte.
 */
bool mrk_lexer_done(const mrk_lexer *lexer) {
  // A length of 0 skips the bounds check; only the NUL check applies then.
  if (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) {
    return true;
  }

  return lexer->buf.s[lexer->pos.buf_index] == '\0';
}
char mrk_lexer_advance(mrk_lexer *lexer) {
if (mrk_lexer_at_end(lexer)) {
if (mrk_lexer_done(lexer)) {
return '\0';
}
@ -42,32 +52,90 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
return c;
}
/**
 * Advance `n` positions; equivalent to calling mrk_lexer_advance `n` times.
 *
 * Returns the result of the last advance, or '\0' when `n` is 0 (in which
 * case the lexer is not advanced at all).
 */
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  char last = '\0';

  // The counter must actually move: the previous `while (n > 1)` loop never
  // changed `n` and therefore never terminated for n > 1.
  for (size_t i = 0; i < n; i++) {
    last = mrk_lexer_advance(lexer);
  }

  return last;
}
/**
 * Return the character at the current position without consuming it, or '\0'
 * if the lexer is done.
 */
char mrk_lexer_peek(mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return '\0';
  }

  return lexer->buf.s[lexer->pos.buf_index];
}
/**
 * Reset the lexer's current token context: the tracked token becomes an empty
 * span at the current buffer position and is marked as not yet emitted.
 */
void mrk_lexer_reset(mrk_lexer *lexer) {
  const size_t here = lexer->pos.buf_index;

  lexer->token.emitted = false;
  lexer->token.start = here;
  lexer->token.end = here;
}
/**
 * Output the currently matched token to `out` with the given type, and mark
 * the lexer's tracked token as emitted.
 *
 * `out->start` and `out->end` are copied from the lexer's tracked token span.
 */
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
  out->type = type;
  out->start = lexer->token.start;
  // Previously `out->start` was assigned a second time with token.end, which
  // made every emitted token report start == end.
  out->end = lexer->token.end;

  // The next token's span begins where this one ended.
  lexer->token.start = lexer->token.end;
  lexer->token.emitted = true;
}
/**
 * Output the next lexed token for the given input.
 *
 * Characters that do not start a token are skipped. Returns
 * mrk_lexer_err_ok when a token was written to `out`, or mrk_lexer_err_done
 * when the input ran out before a token could be emitted (in which case `out`
 * is not written).
 */
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
  if (mrk_lexer_done(lexer)) {
    return mrk_lexer_err_done;
  }

  mrk_lexer_reset(lexer);

  while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
    switch (mrk_lexer_advance(lexer)) {
    // Match one or more '#' characters as a single header definition
    case '#':
      while (mrk_lexer_peek(lexer) == '#') {
        mrk_lexer_advance(lexer);
      }
      mrk_lexer_emit(out, lexer, mrk_token_type_header);
      break;

    // Two consecutive newlines constitute a blank line, otherwise they're
    // ignored as whitespace
    case '\n':
      if (mrk_lexer_peek(lexer) == '\n') {
        mrk_lexer_advance(lexer);
        mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
      } else {
        mrk_lexer_reset(lexer);
      }
      break;

    // Either a double space or a line break; a single space is ignored
    case ' ':
      if (mrk_lexer_peek(lexer) == ' ') {
        mrk_lexer_advance(lexer);

        if (mrk_lexer_peek(lexer) == '\n') {
          mrk_lexer_advance(lexer);
          mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
        } else {
          mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
        }
      } else {
        mrk_lexer_reset(lexer);
      }
      break;

    // TODO: match "**" as mrk_token_type_star_star and match words; decide
    // whether unknown characters should become an error instead.
    //
    // Until then, skip the character and restart the token span so skipped
    // text does not leak into the span of the next emitted token.
    default:
      mrk_lexer_reset(lexer);
      break;
    }
  }

  return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
}

27
test/lexer/lexer.c 100644
View File

@ -0,0 +1,27 @@
#include "test.h"
#include "mrk/lexer.h"
// Declares a `mrk_lexer *lxr` in the enclosing scope and checks that
// mrk_lexer_init succeeds. Deliberately not wrapped in do { } while (0):
// the declaration has to stay visible to the rest of the test body.
// NOTE(review): `lxr` has no initializer, so it may be indeterminate if
// mrk_lexer_init fails — confirm what init writes on failure.
#define LEXER_INIT() \
mrk_lexer *lxr; \
TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok)
// A run of '#' characters must be lexed as one header token whose span covers
// exactly the '#' characters.
void test_lexer_header(void) {
  LEXER_INIT();

  const char *buf = "#### hallo";
  // Length 0: the lexer stops at the terminating NUL instead of a byte count.
  mrk_lexer_open(lxr, buf, 0);

  mrk_token t;
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
  TEST_CHECK(t.type == mrk_token_type_header);
  // %zu with an explicit cast keeps the format specifier and argument type in
  // agreement on every platform (%lu is wrong where size_t is not unsigned
  // long).
  TEST_CHECK_(t.start == 0, "t.start == %zu", (size_t)t.start);
  TEST_CHECK(t.end == 4);

  mrk_lexer_free(lxr);
}
// Table of test cases as { name, function } pairs, terminated by a
// { NULL, NULL } entry. TEST_LIST is presumably provided by test.h.
TEST_LIST = {
{ "lexer header", test_lexer_header },
{ NULL, NULL }
};

1839
test/test.h 100644

File diff suppressed because it is too large Load Diff