feat(lexer): header, line breaks, paragraph breaks
parent
a6c17eff5f
commit
f003e3555b
|
@ -14,7 +14,11 @@ typedef enum mrk_lexer_err {
|
||||||
} mrk_lexer_err;
|
} mrk_lexer_err;
|
||||||
|
|
||||||
typedef enum mrk_token_type {
|
typedef enum mrk_token_type {
|
||||||
mrk_token_type_pound = 0,
|
mrk_token_type_header = 0,
|
||||||
|
mrk_token_type_blank_line,
|
||||||
|
mrk_token_type_star_star,
|
||||||
|
mrk_token_type_space_space,
|
||||||
|
mrk_token_type_line_break,
|
||||||
} mrk_token_type;
|
} mrk_token_type;
|
||||||
|
|
||||||
typedef struct mrk_token {
|
typedef struct mrk_token {
|
||||||
|
@ -28,6 +32,11 @@ typedef struct mrk_token {
|
||||||
*/
|
*/
|
||||||
mrk_err mrk_lexer_init(mrk_lexer **out);
|
mrk_err mrk_lexer_init(mrk_lexer **out);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deallocate the lexer object.
|
||||||
|
*/
|
||||||
|
void mrk_lexer_free(mrk_lexer *lexer);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Open the buffer with the given lexer struct. `buf` is expected to live for
|
* Open the buffer with the given lexer struct. `buf` is expected to live for
|
||||||
* the duration of the lexing.
|
* the duration of the lexing.
|
||||||
|
@ -41,7 +50,7 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len);
|
||||||
/**
|
/**
|
||||||
* Returns whether the lexer is done.
|
* Returns whether the lexer is done.
|
||||||
*/
|
*/
|
||||||
bool mrk_lexer_at_end(const mrk_lexer *lexer);
|
bool mrk_lexer_done(const mrk_lexer *lexer);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Output the next lexed token for the given input.
|
* Output the next lexed token for the given input.
|
||||||
|
|
|
@ -16,6 +16,7 @@ struct mrk_lexer {
|
||||||
struct {
|
struct {
|
||||||
size_t start;
|
size_t start;
|
||||||
size_t end;
|
size_t end;
|
||||||
|
bool emitted;
|
||||||
} token;
|
} token;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -25,12 +26,29 @@ struct mrk_lexer {
|
||||||
*/
|
*/
|
||||||
char mrk_lexer_peek(mrk_lexer *lexer);
|
char mrk_lexer_peek(mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns true if the nul-terminated string s is equal to the next characters
|
||||||
|
* in the token stream.
|
||||||
|
*/
|
||||||
|
bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Advance the current position by one character, adding the new character to
|
* Advance the current position by one character, adding the new character to
|
||||||
* the current token's context and returning it.
|
* the current token's context and returning it.
|
||||||
*/
|
*/
|
||||||
char mrk_lexer_advance(mrk_lexer *lexer);
|
char mrk_lexer_advance(mrk_lexer *lexer);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance `n` positions; equivalent to running advance `n` times and returning
|
||||||
|
* the last call's result.
|
||||||
|
*/
|
||||||
|
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reset the lexer's current token context.
|
||||||
|
*/
|
||||||
|
void mrk_lexer_reset(mrk_lexer *lexer);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Output the currently matched token to the token struct with the given type,
|
* Output the currently matched token to the token struct with the given type,
|
||||||
* and reset the lexer's tracked token.
|
* and reset the lexer's tracked token.
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
#include "mrk/lexer.h"
|
||||||
#include "mrk/lexer_internal.h"
|
#include "mrk/lexer_internal.h"
|
||||||
|
|
||||||
mrk_err mrk_lexer_init(mrk_lexer **out) {
|
mrk_err mrk_lexer_init(mrk_lexer **out) {
|
||||||
|
@ -6,6 +7,14 @@ mrk_err mrk_lexer_init(mrk_lexer **out) {
|
||||||
return mrk_err_ok;
|
return mrk_err_ok;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Release the memory held by a lexer object.
 *
 * Passing NULL is allowed and does nothing.
 */
void mrk_lexer_free(mrk_lexer *lexer) {
    // free(NULL) is defined to be a no-op, so no explicit NULL guard is needed.
    free(lexer);
}
|
||||||
|
|
||||||
void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
|
void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
|
||||||
lexer->buf.s = buf;
|
lexer->buf.s = buf;
|
||||||
lexer->buf.len = len;
|
lexer->buf.len = len;
|
||||||
|
@ -13,15 +22,16 @@ void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
|
||||||
lexer->pos.buf_index = 0;
|
lexer->pos.buf_index = 0;
|
||||||
lexer->token.start = 0;
|
lexer->token.start = 0;
|
||||||
lexer->token.end = 0;
|
lexer->token.end = 0;
|
||||||
|
lexer->token.emitted = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mrk_lexer_at_end(const mrk_lexer *lexer) {
|
bool mrk_lexer_done(const mrk_lexer *lexer) {
|
||||||
return (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) ||
|
return (lexer->buf.len > 0 && lexer->pos.buf_index == lexer->buf.len) ||
|
||||||
(lexer->buf.s[lexer->pos.buf_index] == '\0');
|
(lexer->buf.s[lexer->pos.buf_index] == '\0');
|
||||||
}
|
}
|
||||||
|
|
||||||
char mrk_lexer_advance(mrk_lexer *lexer) {
|
char mrk_lexer_advance(mrk_lexer *lexer) {
|
||||||
if (mrk_lexer_at_end(lexer)) {
|
if (mrk_lexer_done(lexer)) {
|
||||||
return '\0';
|
return '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,32 +52,90 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Advance the lexer by `n` characters.
 *
 * Equivalent to calling mrk_lexer_advance `n` times and returning the result
 * of the last call. When `n` is 0 nothing is consumed and '\0' is returned.
 */
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
    char last = '\0';

    // The counter is decremented on every iteration so the loop is guaranteed
    // to terminate after exactly `n` calls to mrk_lexer_advance.
    while (n-- > 0) {
        last = mrk_lexer_advance(lexer);
    }

    return last;
}
|
||||||
|
|
||||||
char mrk_lexer_peek(mrk_lexer *lexer) {
|
char mrk_lexer_peek(mrk_lexer *lexer) {
|
||||||
if (mrk_lexer_at_end(lexer)) {
|
if (mrk_lexer_done(lexer)) {
|
||||||
return '\0';
|
return '\0';
|
||||||
}
|
}
|
||||||
|
|
||||||
return lexer->buf.s[lexer->pos.buf_index];
|
return lexer->buf.s[lexer->pos.buf_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Begin a fresh, empty token at the lexer's current buffer position: both the
 * token start and end are set to the current index and the emitted flag is
 * cleared.
 */
void mrk_lexer_reset(mrk_lexer *lexer) {
    const size_t cursor = lexer->pos.buf_index;

    lexer->token.emitted = false;
    lexer->token.start = cursor;
    lexer->token.end = cursor;
}
|
||||||
|
|
||||||
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
|
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
|
||||||
out->type = type;
|
out->type = type;
|
||||||
out->start = lexer->token.start;
|
out->start = lexer->token.start;
|
||||||
out->start = lexer->token.end;
|
out->end = lexer->token.end;
|
||||||
|
|
||||||
lexer->token.start = lexer->token.end;
|
lexer->token.emitted = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
||||||
if (mrk_lexer_at_end(lexer)) {
|
if (mrk_lexer_done(lexer)) {
|
||||||
return mrk_lexer_err_done;
|
return mrk_lexer_err_done;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
mrk_lexer_reset(lexer);
|
||||||
|
|
||||||
|
while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
|
||||||
switch (mrk_lexer_advance(lexer)) {
|
switch (mrk_lexer_advance(lexer)) {
|
||||||
|
// Match one or more hashtags as a single header definition
|
||||||
case '#':
|
case '#':
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_pound);
|
while (mrk_lexer_peek(lexer) == '#') {
|
||||||
break;
|
mrk_lexer_advance(lexer);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mrk_lexer_err_ok;
|
mrk_lexer_emit(out, lexer, mrk_token_type_header);
|
||||||
|
break;
|
||||||
|
// Two consecutive newlines constitute a blank line, otherwise they're
|
||||||
|
// ignored as whitespace
|
||||||
|
case '\n':
|
||||||
|
if (mrk_lexer_peek(lexer) == '\n') {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_blank_line);
|
||||||
|
} else {
|
||||||
|
mrk_lexer_reset(lexer);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case ' ': {
|
||||||
|
// Either a double space or a line break
|
||||||
|
if (mrk_lexer_peek(lexer) == ' ') {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
|
||||||
|
if (mrk_lexer_peek(lexer) == '\n') {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
|
||||||
|
} else {
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
mrk_lexer_reset(lexer);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
/* case '*': */
|
||||||
|
/* if (mrk_lexer_peek(lexer) == '*') { */
|
||||||
|
/* mrk_lexer_advance(lexer); */
|
||||||
|
/* mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
|
||||||
|
/* } else { */
|
||||||
|
/* // TODO match word */
|
||||||
|
/* } */
|
||||||
|
/* default: */
|
||||||
|
/* return mrk_lexer_err_unexpected_char; */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
|
||||||
|
;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
#include "test.h"
|
||||||
|
|
||||||
|
#include "mrk/lexer.h"
|
||||||
|
|
||||||
|
/*
 * Declare a local `lxr` and initialise it through mrk_lexer_init, recording a
 * test failure when the call does not return mrk_err_ok.
 *
 * `lxr` starts out as NULL so that it never holds an indeterminate value if
 * initialisation fails. The macro expands to a declaration followed by a
 * statement, so it must be used at block scope.
 */
#define LEXER_INIT() \
    mrk_lexer *lxr = NULL; \
    TEST_CHECK(mrk_lexer_init(&lxr) == mrk_err_ok)
|
||||||
|
|
||||||
|
void test_lexer_header() {
|
||||||
|
LEXER_INIT();
|
||||||
|
|
||||||
|
const char *buf = "#### hallo";
|
||||||
|
mrk_lexer_open(lxr, buf, 0);
|
||||||
|
|
||||||
|
mrk_token t;
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_header);
|
||||||
|
TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
|
||||||
|
TEST_CHECK(t.end == 4);
|
||||||
|
|
||||||
|
mrk_lexer_free(lxr);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_LIST = {
|
||||||
|
{ "lexer header", test_lexer_header },
|
||||||
|
{ NULL, NULL }
|
||||||
|
};
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue