feat(lexer): match regular words
parent
dcc52e2850
commit
ec076a56a5
2
Makefile
2
Makefile
|
@ -24,7 +24,7 @@ TARGETS_TEST := $(BINS_TEST:%=test-%)
|
||||||
TARGETS_MEM_TEST := $(BINS_TEST:%=test-mem-%)
|
TARGETS_MEM_TEST := $(BINS_TEST:%=test-mem-%)
|
||||||
TARGETS_EXAMPLE := $(BINS_EXAMPLE:%=example-%)
|
TARGETS_EXAMPLE := $(BINS_EXAMPLE:%=example-%)
|
||||||
|
|
||||||
_CFLAGS := $(addprefix -I,$(INC_DIRS)) $(CFLAGS) -Wall -Wextra
|
_CFLAGS := $(addprefix -I,$(INC_DIRS)) $(CFLAGS)
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all: lib
|
all: lib
|
||||||
|
|
|
@ -32,6 +32,8 @@ typedef enum mrk_token_type {
|
||||||
mrk_token_type_left_paren,
|
mrk_token_type_left_paren,
|
||||||
mrk_token_type_right_paren,
|
mrk_token_type_right_paren,
|
||||||
mrk_token_type_backslash,
|
mrk_token_type_backslash,
|
||||||
|
mrk_token_type_dotted_number,
|
||||||
|
mrk_token_type_word,
|
||||||
} mrk_token_type;
|
} mrk_token_type;
|
||||||
|
|
||||||
typedef struct mrk_token {
|
typedef struct mrk_token {
|
||||||
|
|
|
@ -55,6 +55,11 @@ void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
|
||||||
*/
|
*/
|
||||||
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n);
|
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance the lexer as long as the next character is part of a regular word.
|
||||||
|
*/
|
||||||
|
void mrk_lexer_advance_word(mrk_lexer *lexer);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Reset the lexer's current token context.
|
* Reset the lexer's current token context.
|
||||||
*/
|
*/
|
||||||
|
@ -66,4 +71,9 @@ void mrk_lexer_reset(mrk_lexer *lexer);
|
||||||
*/
|
*/
|
||||||
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
|
void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns whether the given character is a special character, and thus cannot
|
||||||
|
* be part of a regular word.
|
||||||
|
*/
|
||||||
|
bool mrk_is_special_char(char c);
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -78,6 +78,12 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
|
||||||
return lexer->buf.s[lexer->pos.buf_index];
|
return lexer->buf.s[lexer->pos.buf_index];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Consume characters from the lexer until the next character to be read is
 * reported as special by mrk_is_special_char. That special character itself
 * is left unconsumed, so the caller can lex it as its own token.
 *
 * NOTE(review): termination at the end of the input depends on what
 * mrk_lexer_peek yields there; confirm it reports a character that
 * mrk_is_special_char treats as special.
 */
void mrk_lexer_advance_word(mrk_lexer *lexer) {
    for (;;) {
        char upcoming = mrk_lexer_peek(lexer);

        if (mrk_is_special_char(upcoming)) {
            break;
        }

        mrk_lexer_advance(lexer);
    }
}
|
||||||
|
|
||||||
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
|
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
|
||||||
// Check whether the lexer would be done in n steps
|
// Check whether the lexer would be done in n steps
|
||||||
bool done_in_n = false;
|
bool done_in_n = false;
|
||||||
|
@ -201,9 +207,34 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
|
||||||
mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
|
mrk_lexer_emit(out, lexer, mrk_token_type_spaces);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
|
default: {
|
||||||
|
// Match ordered list headers
|
||||||
|
if (isdigit(c)) {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
|
||||||
|
while (isdigit(mrk_lexer_peek(lexer))) {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mrk_lexer_peek(lexer) == '.') {
|
||||||
|
mrk_lexer_advance(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_dotted_number);
|
||||||
|
}
|
||||||
|
// Doesn't end with a dot, so it's just a word that happens to start
|
||||||
|
// with a number
|
||||||
|
else {
|
||||||
|
mrk_lexer_advance_word(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Any other special scenarios we simply parse as a word
|
||||||
|
else {
|
||||||
|
mrk_lexer_advance_word(lexer);
|
||||||
|
mrk_lexer_emit(out, lexer, mrk_token_type_word);
|
||||||
|
}
|
||||||
|
} break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
|
return lexer->token.emitted ? mrk_lexer_err_ok : mrk_lexer_err_done;
|
||||||
;
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
#include "mrk/lexer_internal.h"
|
||||||
|
|
||||||
|
/*
 * Characters that end a regular word. The table is private to this
 * translation unit; query it through mrk_is_special_char().
 */
static const char special_chars[] = {'#', '`', '-', '_', '*', '=',  '\t', '>',
                                     '!', '[', ']', '(', ')', '\\', '\n', ' '};

/**
 * Returns whether the given character is a special character, i.e. one that
 * cannot be part of a regular word.
 *
 * @param c character to classify
 * @return true if c is the NUL terminator or one of the characters in
 *         special_chars, false otherwise
 */
bool mrk_is_special_char(char c) {
    // A NUL byte is never word content. Reporting it as special gives loops
    // of the form `while (!mrk_is_special_char(...))` a stop condition when
    // they are handed a string terminator.
    if (c == '\0') {
        return true;
    }

    for (size_t i = 0; i < sizeof(special_chars); i++) {
        if (c == special_chars[i]) {
            return true;
        }
    }

    return false;
}
|
|
@ -37,18 +37,37 @@ void test_lexer_line_break() {
|
||||||
mrk_lexer_open(lxr, buf2, 0);
|
mrk_lexer_open(lxr, buf2, 0);
|
||||||
|
|
||||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
TEST_CHECK(t.type == mrk_token_type_space);
|
TEST_CHECK(t.type == mrk_token_type_spaces);
|
||||||
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
|
||||||
TEST_CHECK(t.type == mrk_token_type_space);
|
|
||||||
|
|
||||||
TEST_CHECK(mrk_lexer_done(lxr));
|
TEST_CHECK(mrk_lexer_done(lxr));
|
||||||
|
|
||||||
mrk_lexer_free(lxr);
|
mrk_lexer_free(lxr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void test_lexer_simple1() {
|
||||||
|
LEXER_INIT();
|
||||||
|
|
||||||
|
const char *buf = "### hello world\n\nthis is a paragraph";
|
||||||
|
mrk_lexer_open(lxr, buf, 0);
|
||||||
|
|
||||||
|
mrk_token t;
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_pounds);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_spaces);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_word);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_spaces);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_word);
|
||||||
|
TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
|
||||||
|
TEST_CHECK(t.type == mrk_token_type_blank_line);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_LIST = {
|
TEST_LIST = {
|
||||||
{ "lexer header", test_lexer_header },
|
{ "lexer header", test_lexer_header },
|
||||||
{ "lexer line break", test_lexer_line_break},
|
{ "lexer line break", test_lexer_line_break},
|
||||||
|
{ "lexer simple 1", test_lexer_simple1 },
|
||||||
{ NULL, NULL }
|
{ NULL, NULL }
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue