feat(lexer): matching more things
							parent
							
								
									f003e3555b
								
							
						
					
					
						commit
						f6e034097d
					
				| 
						 | 
				
			
			@ -14,11 +14,16 @@ typedef enum mrk_lexer_err {
 | 
			
		|||
} mrk_lexer_err;
 | 
			
		||||
 | 
			
		||||
typedef enum mrk_token_type {
 | 
			
		||||
  mrk_token_type_header = 0,
 | 
			
		||||
  mrk_token_type_pounds = 0,
 | 
			
		||||
  mrk_token_type_backticks,
 | 
			
		||||
  mrk_token_type_dashes,
 | 
			
		||||
  mrk_token_type_underscores,
 | 
			
		||||
  mrk_token_type_stars,
 | 
			
		||||
  mrk_token_type_blank_line,
 | 
			
		||||
  mrk_token_type_star_star,
 | 
			
		||||
  mrk_token_type_space_space,
 | 
			
		||||
  mrk_token_type_space,
 | 
			
		||||
  mrk_token_type_line_break,
 | 
			
		||||
  mrk_token_type_right_angle_bracket,
 | 
			
		||||
  mrk_token_type_tab,
 | 
			
		||||
} mrk_token_type;
 | 
			
		||||
 | 
			
		||||
typedef struct mrk_token {
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -26,6 +26,12 @@ struct mrk_lexer {
 | 
			
		|||
 */
 | 
			
		||||
char mrk_lexer_peek(mrk_lexer *lexer);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Return the n'th next character that would be consumed. If `n` is zero, this
 | 
			
		||||
 * function is equivalent to calling peek.
 | 
			
		||||
 */
 | 
			
		||||
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Returns true if the nul-terminated string s is equal to the next characters
 | 
			
		||||
 * in the token stream.
 | 
			
		||||
| 
						 | 
				
			
			@ -38,6 +44,11 @@ bool mrk_lexer_match(mrk_lexer *lexer, const char *s);
 | 
			
		|||
 */
 | 
			
		||||
char mrk_lexer_advance(mrk_lexer *lexer);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Advance while the next character to be consumed is equal to `c`.
 | 
			
		||||
 */
 | 
			
		||||
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * Advance `n` positions; equivalent to running advance `n` times and returning
 | 
			
		||||
 * the last call's result.
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1,3 +1,6 @@
 | 
			
		|||
#include <ctype.h>
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
 | 
			
		||||
#include "mrk/lexer.h"
 | 
			
		||||
#include "mrk/lexer_internal.h"
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -55,11 +58,18 @@ char mrk_lexer_advance(mrk_lexer *lexer) {
 | 
			
		|||
/**
 * Advance `n` positions by calling mrk_lexer_advance `n` times.
 *
 * Returns the result of the final advance. When `n` is zero nothing is
 * consumed and '\0' is returned; previously a zero count still consumed one
 * character, which contradicted the "run advance `n` times" contract.
 */
char mrk_lexer_advance_n(mrk_lexer *lexer, size_t n) {
  char last = '\0';

  for (size_t i = 0; i < n; i++) {
    last = mrk_lexer_advance(lexer);
  }

  return last;
}
 | 
			
		||||
 | 
			
		||||
/**
 * Consume characters for as long as the next character to be peeked equals
 * `c`. Returns as soon as the peeked character differs; nothing is consumed
 * if it already differs on entry.
 */
void mrk_lexer_advance_eq(mrk_lexer *lexer, char c) {
  for (;;) {
    if (mrk_lexer_peek(lexer) != c) {
      return;
    }

    mrk_lexer_advance(lexer);
  }
}
 | 
			
		||||
 | 
			
		||||
char mrk_lexer_peek(mrk_lexer *lexer) {
 | 
			
		||||
  if (mrk_lexer_done(lexer)) {
 | 
			
		||||
    return '\0';
 | 
			
		||||
| 
						 | 
				
			
			@ -68,6 +78,27 @@ char mrk_lexer_peek(mrk_lexer *lexer) {
 | 
			
		|||
  return lexer->buf.s[lexer->pos.buf_index];
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * Return the character `n` positions past the next one to be consumed,
 * without consuming anything. With `n == 0` this looks at the same position
 * as mrk_lexer_peek.
 *
 * Returns '\0' if the input ends at or before the requested position.
 */
char mrk_lexer_peek_n(mrk_lexer *lexer, size_t n) {
  // Check every offset from the current position up to AND including the
  // target one. The earlier version stopped one short, so the target index
  // itself was never compared against buf.len and could be read out of
  // bounds (including for n == 0, where no check ran at all).
  for (size_t i = 0; i <= n; i++) {
    size_t index = lexer->pos.buf_index + i;

    // A non-zero buf.len bounds the buffer explicitly.
    // NOTE(review): presumably buf.len == 0 means "NUL-terminated input";
    // confirm against mrk_lexer_open.
    if (lexer->buf.len > 0 && index >= lexer->buf.len) {
      return '\0';
    }

    // A NUL byte ends the input as well; never look past it.
    if (lexer->buf.s[index] == '\0') {
      return '\0';
    }
  }

  return lexer->buf.s[lexer->pos.buf_index + n];
}
 | 
			
		||||
 | 
			
		||||
/* bool mrk_lexer_match(mrk_lexer *lexer, const char *s) { */
 | 
			
		||||
/*   size_t s_len = strlen(s); */
 | 
			
		||||
/*   if (mrk_lexer_done(lexer) && s[0] != '\0') { */
 | 
			
		||||
/*     return false; */
 | 
			
		||||
/*   } */
 | 
			
		||||
 | 
			
		||||
/* } */
 | 
			
		||||
 | 
			
		||||
void mrk_lexer_reset(mrk_lexer *lexer) {
 | 
			
		||||
  lexer->token.start = lexer->pos.buf_index;
 | 
			
		||||
  lexer->token.end = lexer->pos.buf_index;
 | 
			
		||||
| 
						 | 
				
			
			@ -90,14 +121,29 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
 | 
			
		|||
  mrk_lexer_reset(lexer);
 | 
			
		||||
 | 
			
		||||
  while (!lexer->token.emitted && !mrk_lexer_done(lexer)) {
 | 
			
		||||
    switch (mrk_lexer_advance(lexer)) {
 | 
			
		||||
    // Match one or more hashtags as a single header definition
 | 
			
		||||
    char c = mrk_lexer_advance(lexer);
 | 
			
		||||
    switch (c) {
 | 
			
		||||
    // All these characters have multiple meanings depending on their location
 | 
			
		||||
    // in the file and how many there are
 | 
			
		||||
    case '#':
 | 
			
		||||
      while (mrk_lexer_peek(lexer) == '#') {
 | 
			
		||||
        mrk_lexer_advance(lexer);
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_header);
 | 
			
		||||
      mrk_lexer_advance_eq(lexer, c);
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_pounds);
 | 
			
		||||
      break;
 | 
			
		||||
    case '`':
 | 
			
		||||
      mrk_lexer_advance_eq(lexer, c);
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_backticks);
 | 
			
		||||
      break;
 | 
			
		||||
    case '-':
 | 
			
		||||
      mrk_lexer_advance_eq(lexer, c);
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_dashes);
 | 
			
		||||
      break;
 | 
			
		||||
    case '_':
 | 
			
		||||
      mrk_lexer_advance_eq(lexer, c);
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_underscores);
 | 
			
		||||
      break;
 | 
			
		||||
    case '*':
 | 
			
		||||
      mrk_lexer_advance_eq(lexer, c);
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_stars);
 | 
			
		||||
      break;
 | 
			
		||||
      // Two consecutive newlines constitute a blank line, otherwise they're
 | 
			
		||||
      // ignored as whitespace
 | 
			
		||||
| 
						 | 
				
			
			@ -110,29 +156,22 @@ mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
 | 
			
		|||
      }
 | 
			
		||||
      break;
 | 
			
		||||
    case ' ': {
 | 
			
		||||
      /* if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer,)) */
 | 
			
		||||
      // Either a double space or a line break
 | 
			
		||||
      if (mrk_lexer_peek(lexer) == ' ') {
 | 
			
		||||
        mrk_lexer_advance(lexer);
 | 
			
		||||
      if (mrk_lexer_peek(lexer) == ' ' && mrk_lexer_peek_n(lexer, 1) == '\n') {
 | 
			
		||||
        mrk_lexer_advance_n(lexer, 2);
 | 
			
		||||
 | 
			
		||||
        if (mrk_lexer_peek(lexer) == '\n') {
 | 
			
		||||
          mrk_lexer_advance(lexer);
 | 
			
		||||
          mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
 | 
			
		||||
        } else {
 | 
			
		||||
          mrk_lexer_emit(out, lexer, mrk_token_type_space_space);
 | 
			
		||||
        }
 | 
			
		||||
        mrk_lexer_emit(out, lexer, mrk_token_type_line_break);
 | 
			
		||||
      } else {
 | 
			
		||||
        mrk_lexer_reset(lexer);
 | 
			
		||||
        mrk_lexer_emit(out, lexer, mrk_token_type_space);
 | 
			
		||||
      }
 | 
			
		||||
    } break;
 | 
			
		||||
      /* case '*': */
 | 
			
		||||
      /*   if (mrk_lexer_peek(lexer) == '*') { */
 | 
			
		||||
      /*     mrk_lexer_advance(lexer); */
 | 
			
		||||
      /*     mrk_lexer_emit(out, lexer, mrk_token_type_star_star); */
 | 
			
		||||
      /*   } else { */
 | 
			
		||||
      /*     // TODO match word */
 | 
			
		||||
      /*   } */
 | 
			
		||||
      /* default: */
 | 
			
		||||
      /*   return mrk_lexer_err_unexpected_char; */
 | 
			
		||||
    case '\t':
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_tab);
 | 
			
		||||
      break;
 | 
			
		||||
    case '>':
 | 
			
		||||
      mrk_lexer_emit(out, lexer, mrk_token_type_right_angle_bracket);
 | 
			
		||||
      break;
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -14,14 +14,41 @@ void test_lexer_header() {
 | 
			
		|||
 | 
			
		||||
  mrk_token t;
 | 
			
		||||
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
 | 
			
		||||
  TEST_CHECK(t.type == mrk_token_type_header);
 | 
			
		||||
  TEST_CHECK(t.type == mrk_token_type_pounds);
 | 
			
		||||
  TEST_CHECK_(t.start == 0, "t.start == %lu", t.start);
 | 
			
		||||
  TEST_CHECK(t.end == 4);
 | 
			
		||||
 | 
			
		||||
  mrk_lexer_free(lxr);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void test_lexer_line_break() {
 | 
			
		||||
  LEXER_INIT();
 | 
			
		||||
 | 
			
		||||
  const char *buf = "  \n";
 | 
			
		||||
  mrk_lexer_open(lxr, buf, 0);
 | 
			
		||||
 | 
			
		||||
  mrk_token t;
 | 
			
		||||
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
 | 
			
		||||
  TEST_CHECK(t.type == mrk_token_type_line_break);
 | 
			
		||||
 | 
			
		||||
  TEST_CHECK(mrk_lexer_done(lxr));
 | 
			
		||||
 | 
			
		||||
  const char *buf2 = "  ";
 | 
			
		||||
  mrk_lexer_open(lxr, buf2, 0);
 | 
			
		||||
 | 
			
		||||
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
 | 
			
		||||
  TEST_CHECK(t.type == mrk_token_type_space);
 | 
			
		||||
  TEST_CHECK(mrk_lexer_next(&t, lxr) == mrk_lexer_err_ok);
 | 
			
		||||
  TEST_CHECK(t.type == mrk_token_type_space);
 | 
			
		||||
 | 
			
		||||
  TEST_CHECK(mrk_lexer_done(lxr));
 | 
			
		||||
 | 
			
		||||
  mrk_lexer_free(lxr);
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
TEST_LIST = {
 | 
			
		||||
  { "lexer header", test_lexer_header },
 | 
			
		||||
  { "lexer line break", test_lexer_line_break},
 | 
			
		||||
  { NULL, NULL }
 | 
			
		||||
};
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue