From a6c17eff5f2608f020de24d406a6aea3c620bc2f Mon Sep 17 00:00:00 2001
From: Chewing_Bever <roosensjef@gmail.com>
Date: Mon, 4 Mar 2024 12:18:48 +0100
Subject: [PATCH] feat: started project and lexer

---
 .editorconfig                     |   5 ++
 .gitignore                        |   5 ++
 Makefile                          | 134 ++++++++++++++++++++++++++++++
 config.mk                         |  17 ++++
 include/mrk/common.h              |  19 +++++
 include/mrk/lexer.h               |  51 ++++++++++++
 src/_include/mrk/lexer_internal.h |  40 +++++++++
 src/lexer/lexer.c                 |  73 ++++++++++++++++
 8 files changed, 344 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 config.mk
 create mode 100644 include/mrk/common.h
 create mode 100644 include/mrk/lexer.h
 create mode 100644 src/_include/mrk/lexer_internal.h
 create mode 100644 src/lexer/lexer.c

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..09faabf
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,5 @@
+root = true
+
+[*.{c,cpp,h}]
+indent_style = space
+indent_size = 2
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dd72998
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+build/
+.cache/
+compile_commands.json
+vgcore.*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e52bb1f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,157 @@
+# https://spin.atomicobject.com/2016/08/26/makefile-c-projects/ was a great
+# base for this Makefile
+
+# config.mk defines every directory and file name used below, so it is
+# mandatory: fail loudly when it is missing instead of building with empty
+# paths.
+include config.mk
+
+# Remove a half-written target when its recipe fails, so that a truncated
+# object or archive is never mistaken for an up-to-date one.
+.DELETE_ON_ERROR:
+
+LIB := $(BUILD_DIR)/$(LIB_FILENAME)
+
+# $(shell) is used instead of `!=` because the latter needs GNU Make 4.0+.
+# find's errors are discarded: the test and example directories are optional.
+SRCS := $(shell find '$(SRC_DIR)' -iname '*.c' 2>/dev/null)
+SRCS_H := $(shell find '$(PUB_INC_DIR)' -iname '*.h' 2>/dev/null)
+SRCS_H_INTERNAL := $(shell find '$(SRC_DIR)' -iname '*.h' 2>/dev/null)
+SRCS_TEST := $(shell find '$(TEST_DIR)' -iname '*.c' 2>/dev/null)
+SRCS_EXAMPLE := $(shell find '$(EXAMPLE_DIR)' -iname '*.c' 2>/dev/null)
+
+# Objects mirror the source tree below $(BUILD_DIR): src/a/b.c is compiled to
+# $(BUILD_DIR)/src/a/b.c.o
+OBJS := $(SRCS:%=$(BUILD_DIR)/%.o)
+OBJS_TEST := $(SRCS_TEST:%=$(BUILD_DIR)/%.o)
+OBJS_EXAMPLE := $(SRCS_EXAMPLE:%=$(BUILD_DIR)/%.o)
+
+# One .d file is written next to every object. The example objects are listed
+# too, so that a header change also rebuilds the examples.
+DEPS := $(OBJS:.o=.d) $(OBJS_TEST:.o=.d) $(OBJS_EXAMPLE:.o=.d)
+
+BINS_TEST := $(OBJS_TEST:%.c.o=%)
+BINS_EXAMPLE := $(OBJS_EXAMPLE:%.c.o=%)
+
+TARGETS_TEST := $(BINS_TEST:%=test-%)
+TARGETS_MEM_TEST := $(BINS_TEST:%=test-mem-%)
+TARGETS_EXAMPLE := $(BINS_EXAMPLE:%=example-%)
+
+# -MMD -MP produce the .d files included at the bottom of this file. They are
+# added here, not only through CFLAGS, so that overriding CFLAGS does not
+# silently disable header dependency tracking.
+_CFLAGS := $(addprefix -I,$(INC_DIRS)) $(CFLAGS) -MMD -MP -Wall -Wextra
+
+.PHONY: all
+all: lib
+
+
+# =====COMPILATION=====
+# Utility used by the CI to lint
+.PHONY: objs
+objs: $(OBJS)
+
+.PHONY: lib
+lib: $(LIB)
+
+# The archive is recreated from scratch so that objects belonging to deleted
+# source files do not linger in it.
+$(LIB): $(OBJS)
+	mkdir -p $(@D)
+	$(RM) $@
+	$(AR) rcs $@ $^
+
+$(BUILD_DIR)/$(SRC_DIR)/%.c.o: $(SRC_DIR)/%.c
+	mkdir -p $(@D)
+	$(CC) -c $(_CFLAGS) $< -o $@
+
+# =====TESTING=====
+.PHONY: test
+test: $(TARGETS_TEST)
+
+.PHONY: test-mem
+test-mem: $(TARGETS_MEM_TEST)
+
+# test-<binary> runs the test binary it is named after
+.PHONY: $(TARGETS_TEST)
+$(TARGETS_TEST): test-%: %
+	./$<
+
+# test-mem-<binary> runs that same binary under valgrind's memcheck
+.PHONY: $(TARGETS_MEM_TEST)
+$(TARGETS_MEM_TEST): test-mem-%: %
+	valgrind --tool=memcheck --error-exitcode=1 --track-origins=yes --leak-check=full ./$<
+
+.PHONY: build-test
+build-test: $(BINS_TEST)
+
+$(BINS_TEST): %: %.c.o $(LIB)
+	$(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@
+
+# Along with the include directory, each test includes $(TEST_DIR) (which
+# contains the acutest.h header file), and the internal include directory.
+# This allows tests to access internal functions, which aren't publicly
+# exposed.
+$(BUILD_DIR)/$(TEST_DIR)/%.c.o: $(TEST_DIR)/%.c
+	mkdir -p $(@D)
+	$(CC) $(_CFLAGS) -I$(TEST_DIR) \
+		-I$(SRC_DIR)/_include \
+		-c $< -o $@
+
+# =====EXAMPLES=====
+.PHONY: build-example
+build-example: $(BINS_EXAMPLE)
+
+# Example binaries link the resulting library
+$(BINS_EXAMPLE): %: %.c.o $(LIB)
+	$(CC) $(LDFLAGS) $^ $(LDLIBS) -o $@
+
+$(BUILD_DIR)/$(EXAMPLE_DIR)/%.c.o: $(EXAMPLE_DIR)/%.c
+	mkdir -p $(@D)
+	$(CC) $(_CFLAGS) -I$(PUB_INC_DIR) -c $< -o $@
+
+# =====MAINTENANCE=====
+.PHONY: lint
+lint:
+	clang-format -n --Werror \
+		$(filter-out $(THIRDPARTY),$(SRCS)) \
+		$(filter-out $(THIRDPARTY),$(SRCS_H)) \
+		$(filter-out $(THIRDPARTY),$(SRCS_H_INTERNAL))
+
+.PHONY: fmt
+fmt:
+	clang-format -i \
+		$(filter-out $(THIRDPARTY),$(SRCS)) \
+		$(filter-out $(THIRDPARTY),$(SRCS_H)) \
+		$(filter-out $(THIRDPARTY),$(SRCS_H_INTERNAL))
+
+.PHONY: check
+check:
+	mkdir -p $(BUILD_DIR)/cppcheck
+	cppcheck \
+		$(addprefix -I,$(INC_DIRS)) \
+		--cppcheck-build-dir=$(BUILD_DIR)/cppcheck \
+		--error-exitcode=1 \
+		--enable=warning,style \
+		--inline-suppr \
+		--check-level=exhaustive \
+		--quiet \
+		-j$(shell nproc) \
+		$(filter-out $(THIRDPARTY),$(SRCS))
+
+.PHONY: clean
+clean:
+	rm -rf '$(BUILD_DIR)'
+
+
+# Generate compile_commands.json. $(MAKE) is used instead of a literal `make`
+# so that the sub-makes use the same make program and inherit its flags.
+.PHONY: bear
+bear: clean
+	bear -- $(MAKE)
+	bear --append -- $(MAKE) build-test
+	bear --append -- $(MAKE) build-example
+
+
+# Make make aware of the .d files
+-include $(DEPS)
diff --git a/config.mk b/config.mk
new file mode 100644
index 0000000..0c88a69
--- /dev/null
+++ b/config.mk
@@ -0,0 +1,22 @@
+# Name of the static library produced by `make lib`
+LIB_FILENAME := libmrk.a
+
+# Directory layout of the project
+BUILD_DIR := build
+SRC_DIR := src
+TEST_DIR := test
+EXAMPLE_DIR := example
+# Files that lint, fmt and check should skip
+THIRDPARTY :=
+
+# INC_DIRS is derived from the directories above, so that changing SRC_DIR or
+# PUB_INC_DIR cannot leave a stale include path behind
+PUB_INC_DIR := include
+INC_DIRS := $(PUB_INC_DIR) $(SRC_DIR)/_include
+
+# -MMD: generate a .d file for every source file. This file can be imported by
+#  make and makes make aware that a header file has been changed, ensuring an
+#  object file is also recompiled if only a header is changed.
+# -MP: generate a dummy target for every header file (according to the docs it
+#  prevents some errors when removing header files)
+CFLAGS ?= -MMD -MP -g
diff --git a/include/mrk/common.h b/include/mrk/common.h
new file mode 100644
index 0000000..54163c5
--- /dev/null
+++ b/include/mrk/common.h
@@ -0,0 +1,31 @@
+#ifndef MRK_COMMON
+#define MRK_COMMON
+
+#include <stdlib.h>
+
+/**
+ * Allocate a zeroed array of `n` elements of `size` bytes each and store the
+ * resulting pointer in `*out`.
+ *
+ * If the allocation fails, the macro makes the calling function return
+ * mrk_err_failed_alloc and leaves `*out` untouched. It can therefore only be
+ * used inside functions whose return type can hold an mrk_err.
+ *
+ * Wrapping the body in do { ... } while (0) makes the macro a single statement
+ * (usable as the unbraced branch of an if/else), and the arguments are
+ * parenthesised so that arbitrary expressions can be passed.
+ */
+#define MRK_CALLOC(out, n, size)                                               \
+  do {                                                                         \
+    void *mrk_tmp_ = calloc((n), (size));                                      \
+    if (mrk_tmp_ == NULL)                                                      \
+      return mrk_err_failed_alloc;                                             \
+    *(out) = mrk_tmp_;                                                         \
+  } while (0)
+
+typedef enum mrk_err {
+  mrk_err_ok = 0,       // success
+  mrk_err_failed_alloc, // a memory allocation failed
+} mrk_err;
+
+#endif
diff --git a/include/mrk/lexer.h b/include/mrk/lexer.h
new file mode 100644
index 0000000..88af930
--- /dev/null
+++ b/include/mrk/lexer.h
@@ -0,0 +1,51 @@
+#ifndef MRK_LEXER
+#define MRK_LEXER
+
+#include <stdbool.h>
+
+#include "mrk/common.h"
+
+typedef struct mrk_lexer mrk_lexer; // opaque; defined in an internal header
+
+typedef enum mrk_lexer_err {
+  mrk_lexer_err_ok = 0,          // no error
+  mrk_lexer_err_done,            // mrk_lexer_next: the input is exhausted
+  mrk_lexer_err_unexpected_char, // presumably unlexable input; TODO confirm
+} mrk_lexer_err;
+
+typedef enum mrk_token_type {
+  mrk_token_type_pound = 0, // emitted for a '#' character
+} mrk_token_type;
+
+typedef struct mrk_token {
+  mrk_token_type type; // kind of token
+  size_t start;        // offset in the lexed buffer where the token begins
+  size_t end;          // intended: offset just past the token (TODO confirm)
+} mrk_token;
+
+/**
+ * Allocate a new, zero-initialised lexer struct and store it in `*out`.
+ */
+mrk_err mrk_lexer_init(mrk_lexer **out);
+
+/**
+ * Attach `buf` to the given lexer and reset its position to the start of the
+ * buffer. `buf` is not copied, so it has to outlive the lexing.
+ *
+ * Lexing stops after `len` characters or at the first nul character, whichever
+ * comes first. If `len` is 0, the buffer is assumed to be nul-terminated and
+ * only the nul check determines its end.
+ */
+void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len);
+
+/**
+ * Returns whether the lexer has consumed all of its input.
+ */
+bool mrk_lexer_at_end(const mrk_lexer *lexer);
+
+/**
+ * Lex the next token into `out`; returns mrk_lexer_err_done at end of input.
+ */
+mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer);
+
+#endif
diff --git a/src/_include/mrk/lexer_internal.h b/src/_include/mrk/lexer_internal.h
new file mode 100644
index 0000000..9811761
--- /dev/null
+++ b/src/_include/mrk/lexer_internal.h
@@ -0,0 +1,40 @@
+#ifndef MRK_LEXER_INTERNAL
+#define MRK_LEXER_INTERNAL
+
+#include "mrk/lexer.h"
+
+struct mrk_lexer {
+  struct {
+    const char *s; // input buffer passed to mrk_lexer_open; not copied
+    size_t len;    // its length; 0 means: rely on the nul terminator only
+  } buf;
+  struct {
+    size_t line;       // line of the next character; set to 0 on open
+    size_t line_index; // characters consumed so far on the current line
+    size_t buf_index;  // offset in buf.s of the next character to consume
+  } pos;
+  struct {
+    size_t start; // offset in buf.s where the pending token begins
+    size_t end;   // offset just past the last character of the pending token
+  } token;
+};
+
+/**
+ * Return the character the next call to advance would consume, without
+ * consuming it. At the end of the buffer, nul is returned.
+ */
+char mrk_lexer_peek(mrk_lexer *lexer);
+
+/**
+ * Consume one character: add it to the current token's context and return it.
+ * At the end of the buffer nothing is consumed and nul is returned.
+ */
+char mrk_lexer_advance(mrk_lexer *lexer);
+
+/**
+ * Write the currently matched token to `out` with the given type, and start a
+ * new, empty token at the current position.
+ */
+void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type);
+
+#endif
diff --git a/src/lexer/lexer.c b/src/lexer/lexer.c
new file mode 100644
index 0000000..ccacbb3
--- /dev/null
+++ b/src/lexer/lexer.c
@@ -0,0 +1,100 @@
+#include "mrk/lexer_internal.h"
+
+// Allocates a zero-initialised lexer into `*out`. On allocation failure
+// MRK_CALLOC returns mrk_err_failed_alloc from here and `*out` is untouched.
+mrk_err mrk_lexer_init(mrk_lexer **out) {
+  MRK_CALLOC(out, 1, sizeof(mrk_lexer));
+
+  return mrk_err_ok;
+}
+
+// Points the lexer at `buf` and rewinds every piece of position and token
+// state, so that a lexer can be reused for another input.
+void mrk_lexer_open(mrk_lexer *lexer, const char *buf, size_t len) {
+  lexer->buf.s = buf;
+  lexer->buf.len = len;
+  lexer->pos.line = 0;
+  lexer->pos.line_index = 0;
+  lexer->pos.buf_index = 0;
+  lexer->token.start = 0;
+  lexer->token.end = 0;
+}
+
+// Reports whether there is nothing left to consume.
+bool mrk_lexer_at_end(const mrk_lexer *lexer) {
+  // A lexer that was never opened has no buffer to read from
+  if (lexer->buf.s == NULL) {
+    return true;
+  }
+
+  // A length of 0 means that only the nul terminator marks the end
+  if (lexer->buf.len > 0 && lexer->pos.buf_index >= lexer->buf.len) {
+    return true;
+  }
+
+  return lexer->buf.s[lexer->pos.buf_index] == '\0';
+}
+
+// Consumes one character, extending the pending token by it. Returns the
+// consumed character, or nul if the lexer is already at the end.
+char mrk_lexer_advance(mrk_lexer *lexer) {
+  if (mrk_lexer_at_end(lexer)) {
+    return '\0';
+  }
+
+  char c = lexer->buf.s[lexer->pos.buf_index];
+
+  // A newline is still part of the line it terminates, so the position only
+  // moves to the next line once the newline itself has been consumed
+  if (c == '\n') {
+    lexer->pos.line++;
+    lexer->pos.line_index = 0;
+  } else {
+    lexer->pos.line_index++;
+  }
+
+  lexer->pos.buf_index++;
+  lexer->token.end++;
+
+  return c;
+}
+
+// Returns the character the next call to advance would consume, without
+// consuming it; nul at the end of the buffer.
+char mrk_lexer_peek(mrk_lexer *lexer) {
+  if (mrk_lexer_at_end(lexer)) {
+    return '\0';
+  }
+
+  return lexer->buf.s[lexer->pos.buf_index];
+}
+
+// Writes the pending token to `out` as the half-open range [start, end) and
+// starts a new, empty pending token right after it.
+void mrk_lexer_emit(mrk_token *out, mrk_lexer *lexer, mrk_token_type type) {
+  out->type = type;
+  out->start = lexer->token.start;
+  out->end = lexer->token.end;
+
+  lexer->token.start = lexer->token.end;
+}
+
+// Lexes a single token into `out`. Returns mrk_lexer_err_done at the end of
+// the input, and mrk_lexer_err_unexpected_char (leaving `out` untouched) for
+// a character that does not start any known token.
+mrk_lexer_err mrk_lexer_next(mrk_token *out, mrk_lexer *lexer) {
+  if (mrk_lexer_at_end(lexer)) {
+    return mrk_lexer_err_done;
+  }
+
+  switch (mrk_lexer_advance(lexer)) {
+  case '#':
+    mrk_lexer_emit(out, lexer, mrk_token_type_pound);
+    return mrk_lexer_err_ok;
+  default:
+    // Drop the offending character from the pending token, so that the
+    // caller may keep lexing after reporting it
+    lexer->token.start = lexer->token.end;
+    return mrk_lexer_err_unexpected_char;
+  }
+}