From 622d644f2510c207a4fb0516bdd998961ac1ed3b Mon Sep 17 00:00:00 2001
From: Chewing_Bever
Date: Fri, 13 Oct 2023 21:10:31 +0200
Subject: [PATCH] feat(lsm): possibly implemented trie insert

---
 lsm/include/lsm.h                    |   3 +-
 lsm/include/lsm/bt.h                 |  10 ++
 lsm/include/lsm/str.h                |  69 +++++++++++
 lsm/src/_include/lsm/str_internal.h  |   2 +-
 lsm/src/_include/lsm/trie_internal.h |  24 +++-
 lsm/src/bt/lsm_bt.c                  |  17 +++
 lsm/src/str/lsm_str.c                | 117 +++++++++++++++++++
 lsm/src/trie/lsm_trie.c              | 163 +++++++++++++++++++++++++++
 8 files changed, 402 insertions(+), 3 deletions(-)

diff --git a/lsm/include/lsm.h b/lsm/include/lsm.h
index aa76826..8ecb958 100644
--- a/lsm/include/lsm.h
+++ b/lsm/include/lsm.h
@@ -9,7 +9,8 @@ typedef enum lsm_error {
   lsm_error_ok = 0,
   lsm_error_failed_alloc = 1,
   lsm_error_not_found = 2,
-  lsm_error_already_present = 3
+  lsm_error_already_present = 3,
+  lsm_error_null_value = 4
 } lsm_error;
 
 /*typedef struct lsm_string { */
diff --git a/lsm/include/lsm/bt.h b/lsm/include/lsm/bt.h
index a2826b0..a0995a1 100644
--- a/lsm/include/lsm/bt.h
+++ b/lsm/include/lsm/bt.h
@@ -48,4 +48,14 @@ lsm_error lsm_bt_insert(lsm_bt *bt, char key, void *data);
  */
 lsm_error lsm_bt_remove(void **out, lsm_bt *bt, char key);
 
+/**
+ * Replace the data at an existing key with new data, returning the old.
+ *
+ * @param out address to write old data pointer to
+ * @param bt binary tree to replace in
+ * @param key key to replace at
+ * @param data new data to store
+ */
+lsm_error lsm_bt_replace(void **out, lsm_bt *bt, char key, void *data);
+
 #endif
diff --git a/lsm/include/lsm/str.h b/lsm/include/lsm/str.h
index 58930ec..b790a32 100644
--- a/lsm/include/lsm/str.h
+++ b/lsm/include/lsm/str.h
@@ -57,4 +57,73 @@ void lsm_str_free(lsm_str *str);
  */
 uint64_t lsm_str_len(lsm_str *str);
 
+/**
+ * Return a pointer to the string's underlying char array. Note that this array
+ * will *not* necessarily be null-terminated.
+ *
+ * @param str string to return pointer for
+ */
+const char *lsm_str_ptr(lsm_str *str);
+
+/**
+ * Returns the character at the specified position.
+ *
+ * @param str string to read from
+ * @param index index of character to return. This is not bounds-checked, so it
+ * must be smaller than the string's length.
+ */
+char lsm_str_char(lsm_str *str, uint64_t index);
+
+/**
+ * Take a substring and copy it to a provided string object.
+ *
+ * @param out string to store new substring in. The contents of this string will
+ * be replaced.
+ * @param str string to take substring from
+ * @param start inclusive start index for the substring. If this is greater than
+ * or equal to the string's length, out will be a zero-length string.
+ * @param end exclusive end index for the substring. Values past the end of the
+ * string are clamped to its length; if end <= start, out will be a zero-length
+ * string.
+ * @return lsm_error_failed_alloc if allocating the substring fails
+ */
+lsm_error lsm_str_substr(lsm_str *out, lsm_str *str, uint64_t start,
+                         uint64_t end);
+
+/**
+ * Return the first index where s1 and s2 differ, starting at their respective
+ * offsets. If the compared parts are equal (or one is a prefix of the other),
+ * the result will be the length of the shortest part. The returned value is
+ * relative to the given offsets. An offset at or past the end of its string
+ * leaves nothing to compare, so the result is 0.
+ *
+ * @param s1 string to compare
+ * @param s1_offset offset inside s1 to start comparing from
+ * @param s2 string to compare s1 to
+ * @param s2_offset offset inside s2 to start comparing from
+ */
+uint64_t lsm_str_cmp(lsm_str *s1, uint64_t s1_offset, lsm_str *s2,
+                     uint64_t s2_offset);
+
+/**
+ * Truncate a string in-place.
+ *
+ * @param s string to truncate
+ * @param new_len new length of the string. If new_len is >= the original
+ * length, this function does nothing.
+ * @return lsm_error_failed_alloc if the shortened buffer can't be allocated,
+ * leaving s untouched. Truncating to 8 bytes or less never fails.
+ */
+lsm_error lsm_str_truncate(lsm_str *s, uint64_t new_len);
+
+/**
+ * Split s at the specified index, saving the second half of the string in s2.
+ *
+ * @param s string to split
+ * @param s2 string to store second part of s
+ * @param index position to split string. If index is the length of s or
+ * greater, s2 will simply be an empty string.
+ */
+lsm_error lsm_str_split(lsm_str *s, lsm_str *s2, uint64_t index);
+
 #endif
diff --git a/lsm/src/_include/lsm/str_internal.h b/lsm/src/_include/lsm/str_internal.h
index 909a0df..03f5395 100644
--- a/lsm/src/_include/lsm/str_internal.h
+++ b/lsm/src/_include/lsm/str_internal.h
@@ -8,7 +8,7 @@
 struct lsm_str {
   uint64_t len;
   union {
-    void *ptr;
+    char *ptr;
     char val[8];
   } data;
 };
diff --git a/lsm/src/_include/lsm/trie_internal.h b/lsm/src/_include/lsm/trie_internal.h
index 4fb7037..e3526d9 100644
--- a/lsm/src/_include/lsm/trie_internal.h
+++ b/lsm/src/_include/lsm/trie_internal.h
@@ -5,10 +5,32 @@
 #include "lsm/str_internal.h"
 #include "lsm/trie.h"
 
+/**
+ * A node inside a trie structure
+ */
 typedef struct lsm_trie_node {
   lsm_bt bt;
   lsm_str skip;
-  char c;
+  void *data;
 } lsm_trie_node;
 
+/**
+ * Allocate and initialize a new trie node
+ *
+ * @param ptr pointer to store new node pointer
+ */
+lsm_error lsm_trie_node_init(lsm_trie_node **ptr);
+
+/**
+ * Deallocate a trie node
+ *
+ * @param node node to deallocate
+ */
+void lsm_trie_node_free(lsm_trie_node *node);
+
+struct lsm_trie {
+  lsm_trie_node *root;
+  uint64_t size;
+};
+
 #endif
diff --git a/lsm/src/bt/lsm_bt.c b/lsm/src/bt/lsm_bt.c
index da08cbd..d5b2895 100644
--- a/lsm/src/bt/lsm_bt.c
+++ b/lsm/src/bt/lsm_bt.c
@@ -130,3 +130,20 @@ lsm_error lsm_bt_remove(void **out, lsm_bt *bt, char key) {
 
   return lsm_error_ok;
 }
+
+lsm_error lsm_bt_replace(void **out, lsm_bt *bt, char key, void *data) {
+  lsm_bt_node *node = bt->root;
+
+  while ((node != NULL) && (node->key != key)) {
+    node = key < node->key ? node->left : node->right;
+  }
+
+  if (node == NULL) {
+    return lsm_error_not_found;
+  }
+
+  *out = node->data;
+  node->data = data;
+
+  return lsm_error_ok;
+}
diff --git a/lsm/src/str/lsm_str.c b/lsm/src/str/lsm_str.c
index 38bce13..0e4e75b 100644
--- a/lsm/src/str/lsm_str.c
+++ b/lsm/src/str/lsm_str.c
@@ -5,6 +5,8 @@
 #include "lsm.h"
 #include "lsm/str_internal.h"
 
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
 lsm_error lsm_str_init_zero(lsm_str **ptr) {
   lsm_str *str = calloc(1, sizeof(lsm_str));
 
@@ -59,3 +61,118 @@ void lsm_str_free(lsm_str *str) {
 }
 
 uint64_t lsm_str_len(lsm_str *str) { return str->len; }
+
+const char *lsm_str_ptr(lsm_str *str) {
+  if (str->len <= 8) {
+    return str->data.val;
+  } else {
+    return str->data.ptr;
+  }
+}
+
+char lsm_str_char(lsm_str *str, uint64_t index) {
+  if (str->len <= 8) {
+    return str->data.val[index];
+  } else {
+    return str->data.ptr[index];
+  }
+}
+
+lsm_error lsm_str_substr(lsm_str *out, lsm_str *str, uint64_t start,
+                         uint64_t end) {
+  // Clamp the range to the source string so nothing is read past its end; an
+  // empty or inverted range produces a zero-length string
+  if (end > str->len) {
+    end = str->len;
+  }
+
+  uint64_t len = start < end ? end - start : 0;
+  const char *str_ptr = lsm_str_ptr(str);
+
+  if (len <= 8) {
+    lsm_str_zero(out);
+
+    // start may lie past the buffer when len is 0, so don't even form the
+    // source address in that case
+    if (len > 0) {
+      memcpy(out->data.val, &str_ptr[start], len);
+    }
+  } else {
+    char *buf = malloc(len * sizeof(char));
+
+    if (buf == NULL) {
+      return lsm_error_failed_alloc;
+    }
+
+    memcpy(buf, &str_ptr[start], len);
+
+    lsm_str_zero(out);
+    out->data.ptr = buf;
+  }
+
+  out->len = len;
+
+  return lsm_error_ok;
+}
+
+uint64_t lsm_str_cmp(lsm_str *s1, uint64_t s1_offset, lsm_str *s2,
+                     uint64_t s2_offset) {
+  // An offset at or past the end of either string leaves nothing to compare.
+  // Checking this first also keeps the subtractions below from wrapping.
+  if ((s1_offset >= s1->len) || (s2_offset >= s2->len)) {
+    return 0;
+  }
+
+  uint64_t index = 0;
+  uint64_t max_len = MIN(s1->len - s1_offset, s2->len - s2_offset);
+
+  while ((index < max_len) && (lsm_str_char(s1, s1_offset + index) ==
+                               lsm_str_char(s2, s2_offset + index))) {
+    index++;
+  }
+
+  return index;
+}
+
+lsm_error lsm_str_truncate(lsm_str *s, uint64_t new_len) {
+  if (new_len >= s->len) {
+    return lsm_error_ok;
+  }
+
+  // Strings of up to 8 bytes are stored inline in data.val, so shrinking one
+  // only needs the length update below. Only heap-backed strings need work.
+  if (s->len > 8) {
+    char *old_buf = s->data.ptr;
+
+    if (new_len <= 8) {
+      // data.ptr and data.val share storage, hence the saved pointer
+      memcpy(s->data.val, old_buf, new_len);
+      free(old_buf);
+    } else {
+      char *buf = malloc(new_len * sizeof(char));
+
+      if (buf == NULL) {
+        return lsm_error_failed_alloc;
+      }
+
+      memcpy(buf, old_buf, new_len);
+      free(old_buf);
+
+      s->data.ptr = buf;
+    }
+  }
+
+  s->len = new_len;
+
+  return lsm_error_ok;
+}
+
+lsm_error lsm_str_split(lsm_str *s, lsm_str *s2, uint64_t index) {
+  lsm_error res = lsm_str_substr(s2, s, index, s->len);
+
+  if (res != lsm_error_ok) {
+    return res;
+  }
+
+  return lsm_str_truncate(s, index);
+}
diff --git a/lsm/src/trie/lsm_trie.c b/lsm/src/trie/lsm_trie.c
index 568decb..c7708df 100644
--- a/lsm/src/trie/lsm_trie.c
+++ b/lsm/src/trie/lsm_trie.c
@@ -1 +1,164 @@
+#include <stdlib.h>
+
+#include "lsm.h"
 #include "lsm/trie_internal.h"
+
+lsm_error lsm_trie_node_init(lsm_trie_node **ptr) {
+  lsm_trie_node *node = calloc(1, sizeof(lsm_trie_node));
+
+  if (node == NULL) {
+    return lsm_error_failed_alloc;
+  }
+
+  *ptr = node;
+
+  return lsm_error_ok;
+}
+
+lsm_error lsm_trie_init(lsm_trie **ptr) {
+  lsm_trie *trie = calloc(1, sizeof(lsm_trie));
+
+  if (trie == NULL) {
+    return lsm_error_failed_alloc;
+  }
+
+  lsm_trie_node *root;
+  lsm_error res = lsm_trie_node_init(&root);
+
+  if (res != lsm_error_ok) {
+    // The trie struct isn't handed to the caller on failure, so release it
+    free(trie);
+
+    return res;
+  }
+
+  trie->root = root;
+  *ptr = trie;
+
+  return lsm_error_ok;
+}
+
+lsm_error lsm_trie_insert(lsm_trie *trie, lsm_str *key, void *data) {
+  // NOTE(review): trie->size is not updated anywhere in this function; confirm
+  // whether successful inserts are supposed to increment it.
+
+  // NULL is not allowed as a data value, as it's used to indicate a lack of
+  // data
+  if (data == NULL) {
+    return lsm_error_null_value;
+  }
+
+  uint64_t key_len = lsm_str_len(key);
+
+  // Empty string is represented by the root
+  if (key_len == 0) {
+    if (trie->root->data == NULL) {
+      trie->root->data = data;
+
+      return lsm_error_ok;
+    } else {
+      return lsm_error_already_present;
+    }
+  }
+
+  uint64_t index = 0;
+  lsm_trie_node *node = trie->root;
+  lsm_trie_node *next_node;
+  lsm_error res;
+
+  while (index < key_len) {
+    char c = lsm_str_char(key, index);
+    res = lsm_bt_search((void **)&next_node, &node->bt, c);
+
+    // No child is present yet for this character, so we can insert the string
+    // here
+    if (res == lsm_error_not_found) {
+      lsm_trie_node *new_node;
+      res = lsm_trie_node_init(&new_node);
+
+      if (res != lsm_error_ok) {
+        return res;
+      }
+
+      new_node->data = data;
+      res = lsm_str_substr(&new_node->skip, key, index + 1, key_len);
+
+      if (res == lsm_error_ok) {
+        res = lsm_bt_insert(&node->bt, c, new_node);
+      }
+
+      if (res != lsm_error_ok) {
+        // The node never became part of the trie. Truncating to zero releases
+        // whatever buffer the skip may hold.
+        lsm_str_truncate(&new_node->skip, 0);
+        free(new_node);
+      }
+
+      return res;
+    }
+
+    index++;
+
+    // We compare the remaining part of the key with the node's skip. If cmp is
+    // less than the length of the skip, we know they differ and the edge should
+    // be split.
+    uint64_t cmp = lsm_str_cmp(key, index, &next_node->skip, 0);
+    uint64_t skip_len = lsm_str_len(&next_node->skip);
+
+    if (cmp < skip_len) {
+      lsm_trie_node *split_node;
+      res = lsm_trie_node_init(&split_node);
+
+      if (res != lsm_error_ok) {
+        return res;
+      }
+
+      // The skip is cut into three parts: the first cmp characters stay on the
+      // edge towards split_node, the character at cmp becomes the old node's
+      // key inside split_node, and the remainder becomes the old node's skip.
+      // Everything that can fail happens before the trie itself is modified.
+      char split_c = lsm_str_char(&next_node->skip, cmp);
+      lsm_str rest = {0};
+
+      res = lsm_str_substr(&split_node->skip, &next_node->skip, 0, cmp);
+
+      if (res == lsm_error_ok) {
+        res = lsm_str_substr(&rest, &next_node->skip, cmp + 1, skip_len);
+      }
+
+      // The old child node now becomes the child of split_node
+      if (res == lsm_error_ok) {
+        res = lsm_bt_insert(&split_node->bt, split_c, next_node);
+      }
+
+      if (res != lsm_error_ok) {
+        lsm_str_truncate(&rest, 0);
+        lsm_str_truncate(&split_node->skip, 0);
+        free(split_node);
+
+        return res;
+      }
+
+      // split_node replaces the original node as the new child node. The
+      // search above found c in this tree, so the key is present.
+      lsm_trie_node *bottom_node;
+      lsm_bt_replace((void **)&bottom_node, &node->bt, c, split_node);
+
+      // Release the old skip before handing over ownership of the remainder
+      lsm_str_truncate(&bottom_node->skip, 0);
+      bottom_node->skip = rest;
+
+      next_node = split_node;
+    }
+
+    node = next_node;
+    index += cmp;
+  }
+
+  if (node->data != NULL) {
+    return lsm_error_already_present;
+  }
+
+  node->data = data;
+
+  return lsm_error_ok;
+}