From 6938c29725bd6f078bba6c1eb9b7dc762c5ed153 Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Sat, 14 Oct 2023 15:57:33 +0200 Subject: [PATCH] feat(lsm): implement a simple trie remove --- lsm/include/lsm/bt.h | 10 +++++ lsm/src/bt/lsm_bt.c | 38 ++++++++++++------ lsm/src/trie/lsm_trie.c | 87 +++++++++++++++++++++++++++++++++++++++-- lsm/test/bt/bt.c | 6 +++ lsm/test/trie/fuzzy.h | 40 +++++++++---------- 5 files changed, 147 insertions(+), 34 deletions(-) diff --git a/lsm/include/lsm/bt.h b/lsm/include/lsm/bt.h index a0995a1..2e30ae5 100644 --- a/lsm/include/lsm/bt.h +++ b/lsm/include/lsm/bt.h @@ -20,6 +20,16 @@ lsm_error lsm_bt_init(lsm_bt **ptr); */ void lsm_bt_free(lsm_bt *bt); +/** + * Remove the binary tree's entire contents, but keep the struct allocated. + */ +void lsm_bt_clear(lsm_bt *bt); + +/** + * Return the size of the binary tree + */ +uint64_t lsm_bt_size(lsm_bt *bt); + /** * Search for the data stored behind the given key. * diff --git a/lsm/src/bt/lsm_bt.c b/lsm/src/bt/lsm_bt.c index d5b2895..69fa895 100644 --- a/lsm/src/bt/lsm_bt.c +++ b/lsm/src/bt/lsm_bt.c @@ -18,16 +18,16 @@ lsm_error lsm_bt_node_init(lsm_bt_node **ptr, const char key, void *data) { void lsm_bt_node_free(lsm_bt_node *node) { free(node); } -void lsm_bt_node_free_recursive(lsm_bt_node *node) { +void lsm_bt_node_free_tree(lsm_bt_node *node) { if (node->left != NULL) { - lsm_bt_node_free_recursive(node->left); + lsm_bt_node_free_tree(node->left); + lsm_bt_node_free(node->left); } if (node->right != NULL) { - lsm_bt_node_free_recursive(node->right); + lsm_bt_node_free_tree(node->right); + lsm_bt_node_free(node->right); } - - lsm_bt_node_free(node); } lsm_error lsm_bt_init(lsm_bt **ptr) { @@ -42,14 +42,23 @@ lsm_error lsm_bt_init(lsm_bt **ptr) { return lsm_error_ok; } -void lsm_bt_free(lsm_bt *bt) { +void lsm_bt_clear(lsm_bt *bt) { if (bt->root != NULL) { - lsm_bt_node_free_recursive(bt->root); - } + lsm_bt_node_free_tree(bt->root); + lsm_bt_node_free(bt->root); + bt->root = NULL; + 
bt->size = 0; + } +} + +void lsm_bt_free(lsm_bt *bt) { + lsm_bt_clear(bt); free(bt); } +uint64_t lsm_bt_size(lsm_bt *bt) { return bt->size; } + lsm_error lsm_bt_insert(lsm_bt *bt, char key, void *data) { lsm_bt_node **dest = &bt->root; @@ -85,7 +94,9 @@ lsm_error lsm_bt_search(void **out, lsm_bt *bt, char key) { return lsm_error_not_found; } - *out = node->data; + if (out != NULL) { + *out = node->data; + } return lsm_error_ok; } @@ -105,7 +116,9 @@ lsm_error lsm_bt_remove(void **out, lsm_bt *bt, char key) { return lsm_error_not_found; } - *out = (*dest)->data; + if (out != NULL) { + *out = (*dest)->data; + } bt->size--; if (((*dest)->left != NULL) && ((*dest)->right != NULL)) { @@ -142,7 +155,10 @@ lsm_error lsm_bt_replace(void **out, lsm_bt *bt, char key, void *data) { return lsm_error_not_found; } - *out = node->data; + if (out != NULL) { + *out = node->data; + } + node->data = data; return lsm_error_ok; diff --git a/lsm/src/trie/lsm_trie.c b/lsm/src/trie/lsm_trie.c index a8b7d82..b6e190a 100644 --- a/lsm/src/trie/lsm_trie.c +++ b/lsm/src/trie/lsm_trie.c @@ -16,6 +16,12 @@ lsm_error lsm_trie_node_init(lsm_trie_node **ptr) { return lsm_error_ok; } +void lsm_trie_node_free(lsm_trie_node *node) { + lsm_bt_clear(&node->bt); + lsm_str_zero(&node->skip); + free(node); +} + lsm_error lsm_trie_init(lsm_trie **ptr) { lsm_trie *trie = calloc(1, sizeof(lsm_trie)); @@ -142,12 +148,14 @@ lsm_error lsm_trie_insert(lsm_trie *trie, lsm_str *key, void *data) { return lsm_error_ok; } -lsm_error lsm_trie_search(void **data, lsm_trie *trie, lsm_str *key) { +lsm_error lsm_trie_search(void **out, lsm_trie *trie, lsm_str *key) { uint64_t key_len = lsm_str_len(key); if (key_len == 0) { if (trie->root->data != NULL) { - *data = trie->root->data; + if (out != NULL) { + *out = trie->root->data; + } return lsm_error_ok; } else { @@ -185,7 +193,80 @@ lsm_error lsm_trie_search(void **data, lsm_trie *trie, lsm_str *key) { return lsm_error_not_found; } - *data = node->data; + if (out != NULL) 
{ + *out = node->data; + } + + return lsm_error_ok; +} + +lsm_error lsm_trie_remove(void **data, lsm_trie *trie, lsm_str *key) { + uint64_t key_len = lsm_str_len(key); + + if (key_len == 0) { + if (trie->root->data != NULL) { + if (data != NULL) { + *data = trie->root->data; + } + + trie->root->data = NULL; + trie->size--; + + return lsm_error_ok; + } else { + return lsm_error_not_found; + } + } + + uint64_t index = 0; + lsm_trie_node *parent = trie->root; + lsm_trie_node *child; + lsm_error res; + char c; + + while (index < key_len) { + c = lsm_str_char(key, index); + res = lsm_bt_search((void **)&child, &parent->bt, c); + + if (res != lsm_error_ok) { + return res; + } + + index++; + + uint64_t cmp = lsm_str_cmp(key, index, &child->skip, 0); + + // If we end in the middle of an edge, we definitely haven't found the node + if (cmp != lsm_str_len(&child->skip)) { + return lsm_error_not_found; + } + + index += cmp; + + // This context is needed for the removal + if (index < key_len) { + parent = child; + } + } + + if (child->data == NULL) { + return lsm_error_not_found; + } + + // Child is the node we wish to delete + if (data != NULL) { + *data = child->data; + } + + child->data = NULL; + + // We only remove child if it has no children of its own + if (lsm_bt_size(&child->bt) == 0) { + lsm_bt_remove(NULL, &parent->bt, c); + lsm_trie_node_free(child); + } + + trie->size--; return lsm_error_ok; } diff --git a/lsm/test/bt/bt.c b/lsm/test/bt/bt.c index f96cf99..fdff839 100644 --- a/lsm/test/bt/bt.c +++ b/lsm/test/bt/bt.c @@ -54,6 +54,8 @@ void test_insert_multiple() { TEST_CHECK(lsm_bt_insert(bt, chars[i], (void *)(i + 1)) == lsm_error_ok); } + TEST_CHECK(lsm_bt_size(bt) == char_count); + void *data; for (size_t i = 0; i < char_count; i++) { TEST_CHECK(lsm_bt_insert(bt, chars[i], (void *)(i + 1)) == lsm_error_already_present); @@ -68,11 +70,13 @@ void test_remove_root() { BT_INIT(); TEST_CHECK(lsm_bt_insert(bt, 'a', (void *)1) == lsm_error_ok); + 
TEST_CHECK(lsm_bt_size(bt) == 1); void *data; TEST_CHECK(lsm_bt_remove(&data, bt, 'a') == lsm_error_ok); TEST_CHECK(data == (void *)1); TEST_CHECK(bt->root == NULL); + TEST_CHECK(lsm_bt_size(bt) == 0); lsm_bt_free(bt); } @@ -95,6 +99,8 @@ void test_remove_multiple() { TEST_CHECK(data == (void *)6); TEST_CHECK(lsm_bt_remove(&data, bt, 'e') == lsm_error_not_found); + TEST_CHECK(lsm_bt_size(bt) == char_count - 2); + lsm_bt_free(bt); } diff --git a/lsm/test/trie/fuzzy.h b/lsm/test/trie/fuzzy.h index 6cd0a07..40850e9 100644 --- a/lsm/test/trie/fuzzy.h +++ b/lsm/test/trie/fuzzy.h @@ -122,8 +122,7 @@ int fuzzy_test_trie_seed(FuzzyConfig conf) { lsm_trie *trie; lsm_trie_init(&trie); - bool changed; - lsm_error status; + lsm_error res; // 0: success // 1: invalid add @@ -134,13 +133,13 @@ int fuzzy_test_trie_seed(FuzzyConfig conf) { // Add all strings to trie, checking for duplicates for (int i = 0; i < conf.word_count; i++) { - status = lsm_trie_insert(trie, &matrix[i], (void **)1); + res = lsm_trie_insert(trie, &matrix[i], (void **)1); // if changed is false, *contains_dedupped[i] should be true, as changed // can only be false if the string is already contained in the trie. if // changed is true, *contains_dedupped[i] should be false, as the string // cannot be in the trie yet. 
- if (status == lsm_error_ok && *contains_dedupped[i]) { + if (res == lsm_error_ok && *contains_dedupped[i]) { exit_code = 1; goto END; } @@ -159,26 +158,27 @@ int fuzzy_test_trie_seed(FuzzyConfig conf) { } // Remove all strings again, again taking duplicates into consideration - /* for (int i = 0; i < conf.word_count; i++) { */ - /* changed = remove_func(ct, matrix[i]); */ + for (int i = 0; i < conf.word_count; i++) { + res = lsm_trie_remove(NULL, trie, &matrix[i]); - /* // The string shouldn't be in the trie, yet another add operation */ - /* // says it added it as well */ - /* if (changed != *contains_dedupped[i]) { */ - /* exit_code = 2; */ - /* goto END; */ - /* } */ + // The string is not (or no longer) in the trie, yet the remove operation + // reports that it removed it + if (res == lsm_error_ok && !*contains_dedupped[i]) { + exit_code = 2; + goto END; + } - /* if (*contains_dedupped[i]) { */ - /* *contains_dedupped[i] = false; */ - /* size--; */ - /* } */ - /* } */ + if (*contains_dedupped[i]) { + *contains_dedupped[i] = false; + size--; + } + } // Finally, check that the trie is completely empty - /* if (size_func(ct) != 0) { */ - /* exit_code = 4; */ - /* } */ + if (lsm_trie_size(trie) != 0) { + printf("%lu %lu\n", lsm_trie_size(trie), size); + exit_code = 4; + } END: /* trie_free(ct); */