From 88ea0db2ee1dd0e3155ea307c58db1f677914550 Mon Sep 17 00:00:00 2001 From: Chewing_Bever Date: Tue, 29 Nov 2022 15:08:07 +0100 Subject: [PATCH] feat: allow skips up to 8 characters long --- Makefile | 4 + include/trie.h | 1 + src/trie.c | 268 ++++++++++++++++++++++++++++-------------------- src/trie_node.c | 147 ++++++++++++-------------- 4 files changed, 224 insertions(+), 196 deletions(-) diff --git a/Makefile b/Makefile index 9367cfa..e194e57 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,10 @@ prod: cmake-release run: build @ LANDER_DATA_DIR=data LANDER_BASE_URL=http://localhost:18080/ LANDER_API_KEY=test ./build/Debug/lander +.PHONY: gdb +gdb: build + @ LANDER_DATA_DIR=data LANDER_BASE_URL=http://localhost:18080/ LANDER_API_KEY=test gdb --args ./build/Debug/lander + .PHONY: clean clean: @ rm -rf '$(BUILD_DIR)' compile_commands.json diff --git a/include/trie.h b/include/trie.h index ca3ae9a..a7386e8 100644 --- a/include/trie.h +++ b/include/trie.h @@ -4,6 +4,7 @@ #define ALPHABET_SIZE 256 #define DELIMITER '\0' #define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define TRIE_MAX_SKIP_SIZE 8 /** * The implementation of a Ternary Trie. 
diff --git a/src/trie.c b/src/trie.c index 52708e9..cfb84e6 100644 --- a/src/trie.c +++ b/src/trie.c @@ -21,7 +21,7 @@ typedef struct ttrie { */ Trie *trie_init() { Trie *trie = calloc(1, sizeof(Trie)); - trie->root = ttnode_init(); + trie->root = tnode_init(); pthread_rwlock_init(&trie->lock, NULL); return trie; @@ -33,11 +33,11 @@ Trie *trie_init() { * @param trie trie to free */ void trie_free(Trie *trie) { - ttnode_free(trie->root); + tnode_free(trie->root); free(trie); } -bool trie_add_internal(Trie *trie, const char *key, Entry *entry); +bool trie_add_no_lock(Trie *trie, const char *key, Entry *entry); EntryType entry_type_from_char(char c) { switch (c) { @@ -116,7 +116,7 @@ int trie_populate(Trie *trie, const char *file_path) { buffer[j] = '\0'; entry = entry_new(type, buffer + i + 3); - trie_add_internal(trie, buffer, entry); + trie_add_no_lock(trie, buffer, entry); entries++; } @@ -136,7 +136,7 @@ SearchResult trie_search_node(Trie *trie, const char *key) { // Edge case for empty string if (key[0] == DELIMITER) { - if (trie->root->type == 1) { + if (trie->root->represents) { out.child = trie->root; } @@ -144,38 +144,53 @@ SearchResult trie_search_node(Trie *trie, const char *key) { } size_t i = 0; + size_t offset; TrieNode **node_ptr = &(trie->root); TrieNode **child_ptr; do { - child_ptr = ttnode_search(*node_ptr, key[i], false); + child_ptr = tnode_search(*node_ptr, key[i], false); // We don't have to check whether *node_ptr is NULL, because if it was // NULL, it wouldn't be in the binary tree. - if (child_ptr == NULL || *child_ptr == NULL) { + if (child_ptr == NULL) { return out; } i++; + offset = 0; - if (key[i] == DELIMITER || (*child_ptr)->type == 2) { - break; + // We iterate over each character on the edge and compare it to the string. + while (offset < (*child_ptr)->string_len) { + // Our string ends in the middle of an edge, so it's definitely not in + // the trie. 
+ if (key[i + offset] == DELIMITER) { + return out; + } + + // We compare each character with the characters in the skipped + // substring. If they don't match, we know the string isn't in the + // trie. + if (key[i + offset] != ((*child_ptr)->string[offset])) { + return out; + } + + offset++; } - node_ptr = child_ptr; - } while (1); + i += offset; - if ((*child_ptr)->type == 2) { - if (key[i] != DELIMITER && strcmp(key + i, (*child_ptr)->ptr.string) == 0) { - out.child = *child_ptr; - out.parent = *node_ptr; + if (key[i] != DELIMITER) { + node_ptr = child_ptr; } - } - // Here we know we've traversed through the entire string and have arrived at - // a node that isn't a full leaf - else if ((*child_ptr)->type == 1) { - out.child = *child_ptr; + } while (key[i] != DELIMITER); + + // At this point, we've either arrived at an empty child, or traversed through + // the entire string. Therefore, all we have to do is check whether we're at + // the end of the string and if node represents a string. 
+ if (key[i] == DELIMITER && (*child_ptr)->represents) { out.parent = *node_ptr; + out.child = *child_ptr; } return out; @@ -212,87 +227,114 @@ Entry *trie_search(Trie *trie, const char *key) { * @return true if the string wasn't present in the trie and thus added, false * otherwise */ -bool trie_add_internal(Trie *trie, const char *string, +bool trie_add_no_lock(Trie *trie, const char *string, Entry *entry) { // Edge case for empty string if (string[0] == DELIMITER) { - if (trie->root->type == 0) { - trie->root->type = 1; - trie->root->entry = entry; - trie->size++; - - return true; + if (trie->root->represents) { + return false; } - return false; + trie->root->represents = true; + trie->root->entry = entry; + trie->size++; + return true; } size_t i = 0; + uint8_t offset; TrieNode **node_ptr = &(trie->root); - TrieNode **new_node_ptr; + TrieNode **child_node_ptr; + TrieNode *child_node; do { - new_node_ptr = ttnode_search(*node_ptr, string[i], true); + offset = 0; + child_node_ptr = tnode_search(*node_ptr, string[i], true); - // ttnode_search will only return NULL with create true if the node to look - // in represents a full leaf. Therefore, we split the node and restart the - // iteration. - if (new_node_ptr == NULL) { - // It's possible we've ended up in the full leaf node that represents this - // string - if (strcmp(string + i, (*node_ptr)->ptr.string) == 0) { - return false; + // We've reached a NULL child, so we add the remaining part of the string here + if (*child_node_ptr == NULL) { + child_node = tnode_init(); + + while (offset < TRIE_MAX_SKIP_SIZE && string[i + 1 + offset] != DELIMITER) { + child_node->string[offset] = string[i + 1 + offset]; + offset++; } - ttnode_split(*node_ptr); - continue; - } + child_node->string_len = offset; + *child_node_ptr = child_node; - node_ptr = new_node_ptr; + // If the remaining part of the string is still longer than the maximum + // allowed skip length, we continue through the loop. 
The next iteration + will enter this if statement again, and perform the same loop, until + // the string is fully added to the trie. + if (string[i + 1 + offset] != DELIMITER) { + node_ptr = child_node_ptr; + i += offset + 1; - // The search function has added the character to the node - i++; - - // The next node in the string's path doesn't exist yet, so we add it to the - // trie - if (*node_ptr == NULL) { - TrieNode *new_node = ttnode_init(); - - // If there's a remaining part of the string, we add it to the leaf - if (string[i] != DELIMITER) { - ttnode_set_string(new_node, string + i); - } else { - new_node->type = 1; + continue; } - new_node->entry = entry; - - *node_ptr = new_node; + child_node->represents = true; + child_node->entry = entry; trie->size++; - return true; } + + i++; + + while (offset < (*child_node_ptr)->string_len) { + // String no longer aligns with edge, so we have to split + if (string[i + offset] != (*child_node_ptr)->string[offset]) { + TrieNode *split_node = tnode_init(); + child_node = *child_node_ptr; + + // New string of the split node is the prefix that we were able + // to skip + if (offset > 0) { + memcpy(split_node->string, child_node->string, offset); + split_node->string_len = offset; + } + + // split_node replaces child_node as the child of node + *child_node_ptr = split_node; + TrieNode **new_node_ptr = tnode_search(split_node, child_node->string[offset], true); + *new_node_ptr = child_node; + + // child_node has now become a child of split_node, so we update its + // string accordingly by removing the skipped prefix + the one + // character that's already stored by being a child of split_node + /* char *old_string = child_node->string.ptr; */ + uint8_t new_skip_len = child_node->string_len - (offset + 1); + + if (new_skip_len > 0) { + char old_string[TRIE_MAX_SKIP_SIZE]; + memcpy(old_string, child_node->string + offset + 1, new_skip_len); + memcpy(child_node->string, old_string, new_skip_len); + } + + 
child_node->string_len = new_skip_len; + + // The while loop will exit either way after this has happened, as + // child_node is now split_node and split_node's len is already set to + // offset. + break; + } + + offset++; + } + + node_ptr = child_node_ptr; + i += offset; } while (string[i] != DELIMITER); - // If we've arrived here, we've traversed through the entire string and have - // arrived at a node that already exists. - - // The existing node is a full leaf, so we split it and make it - // represent our new string. - if ((*node_ptr)->type == 2) { - ttnode_split(*node_ptr); - } - // The string is already in the trie - else if ((*node_ptr)->type == 1) { + if ((*child_node_ptr)->represents) { return false; } - (*node_ptr)->type = 1; - (*node_ptr)->entry = entry; - + (*child_node_ptr)->represents = true; + (*child_node_ptr)->entry = entry; trie->size++; - return true; } @@ -326,7 +368,7 @@ bool trie_add_persistent(Trie *trie, const char *key, // This function *should* always return true. Otherwise, the function would've // exited because the string was found in the trie. 
- return trie_add_internal(trie, key, entry); + return trie_add_no_lock(trie, key, entry); } bool trie_add(Trie *trie, const char *key, Entry *entry) { @@ -381,57 +423,57 @@ char *trie_add_random(Trie *trie, Entry *entry, bool secure) { * @param string string to remove * @return true if the string was in the trie and thus removed, false otherwise */ -bool trie_remove(Trie *trie, const char *string) { - pthread_rwlock_wrlock(&trie->lock); +/* bool trie_remove(Trie *trie, const char *string) { */ +/* pthread_rwlock_wrlock(&trie->lock); */ - bool return_value = false; +/* bool return_value = false; */ - SearchResult res = trie_search_node(trie, string); +/* SearchResult res = trie_search_node(trie, string); */ - if (res.child == NULL) { - goto end; - } +/* if (res.child == NULL) { */ +/* goto end; */ +/* } */ - trie->size--; - return_value = true; +/* trie->size--; */ +/* return_value = true; */ - if (res.parent != NULL) { - // We're removing a full leaf, so we calculate the offset of the character - // to remove from the parent - if (res.child->type == 2) { - size_t str_len = strlen(string); - size_t suffix_len = strlen(res.child->ptr.string); +/* if (res.parent != NULL) { */ +/* // We're removing a full leaf, so we calculate the offset of the character */ +/* // to remove from the parent */ +/* if (res.child->type == 2) { */ +/* size_t str_len = strlen(string); */ +/* size_t suffix_len = strlen(res.child->ptr.string); */ - ttnode_remove(res.parent, string[str_len - suffix_len - 1]); - } - // In the other case, the character to remove from the parent is the last - // character of the string - else if (res.child->size == 0) { - size_t i = 0; +/* tnode_remove(res.parent, string[str_len - suffix_len - 1]); */ +/* } */ +/* // In the other case, the character to remove from the parent is the last */ +/* // character of the string */ +/* else if (res.child->size == 0) { */ +/* size_t i = 0; */ - while (string[i + 1] != DELIMITER) { - i++; - } +/* while (string[i + 1] != 
DELIMITER) { */ +/* i++; */ +/* } */ - ttnode_remove(res.parent, string[i]); - } else { - res.child->type = 0; +/* tnode_remove(res.parent, string[i]); */ +/* } else { */ +/* res.child->type = 0; */ - goto end; - } +/* goto end; */ +/* } */ - ttnode_free(res.child); - } - // We're in the root here - else { - res.child->type = 0; - } +/* tnode_free(res.child); */ +/* } */ +/* // We're in the root here */ +/* else { */ +/* res.child->type = 0; */ +/* } */ -end: - pthread_rwlock_unlock(&trie->lock); +/* end: */ +/* pthread_rwlock_unlock(&trie->lock); */ - return return_value; -} +/* return return_value; */ +/* } */ /** * Return the current size of the given trie. diff --git a/src/trie_node.c b/src/trie_node.c index 9f7e86c..1dbcb8f 100644 --- a/src/trie_node.c +++ b/src/trie_node.c @@ -8,10 +8,10 @@ * Represents a node of the binary tree contained within each non-leaf * TrieNode. */ -typedef struct ttinode { - struct ttinode *left; - struct ttinode *right; - struct ttnode *next; +typedef struct tinode { + struct tinode *left; + struct tinode *right; + struct tnode *next; char key; } TrieInnerNode; @@ -26,25 +26,21 @@ typedef struct ttinode { * to be stored as a single node. Its size will be zero, represents its true, * and its string pointer is initialized. 
*/ -typedef struct ttnode { - union { - TrieInnerNode *root; - char *string; - } ptr; +typedef struct tnode { Entry *entry; - // What type of node this is - // 0: regular non-representing node - // 1: regular representing node - // 2: full leaf - uint8_t type; - // Dependent on type - // 0, 1: size of underlying binary tree - // 2: length of string - uint8_t size; + + TrieInnerNode* tree; + uint8_t tree_size; + + // Skips are at most 8 characters, and are stored in the nodes + char string[TRIE_MAX_SKIP_SIZE]; + uint8_t string_len: 4; + + bool represents : 1; } TrieNode; // Required for recursively freeing tree structure -void ttnode_free(TrieNode *node); +void tnode_free(TrieNode *node); /** * Allocate and initialize a new TrieInnerNode representing a given @@ -53,7 +49,7 @@ void ttnode_free(TrieNode *node); * @param c character to represent * @return pointer to newly allocated struct */ -TrieInnerNode *ttinode_init(char c) { +TrieInnerNode *tinode_init(char c) { TrieInnerNode *node = calloc(1, sizeof(TrieInnerNode)); node->key = c; @@ -65,7 +61,15 @@ TrieInnerNode *ttinode_init(char c) { * * @return pointer to newly allocated struct */ -TrieNode *ttnode_init() { return calloc(1, sizeof(TrieNode)); } +TrieNode *tnode_init() { + TrieNode *node = calloc(1, sizeof(TrieNode)); + // calloc zeroes entry and tree, which are never assigned here + node->tree_size = 0; + node->string_len = 0; + node->represents = false; + + return node; +} /** * Free a TrieInnerNode and its underlying tree structure. 
This should @@ -74,17 +78,17 @@ TrieNode *ttnode_init() { return calloc(1, sizeof(TrieNode)); } * * @param node node whose tree to free */ -void ttinode_free_cascade(TrieInnerNode *node) { +void tinode_free_cascade(TrieInnerNode *node) { if (node->left != NULL) { - ttinode_free_cascade(node->left); + tinode_free_cascade(node->left); } if (node->right != NULL) { - ttinode_free_cascade(node->right); + tinode_free_cascade(node->right); } if (node->next != NULL) { - ttnode_free(node->next); + tnode_free(node->next); } free(node); @@ -95,11 +99,9 @@ void ttinode_free_cascade(TrieInnerNode *node) { * * @param node node to free */ -void ttnode_free(TrieNode *node) { - if (node->type == 2) { - free(node->ptr.string); - } else if (node->size != 0) { - ttinode_free_cascade(node->ptr.root); +void tnode_free(TrieNode *node) { + if (node->tree_size > 0) { + tinode_free_cascade(node->tree); } // TODO properly free entry @@ -110,18 +112,6 @@ void ttnode_free(TrieNode *node) { free(node); } -/** - * Add the string to the given node & set its type accordingely. - * - * @param node node to add string to - * @param string string to add - */ -void ttnode_set_string(TrieNode *node, const char *string) { - node->type = 2; - node->size = strlen(string); - node->ptr.string = strdup(string); -} - /** * This function performs a lookup in the underlying binary tree of the given * TrieNode. If found, the return value is a pointer to the memory @@ -140,26 +130,21 @@ void ttnode_set_string(TrieNode *node, const char *string) { * node represents a leaf with a string, because the struct and therefore the * address is created if it doesn't exist yet. 
*/ -TrieNode **ttnode_search(TrieNode *node, const char c, +TrieNode **tnode_search(TrieNode *node, const char c, bool create) { - // Full leafs will always return NULL - if (node->type == 2) { - return NULL; - } - // It can happen that the node has no initialized root yet - if (node->size == 0) { + if (node->tree_size == 0) { if (create) { - node->size++; - node->ptr.root = ttinode_init(c); + node->tree_size++; + node->tree = tinode_init(c); - return &node->ptr.root->next; + return &node->tree->next; } return NULL; } - TrieInnerNode *parent = node->ptr.root; + TrieInnerNode *parent = node->tree; TrieInnerNode *child; // Iterate through the tree until we either find the character or realize it's @@ -186,7 +171,7 @@ TrieNode **ttnode_search(TrieNode *node, const char c, // If create is true, we create the new node so that we can still return a // non-NULL pointer. if (create) { - TrieInnerNode *new_node = ttinode_init(c); + TrieInnerNode *new_node = tinode_init(c); if (c < parent->key) { parent->left = new_node; @@ -194,7 +179,7 @@ TrieNode **ttnode_search(TrieNode *node, const char c, parent->right = new_node; } - node->size++; + node->tree_size++; return &new_node->next; } @@ -208,37 +193,37 @@ TrieNode **ttnode_search(TrieNode *node, const char c, * * @param node node to split */ -void ttnode_split(TrieNode *node) { - TrieNode *new_node = ttnode_init(); - char key = node->ptr.string[0]; +/* void tnode_split(TrieNode *node) { */ +/* TrieNode *new_node = tnode_init(); */ +/* char key = node->ptr.string[0]; */ - // There's a chance the remaining string was only 1 character, meaning the new - // node doesn't have to store a string - if (node->ptr.string[1] != DELIMITER) { - ttnode_set_string(new_node, node->ptr.string + 1); - } else { - new_node->type = 1; - } +/* // There's a chance the remaining string was only 1 character, meaning the new */ +/* // node doesn't have to store a string */ +/* if (node->ptr.string[1] != DELIMITER) { */ +/* tnode_set_string(new_node, 
node->ptr.string + 1); */ +/* } else { */ +/* new_node->type = 1; */ +/* } */ - new_node->entry = node->entry; +/* new_node->entry = node->entry; */ - node->type = 0; - node->size = 0; - node->entry = NULL; +/* node->type = 0; */ +/* node->size = 0; */ +/* node->entry = NULL; */ - free(node->ptr.string); - node->ptr.string = NULL; +/* free(node->ptr.string); */ +/* node->ptr.string = NULL; */ - // Initialize node's binary tree with the correct character - TrieNode **node_ptr = ttnode_search(node, key, true); - *node_ptr = new_node; -} +/* // Initialize node's binary tree with the correct character */ +/* TrieNode **node_ptr = tnode_search(node, key, true); */ +/* *node_ptr = new_node; */ +/* } */ /* * Remove the given character from a TrieInnerNode's subtree. The * function assumes the character is indeed in the subtree. */ -void ttinode_remove(TrieInnerNode *node, const char c) { +void tinode_remove(TrieInnerNode *node, const char c) { TrieInnerNode **to_remove_ptr = &node; // We use pointers to pointers here so we can later free the removed node @@ -311,12 +296,8 @@ void ttinode_remove(TrieInnerNode *node, const char c) { * @param node node to remove character from * @param c character to remove */ -void ttnode_remove(TrieNode *node, const char c) { - ttinode_remove(node->ptr.root, c); +void tnode_remove(TrieNode *node, const char c) { + tinode_remove(node->tree, c); - node->size--; - - if (node->size == 0) { - node->ptr.root = NULL; - } + node->tree_size--; }