feat: allow skips up to 8 characters long

trie-skips
Jef Roosens 2022-11-29 15:08:07 +01:00
parent 4bcdd5c4d9
commit 88ea0db2ee
Signed by: Jef Roosens
GPG Key ID: B75D4F293C7052DB
4 changed files with 224 additions and 196 deletions

View File

@ -34,6 +34,10 @@ prod: cmake-release
run: build run: build
@ LANDER_DATA_DIR=data LANDER_BASE_URL=http://localhost:18080/ LANDER_API_KEY=test ./build/Debug/lander @ LANDER_DATA_DIR=data LANDER_BASE_URL=http://localhost:18080/ LANDER_API_KEY=test ./build/Debug/lander
.PHONY: gdb
gdb: build
@ LANDER_DATA_DIR=data LANDER_BASE_URL=http://localhost:18080/ LANDER_API_KEY=test gdb --args ./build/Debug/lander
.PHONY: clean .PHONY: clean
clean: clean:
@ rm -rf '$(BUILD_DIR)' compile_commands.json @ rm -rf '$(BUILD_DIR)' compile_commands.json

View File

@ -4,6 +4,7 @@
#define ALPHABET_SIZE 256 #define ALPHABET_SIZE 256
#define DELIMITER '\0' #define DELIMITER '\0'
#define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define TRIE_MAX_SKIP_SIZE 8
/** /**
* The implementation of a Ternary Trie. * The implementation of a Ternary Trie.

View File

@ -21,7 +21,7 @@ typedef struct ttrie {
*/ */
Trie *trie_init() { Trie *trie_init() {
Trie *trie = calloc(1, sizeof(Trie)); Trie *trie = calloc(1, sizeof(Trie));
trie->root = ttnode_init(); trie->root = tnode_init();
pthread_rwlock_init(&trie->lock, NULL); pthread_rwlock_init(&trie->lock, NULL);
return trie; return trie;
@ -33,11 +33,11 @@ Trie *trie_init() {
* @param trie trie to free * @param trie trie to free
*/ */
void trie_free(Trie *trie) { void trie_free(Trie *trie) {
ttnode_free(trie->root); tnode_free(trie->root);
free(trie); free(trie);
} }
bool trie_add_internal(Trie *trie, const char *key, Entry *entry); bool trie_add_no_lock(Trie *trie, const char *key, Entry *entry);
EntryType entry_type_from_char(char c) { EntryType entry_type_from_char(char c) {
switch (c) { switch (c) {
@ -116,7 +116,7 @@ int trie_populate(Trie *trie, const char *file_path) {
buffer[j] = '\0'; buffer[j] = '\0';
entry = entry_new(type, buffer + i + 3); entry = entry_new(type, buffer + i + 3);
trie_add_internal(trie, buffer, entry); trie_add_no_lock(trie, buffer, entry);
entries++; entries++;
} }
@ -136,7 +136,7 @@ SearchResult trie_search_node(Trie *trie, const char *key) {
// Edge case for empty string // Edge case for empty string
if (key[0] == DELIMITER) { if (key[0] == DELIMITER) {
if (trie->root->type == 1) { if (trie->root->represents) {
out.child = trie->root; out.child = trie->root;
} }
@ -144,38 +144,53 @@ SearchResult trie_search_node(Trie *trie, const char *key) {
} }
size_t i = 0; size_t i = 0;
size_t offset;
TrieNode **node_ptr = &(trie->root); TrieNode **node_ptr = &(trie->root);
TrieNode **child_ptr; TrieNode **child_ptr;
do { do {
child_ptr = ttnode_search(*node_ptr, key[i], false); child_ptr = tnode_search(*node_ptr, key[i], false);
// We don't have to check whether *node_ptr is NULL, because if it was // We don't have to check whether *node_ptr is NULL, because if it was
// NULL, it wouldn't be in the binary tree. // NULL, it wouldn't be in the binary tree.
if (child_ptr == NULL || *child_ptr == NULL) { if (child_ptr == NULL) {
return out; return out;
} }
i++; i++;
offset = 0;
if (key[i] == DELIMITER || (*child_ptr)->type == 2) { // We iterate over each character on the edge and compare it to the string.
break; while (offset < (*child_ptr)->string_len) {
// Our string ends in the middle of an edge, so it's definitely not in
// the trie.
if (key[i + offset] == DELIMITER) {
return out;
} }
// We compare each character with the characters in the skipped
// substring. If they don't match, we know the string isn't in the
// trie.
if (key[i + offset] != ((*child_ptr)->string[offset])) {
return out;
}
offset++;
}
i += offset;
if (key[i] != DELIMITER) {
node_ptr = child_ptr; node_ptr = child_ptr;
} while (1); }
} while (key[i] != DELIMITER);
if ((*child_ptr)->type == 2) { // At this point, we've either arrived at an empty child, or traversed through
if (key[i] != DELIMITER && strcmp(key + i, (*child_ptr)->ptr.string) == 0) { // the entire string. Therefore, all we have to do is check whether we're at
out.child = *child_ptr; // the end of the string and if node represents a string.
if (key[i] == DELIMITER && (*child_ptr)->represents) {
out.parent = *node_ptr; out.parent = *node_ptr;
}
}
// Here we know we've traversed through the entire string and have arrived at
// a node that isn't a full leaf
else if ((*child_ptr)->type == 1) {
out.child = *child_ptr; out.child = *child_ptr;
out.parent = *node_ptr;
} }
return out; return out;
@ -212,87 +227,114 @@ Entry *trie_search(Trie *trie, const char *key) {
* @return true if the string wasn't present in the trie and thus added, false * @return true if the string wasn't present in the trie and thus added, false
* otherwise * otherwise
*/ */
bool trie_add_internal(Trie *trie, const char *string, bool trie_add_no_lock(Trie *trie, const char *string,
Entry *entry) { Entry *entry) {
// Edge case for empty string // Edge case for empty string
if (string[0] == DELIMITER) { if (string[0] == DELIMITER) {
if (trie->root->type == 0) { if (trie->root->represents) {
trie->root->type = 1; return false;
trie->root->entry = entry; }
trie->root->represents = true;
trie->size++; trie->size++;
return true; return true;
} }
return false;
}
size_t i = 0; size_t i = 0;
uint8_t offset;
TrieNode **node_ptr = &(trie->root); TrieNode **node_ptr = &(trie->root);
TrieNode **new_node_ptr; TrieNode **child_node_ptr;
TrieNode *child_node;
do { do {
new_node_ptr = ttnode_search(*node_ptr, string[i], true); offset = 0;
child_node_ptr = tnode_search(*node_ptr, string[i], true);
// ttnode_search will only return NULL with create true if the node to look // We've reached a NULL child, so we add the remaining part of the string here
// in represents a full leaf. Therefore, we split the node and restart the if (*child_node_ptr == NULL) {
// iteration. child_node = tnode_init();
if (new_node_ptr == NULL) {
// It's possible we've ended up in the full leaf node that represents this while (offset < TRIE_MAX_SKIP_SIZE && string[i + 1 + offset] != DELIMITER) {
// string child_node->string[offset] = string[i + 1 + offset];
if (strcmp(string + i, (*node_ptr)->ptr.string) == 0) { offset++;
return false;
} }
ttnode_split(*node_ptr); child_node->string_len = offset;
*child_node_ptr = child_node;
// If the remaining part of the string is still longer than the maximum
// allowed skip length, we continue through the loop. The next iteration
// will enter this if statement again, and perform the same loop, until
// the string is fully added to the trie.
if (string[i + 1 + offset] != DELIMITER) {
node_ptr = child_node_ptr;
i += offset + 1;
continue; continue;
} }
node_ptr = new_node_ptr; child_node->represents = true;
child_node->entry = entry;
// The search function has added the character to the node
i++;
// The next node in the string's path doesn't exist yet, so we add it to the
// trie
if (*node_ptr == NULL) {
TrieNode *new_node = ttnode_init();
// If there's a remaining part of the string, we add it to the leaf
if (string[i] != DELIMITER) {
ttnode_set_string(new_node, string + i);
} else {
new_node->type = 1;
}
new_node->entry = entry;
*node_ptr = new_node;
trie->size++; trie->size++;
return true; return true;
} }
i++;
while (offset < (*child_node_ptr)->string_len) {
// String no longer aligns with edge, so we have to split
if (string[i + offset] != (*child_node_ptr)->string[offset]) {
TrieNode *split_node = tnode_init();
child_node = *child_node_ptr;
// New string of the split node is the prefix that we were able
// to skip
if (offset > 0) {
memcpy(split_node->string, child_node->string, offset);
split_node->string_len = offset;
}
// split_node replaces child_node as the child of node
*child_node_ptr = split_node;
TrieNode **new_node_ptr = tnode_search(split_node, child_node->string[offset], true);
*new_node_ptr = child_node;
// child_node has now become a child of split_node, so we update its
// string accordingly by removing the skipped prefix + the one
// character that's already stored by being a child of split_node
/* char *old_string = child_node->string.ptr; */
uint8_t new_skip_len = child_node->string_len - (offset + 1);
if (new_skip_len > 0) {
char old_string[TRIE_MAX_SKIP_SIZE];
memcpy(old_string, child_node->string + offset + 1, new_skip_len);
memcpy(child_node->string, old_string, new_skip_len);
}
child_node->string_len = new_skip_len;
// The while loop will exit either way after this has happened, as
// child_node is now split_node and split_node's len is already set to
// offset.
break;
}
offset++;
}
node_ptr = child_node_ptr;
i += offset;
} while (string[i] != DELIMITER); } while (string[i] != DELIMITER);
// If we've arrived here, we've traversed through the entire string and have if ((*child_node_ptr)->represents) {
// arrived at a node that already exists.
// The existing node is a full leaf, so we split it and make it
// represent our new string.
if ((*node_ptr)->type == 2) {
ttnode_split(*node_ptr);
}
// The string is already in the trie
else if ((*node_ptr)->type == 1) {
return false; return false;
} }
(*node_ptr)->type = 1; (*child_node_ptr)->represents = true;
(*node_ptr)->entry = entry;
trie->size++; trie->size++;
return true; return true;
} }
@ -326,7 +368,7 @@ bool trie_add_persistent(Trie *trie, const char *key,
// This function *should* always return true. Otherwise, the function would've // This function *should* always return true. Otherwise, the function would've
// exited because the string was found in the trie. // exited because the string was found in the trie.
return trie_add_internal(trie, key, entry); return trie_add_no_lock(trie, key, entry);
} }
bool trie_add(Trie *trie, const char *key, Entry *entry) { bool trie_add(Trie *trie, const char *key, Entry *entry) {
@ -381,57 +423,57 @@ char *trie_add_random(Trie *trie, Entry *entry, bool secure) {
* @param string string to remove * @param string string to remove
* @return true if the string was in the trie and thus removed, false otherwise * @return true if the string was in the trie and thus removed, false otherwise
*/ */
bool trie_remove(Trie *trie, const char *string) { /* bool trie_remove(Trie *trie, const char *string) { */
pthread_rwlock_wrlock(&trie->lock); /* pthread_rwlock_wrlock(&trie->lock); */
bool return_value = false; /* bool return_value = false; */
SearchResult res = trie_search_node(trie, string); /* SearchResult res = trie_search_node(trie, string); */
if (res.child == NULL) { /* if (res.child == NULL) { */
goto end; /* goto end; */
} /* } */
trie->size--; /* trie->size--; */
return_value = true; /* return_value = true; */
if (res.parent != NULL) { /* if (res.parent != NULL) { */
// We're removing a full leaf, so we calculate the offset of the character /* // We're removing a full leaf, so we calculate the offset of the character */
// to remove from the parent /* // to remove from the parent */
if (res.child->type == 2) { /* if (res.child->type == 2) { */
size_t str_len = strlen(string); /* size_t str_len = strlen(string); */
size_t suffix_len = strlen(res.child->ptr.string); /* size_t suffix_len = strlen(res.child->ptr.string); */
ttnode_remove(res.parent, string[str_len - suffix_len - 1]); /* tnode_remove(res.parent, string[str_len - suffix_len - 1]); */
} /* } */
// In the other case, the character to remove from the parent is the last /* // In the other case, the character to remove from the parent is the last */
// character of the string /* // character of the string */
else if (res.child->size == 0) { /* else if (res.child->size == 0) { */
size_t i = 0; /* size_t i = 0; */
while (string[i + 1] != DELIMITER) { /* while (string[i + 1] != DELIMITER) { */
i++; /* i++; */
} /* } */
ttnode_remove(res.parent, string[i]); /* tnode_remove(res.parent, string[i]); */
} else { /* } else { */
res.child->type = 0; /* res.child->type = 0; */
goto end; /* goto end; */
} /* } */
ttnode_free(res.child); /* tnode_free(res.child); */
} /* } */
// We're in the root here /* // We're in the root here */
else { /* else { */
res.child->type = 0; /* res.child->type = 0; */
} /* } */
end: /* end: */
pthread_rwlock_unlock(&trie->lock); /* pthread_rwlock_unlock(&trie->lock); */
return return_value; /* return return_value; */
} /* } */
/** /**
* Return the current size of the given trie. * Return the current size of the given trie.

View File

@ -8,10 +8,10 @@
* Represents a node of the binary tree contained within each non-leaf * Represents a node of the binary tree contained within each non-leaf
* TrieNode. * TrieNode.
*/ */
typedef struct ttinode { typedef struct tinode {
struct ttinode *left; struct tinode *left;
struct ttinode *right; struct tinode *right;
struct ttnode *next; struct tnode *next;
char key; char key;
} TrieInnerNode; } TrieInnerNode;
@ -26,25 +26,21 @@ typedef struct ttinode {
* to be stored as a single node. Its size will be zero, represents its true, * to be stored as a single node. Its size will be zero, represents its true,
* and its string pointer is initialized. * and its string pointer is initialized.
*/ */
typedef struct ttnode { typedef struct tnode {
union {
TrieInnerNode *root;
char *string;
} ptr;
Entry *entry; Entry *entry;
// What type of node this is
// 0: regular non-representing node TrieInnerNode* tree;
// 1: regular representing node uint8_t tree_size;
// 2: full leaf
uint8_t type; // Skips are at most 8 characters, and are stored in the nodes
// Dependent on type char string[TRIE_MAX_SKIP_SIZE];
// 0, 1: size of underlying binary tree uint8_t string_len: 4;
// 2: length of string
uint8_t size; bool represents : 1;
} TrieNode; } TrieNode;
// Required for recursively freeing tree structure // Required for recursively freeing tree structure
void ttnode_free(TrieNode *node); void tnode_free(TrieNode *node);
/** /**
* Allocate and initialize a new TrieInnerNode representing a given * Allocate and initialize a new TrieInnerNode representing a given
@ -53,7 +49,7 @@ void ttnode_free(TrieNode *node);
* @param c character to represent * @param c character to represent
* @return pointer to newly allocated struct * @return pointer to newly allocated struct
*/ */
TrieInnerNode *ttinode_init(char c) { TrieInnerNode *tinode_init(char c) {
TrieInnerNode *node = calloc(1, sizeof(TrieInnerNode)); TrieInnerNode *node = calloc(1, sizeof(TrieInnerNode));
node->key = c; node->key = c;
@ -65,7 +61,15 @@ TrieInnerNode *ttinode_init(char c) {
* *
* @return pointer to newly allocated struct * @return pointer to newly allocated struct
*/ */
TrieNode *ttnode_init() { return calloc(1, sizeof(TrieNode)); } TrieNode *tnode_init() {
TrieNode *node = malloc(sizeof(TrieNode));
node->tree_size = 0;
node->string_len = 0;
node->represents = false;
return node;
}
/** /**
* Free a TrieInnerNode and its underlying tree structure. This should * Free a TrieInnerNode and its underlying tree structure. This should
@ -74,17 +78,17 @@ TrieNode *ttnode_init() { return calloc(1, sizeof(TrieNode)); }
* *
* @param node node whose tree to free * @param node node whose tree to free
*/ */
void ttinode_free_cascade(TrieInnerNode *node) { void tinode_free_cascade(TrieInnerNode *node) {
if (node->left != NULL) { if (node->left != NULL) {
ttinode_free_cascade(node->left); tinode_free_cascade(node->left);
} }
if (node->right != NULL) { if (node->right != NULL) {
ttinode_free_cascade(node->right); tinode_free_cascade(node->right);
} }
if (node->next != NULL) { if (node->next != NULL) {
ttnode_free(node->next); tnode_free(node->next);
} }
free(node); free(node);
@ -95,11 +99,9 @@ void ttinode_free_cascade(TrieInnerNode *node) {
* *
* @param node node to free * @param node node to free
*/ */
void ttnode_free(TrieNode *node) { void tnode_free(TrieNode *node) {
if (node->type == 2) { if (node->tree_size > 0) {
free(node->ptr.string); tinode_free_cascade(node->tree);
} else if (node->size != 0) {
ttinode_free_cascade(node->ptr.root);
} }
// TODO properly free entry // TODO properly free entry
@ -110,18 +112,6 @@ void ttnode_free(TrieNode *node) {
free(node); free(node);
} }
/**
* Add the string to the given node & set its type accordingly.
*
* @param node node to add string to
* @param string string to add
*/
void ttnode_set_string(TrieNode *node, const char *string) {
node->type = 2;
node->size = strlen(string);
node->ptr.string = strdup(string);
}
/** /**
* This function performs a lookup in the underlying binary tree of the given * This function performs a lookup in the underlying binary tree of the given
* TrieNode. If found, the return value is a pointer to the memory * TrieNode. If found, the return value is a pointer to the memory
@ -140,26 +130,21 @@ void ttnode_set_string(TrieNode *node, const char *string) {
* node represents a leaf with a string, because the struct and therefore the * node represents a leaf with a string, because the struct and therefore the
* address is created if it doesn't exist yet. * address is created if it doesn't exist yet.
*/ */
TrieNode **ttnode_search(TrieNode *node, const char c, TrieNode **tnode_search(TrieNode *node, const char c,
bool create) { bool create) {
// Full leafs will always return NULL
if (node->type == 2) {
return NULL;
}
// It can happen that the node has no initialized root yet // It can happen that the node has no initialized root yet
if (node->size == 0) { if (node->tree_size == 0) {
if (create) { if (create) {
node->size++; node->tree_size++;
node->ptr.root = ttinode_init(c); node->tree = tinode_init(c);
return &node->ptr.root->next; return &node->tree->next;
} }
return NULL; return NULL;
} }
TrieInnerNode *parent = node->ptr.root; TrieInnerNode *parent = node->tree;
TrieInnerNode *child; TrieInnerNode *child;
// Iterate through the tree until we either find the character or realize it's // Iterate through the tree until we either find the character or realize it's
@ -186,7 +171,7 @@ TrieNode **ttnode_search(TrieNode *node, const char c,
// If create is true, we create the new node so that we can still return a // If create is true, we create the new node so that we can still return a
// non-NULL pointer. // non-NULL pointer.
if (create) { if (create) {
TrieInnerNode *new_node = ttinode_init(c); TrieInnerNode *new_node = tinode_init(c);
if (c < parent->key) { if (c < parent->key) {
parent->left = new_node; parent->left = new_node;
@ -194,7 +179,7 @@ TrieNode **ttnode_search(TrieNode *node, const char c,
parent->right = new_node; parent->right = new_node;
} }
node->size++; node->tree_size++;
return &new_node->next; return &new_node->next;
} }
@ -208,37 +193,37 @@ TrieNode **ttnode_search(TrieNode *node, const char c,
* *
* @param node node to split * @param node node to split
*/ */
void ttnode_split(TrieNode *node) { /* void tnode_split(TrieNode *node) { */
TrieNode *new_node = ttnode_init(); /* TrieNode *new_node = tnode_init(); */
char key = node->ptr.string[0]; /* char key = node->ptr.string[0]; */
// There's a chance the remaining string was only 1 character, meaning the new /* // There's a chance the remaining string was only 1 character, meaning the new */
// node doesn't have to store a string /* // node doesn't have to store a string */
if (node->ptr.string[1] != DELIMITER) { /* if (node->ptr.string[1] != DELIMITER) { */
ttnode_set_string(new_node, node->ptr.string + 1); /* tnode_set_string(new_node, node->ptr.string + 1); */
} else { /* } else { */
new_node->type = 1; /* new_node->type = 1; */
} /* } */
new_node->entry = node->entry; /* new_node->entry = node->entry; */
node->type = 0; /* node->type = 0; */
node->size = 0; /* node->size = 0; */
node->entry = NULL; /* node->entry = NULL; */
free(node->ptr.string); /* free(node->ptr.string); */
node->ptr.string = NULL; /* node->ptr.string = NULL; */
// Initialize node's binary tree with the correct character /* // Initialize node's binary tree with the correct character */
TrieNode **node_ptr = ttnode_search(node, key, true); /* TrieNode **node_ptr = tnode_search(node, key, true); */
*node_ptr = new_node; /* *node_ptr = new_node; */
} /* } */
/* /*
* Remove the given character from a TrieInnerNode's subtree. The * Remove the given character from a TrieInnerNode's subtree. The
* function assumes the character is indeed in the subtree. * function assumes the character is indeed in the subtree.
*/ */
void ttinode_remove(TrieInnerNode *node, const char c) { void tinode_remove(TrieInnerNode *node, const char c) {
TrieInnerNode **to_remove_ptr = &node; TrieInnerNode **to_remove_ptr = &node;
// We use pointers to pointers here so we can later free the removed node // We use pointers to pointers here so we can later free the removed node
@ -311,12 +296,8 @@ void ttinode_remove(TrieInnerNode *node, const char c) {
* @param node node to remove character from * @param node node to remove character from
* @param c character to remove * @param c character to remove
*/ */
void ttnode_remove(TrieNode *node, const char c) { void tnode_remove(TrieNode *node, const char c) {
ttinode_remove(node->ptr.root, c); tinode_remove(node->tree, c);
node->size--; node->tree_size--;
if (node->size == 0) {
node->ptr.root = NULL;
}
} }