lander/trie/src/trie.c

412 lines
10 KiB
C
Raw Normal View History

#include <pthread.h>
2022-11-15 16:21:27 +01:00
#include <stdint.h>
#include <stdio.h>
2022-11-15 16:21:27 +01:00
#include <stdlib.h>
#include <string.h>
2022-11-21 12:03:16 +01:00
2022-11-29 11:27:28 +01:00
#include "trie.h"
#include "trie_entry.c"
2022-11-29 11:27:28 +01:00
#include "trie_node.c"
2022-11-15 16:21:27 +01:00
typedef struct ttrie {
2022-11-29 11:27:28 +01:00
TrieNode *root;
2022-12-07 23:20:39 +01:00
uint64_t size;
char *file_path;
2022-11-21 12:03:16 +01:00
pthread_rwlock_t lock;
2022-11-29 11:27:28 +01:00
} Trie;
2022-11-15 16:21:27 +01:00
2022-12-07 23:20:39 +01:00
/* Forward declaration: trie_init() calls this before its definition, which
 * appears further down in this file. */
TrieExitCode trie_add_no_lock(Trie *trie, const char *key, void *data);
2022-11-15 16:21:27 +01:00
/**
 * Allocate and initialize a Trie, optionally loading its contents from a file.
 *
 * When file_path is non-NULL the file is read line by line. Each line is
 * expected to consist of a non-empty key, one space, one type character, one
 * separator character and a payload running up to the newline. Lines that do
 * not have this shape (no space, empty key, missing type/separator) and lines
 * that do not fit in the read buffer are skipped instead of being parsed.
 *
 * @param trie_ptr receives the new trie on success; left untouched on failure
 * @param file_path path of the file to load, or NULL for an empty trie
 * @return Ok on success, FileError if the file cannot be opened, otherwise the
 * first non-Ok status returned by trie_add_no_lock() while loading
 */
TrieExitCode trie_init(Trie **trie_ptr, const char *file_path) {
  // Allocate & initialize trie
  // NOTE(review): allocation results are not checked; no out-of-memory exit
  // code is visible in this file to report such a failure with.
  Trie *trie = calloc(1, sizeof(Trie));
  trie->root = tnode_init();
  pthread_rwlock_init(&trie->lock, NULL);

  if (file_path == NULL) {
    trie->file_path = NULL;
    *trie_ptr = trie;

    return Ok;
  }

  trie->file_path = strdup(file_path);

  // Populate trie with data from file
  FILE *fp = fopen(file_path, "r");

  if (fp == NULL) {
    // The trie is never handed to the caller on this path, so release it here.
    // file_path is cleared after freeing so trie_free() cannot free it again.
    free(trie->file_path);
    trie->file_path = NULL;
    trie_free(trie);

    return FileError;
  }

  // We read in lines of at most 8192 characters (sounds like enough)
  char buffer[8192];
  TrieExitCode status = Ok;

  while (fgets(buffer, sizeof buffer, fp) != NULL) {
    size_t len = strcspn(buffer, "\n");

    // A full buffer without a newline means the line was longer than the
    // buffer: discard the rest of it so the tail is not parsed as a new line.
    if (buffer[len] != '\n' && len == sizeof buffer - 1) {
      int c;

      while ((c = fgetc(fp)) != EOF && c != '\n') {
      }

      continue;
    }

    // Remove the newline character (no-op for an unterminated last line)
    buffer[len] = '\0';

    // Split the buffer into two strings, the key and the payload
    char *sep = strchr(buffer, ' ');

    // Require a non-empty key followed by a type character and a separator,
    // so that sep + 3 below never points past the terminator.
    if (sep == NULL || sep == buffer || sep[1] == '\0' || sep[2] == '\0') {
      continue;
    }

    *sep = '\0';

    EntryType type = entry_type_from_char(sep[1]);
    // Skip type character & its surrounding spaces
    Entry *entry = entry_new(type, sep + 3);

    status = trie_add_no_lock(trie, buffer, entry);

    if (status != Ok) {
      // NOTE(review): the rejected entry is not released here; no entry
      // destructor is visible in this file.
      break;
    }
  }

  fclose(fp);

  if (status != Ok) {
    free(trie->file_path);
    trie->file_path = NULL;
    trie_free(trie);

    return status;
  }

  *trie_ptr = trie;

  return Ok;
}
/**
 * De-allocate a Trie: its node structure, its stored file path, its lock and
 * the Trie object itself.
 *
 * The lock must not be held by any thread when this is called. Passing NULL is
 * a no-op.
 *
 * @param trie trie to free
 */
void trie_free(Trie *trie) {
  if (trie == NULL) {
    return;
  }

  tnode_free(trie->root);
  // The lock is initialized in trie_init(), so it is destroyed here.
  pthread_rwlock_destroy(&trie->lock);
  // file_path is either NULL or a strdup() copy owned by the trie.
  free(trie->file_path);
  free(trie);
}
2022-11-15 16:21:27 +01:00
typedef struct searchresult {
2022-11-29 11:27:28 +01:00
TrieNode *parent;
TrieNode *child;
2022-11-15 16:21:27 +01:00
} SearchResult;
2022-11-29 11:27:28 +01:00
/**
 * Locate the node that stores data for the given key.
 *
 * The key is walked one branching character at a time; after each branch the
 * child's skipped string must match the following characters of the key.
 * The comparison stops at the key's DELIMITER, so a key shorter than a node's
 * skipped string is reported as not found without reading past its end.
 *
 * @param trie trie to look in
 * @param key DELIMITER-terminated key to look up
 * @return parent and child node of the match, or {NULL, NULL} if the key is
 * empty, not present, or ends on a node that holds no data
 */
SearchResult trie_search_node(Trie *trie, const char *key) {
  SearchResult out = {NULL, NULL};

  // An empty key has no first character to branch on; without this guard the
  // loop below would step past the terminator.
  if (key[0] == DELIMITER) {
    return out;
  }

  size_t i = 0;
  TrieNode **node_ptr = &(trie->root);
  TrieNode **child_ptr;

  do {
    child_ptr = tnode_search(*node_ptr, key[i], false);

    if (child_ptr == NULL || *child_ptr == NULL) {
      return out;
    }

    i++;

    // Compare the skipped string character by character, bailing out as soon
    // as the key ends or differs.
    size_t skip_len = (*child_ptr)->string_len;

    for (size_t k = 0; k < skip_len; k++) {
      if (key[i + k] == DELIMITER || key[i + k] != (*child_ptr)->string[k]) {
        return out;
      }
    }

    i += skip_len;

    // Only descend when there is more key left, so node_ptr still refers to
    // the parent of *child_ptr once the loop ends.
    if (key[i] != DELIMITER) {
      node_ptr = child_ptr;
    }
  } while (key[i] != DELIMITER);

  // The whole key has been consumed; it is present only if the node we ended
  // on actually stores data.
  if ((*child_ptr)->data_size > 0) {
    out.parent = *node_ptr;
    out.child = *child_ptr;
  }

  return out;
}
/**
* Returns whether the given string is present in the trie.
*
* @param trie trie to look in
* @param string string to look up
* @return true if the string is present in the trie, false otherwise
*/
2022-12-07 23:20:39 +01:00
TrieExitCode trie_search(Trie *trie, void **data_ptr, const char *key) {
2022-11-29 11:27:28 +01:00
SearchResult res = trie_search_node(trie, key);
if (res.child == NULL) {
return NotFound;
}
2022-12-07 23:20:39 +01:00
*data_ptr = res.child->data;
return Ok;
2022-11-15 16:21:27 +01:00
}
/**
2022-11-29 11:27:28 +01:00
* Add the given string to the Trie.
2022-11-15 16:21:27 +01:00
*
* @param trie trie to add string to
* @param string string to add
* @return true if the string wasn't present in the trie and thus added, false
* otherwise
*/
2022-12-07 23:20:39 +01:00
TrieExitCode trie_add_no_lock(Trie *trie, const char *string, void *data) {
2022-11-15 16:21:27 +01:00
size_t i = 0;
uint8_t offset;
2022-11-29 11:27:28 +01:00
TrieNode **node_ptr = &(trie->root);
TrieNode **child_node_ptr;
TrieNode *child_node;
2022-11-15 16:21:27 +01:00
do {
offset = 0;
child_node_ptr = tnode_search(*node_ptr, string[i], true);
2022-11-15 16:21:27 +01:00
2022-12-03 13:27:34 +01:00
i++;
// We've reached a NULL child, so we add the remaining part of the string
// here
if (*child_node_ptr == NULL) {
child_node = tnode_init();
2022-11-15 16:21:27 +01:00
while (offset < TRIE_MAX_SKIP_SIZE && string[i + offset] != DELIMITER) {
offset++;
}
2022-11-15 16:21:27 +01:00
2022-12-03 13:27:34 +01:00
memcpy(child_node->string, string + i, offset);
child_node->string_len = offset;
*child_node_ptr = child_node;
2022-11-15 16:21:27 +01:00
// If the remaining part of the string is still longer than the maximum
// allowed skip length, we continue through the loop. The next iteration
// will enter this if statement again, and perform the same loop, until
// the string is fully added to the trie.
2022-12-03 13:27:34 +01:00
if (string[i + offset] != DELIMITER) {
node_ptr = child_node_ptr;
2022-12-03 13:27:34 +01:00
i += offset;
2022-11-15 16:21:27 +01:00
continue;
2022-11-15 16:21:27 +01:00
}
2022-12-07 23:20:39 +01:00
child_node->data_size = sizeof(data);
child_node->data = data;
2022-11-15 16:21:27 +01:00
trie->size++;
return Ok;
2022-11-15 16:21:27 +01:00
}
while (offset < (*child_node_ptr)->string_len) {
// String no longer aligns with edge, so we have to split
if (string[i + offset] != (*child_node_ptr)->string[offset]) {
TrieNode *split_node = tnode_init();
child_node = *child_node_ptr;
// New string of the split node is the prefix that we were able
// to skip
if (offset > 0) {
memcpy(split_node->string, child_node->string, offset);
split_node->string_len = offset;
}
// split_node replaces child_node as the child of node
*child_node_ptr = split_node;
TrieNode **new_node_ptr =
tnode_search(split_node, child_node->string[offset], true);
*new_node_ptr = child_node;
// child_node has now become a child of split_node, so we update its
// string accordingely by removing the skipped prefix + the one
// character that's already stored by being a child of split_node
/* char *old_string = child_node->string.ptr; */
uint8_t new_skip_len = child_node->string_len - (offset + 1);
if (new_skip_len > 0) {
char old_string[TRIE_MAX_SKIP_SIZE];
memcpy(old_string, child_node->string + offset + 1, new_skip_len);
memcpy(child_node->string, old_string, new_skip_len);
}
child_node->string_len = new_skip_len;
// The while loop will exit either way after this has happened, as
// child_node is now split_node and split_node's len is already set to
// offset.
break;
}
2022-11-15 16:21:27 +01:00
offset++;
}
node_ptr = child_node_ptr;
i += offset;
} while (string[i] != DELIMITER);
2022-12-07 23:20:39 +01:00
if ((*child_node_ptr)->data_size > 0) {
return AlreadyPresent;
2022-11-15 16:21:27 +01:00
}
2022-12-07 23:20:39 +01:00
(*child_node_ptr)->data_size = sizeof(data);
(*child_node_ptr)->data = data;
2022-11-15 16:21:27 +01:00
trie->size++;
return Ok;
2022-11-15 16:21:27 +01:00
}
2022-12-07 23:20:39 +01:00
/**
 * Add a key and its data to the trie.
 *
 * This function takes no lock itself; the original author's note states it is
 * meant to be called while the write lock is already held.
 *
 * When the trie has a backing file, the key is first checked for presence and
 * the file is opened in append mode. NOTE: nothing is written to the file at
 * the moment; it is only opened and closed again.
 *
 * @param trie trie to add to
 * @param key key to add
 * @param entry data pointer to store for the key
 * @return Ok on success, AlreadyPresent if the key exists, FileError if the
 * backing file cannot be opened for appending
 */
TrieExitCode trie_add(Trie *trie, const char *key, void *entry) {
  // No backing file: the in-memory insert is all there is to do
  if (trie->file_path == NULL) {
    return trie_add_no_lock(trie, key, entry);
  }

  // Easiest way to make sure we don't add duplicate entries. The internal
  // search function is used because it doesn't acquire a lock.
  SearchResult existing = trie_search_node(trie, key);

  if (existing.child != NULL) {
    return AlreadyPresent;
  }

  FILE *fp = fopen(trie->file_path, "a");

  if (fp == NULL) {
    return FileError;
  }

  fclose(fp);

  // This call *should* always return Ok, as the duplicate case was already
  // ruled out above.
  return trie_add_no_lock(trie, key, entry);
}
2022-12-07 23:20:39 +01:00
/**
 * Store data under a freshly generated random key.
 *
 * Candidate keys are drawn from charset with rand() until one is found that
 * is not yet in the trie; that key is then inserted with trie_add().
 *
 * @param trie trie to add to
 * @param key_ptr on Ok, receives the malloc()-allocated key; not written on
 * failure, in which case the key is freed here
 * @param data data pointer to store for the key
 * @param secure selects RANDOM_KEY_LENGTH_LONG instead of
 * RANDOM_KEY_LENGTH_SHORT as the key length
 * @return the status returned by trie_add()
 */
TrieExitCode trie_add_random(Trie *trie, char **key_ptr, void *data,
                             bool secure) {
  int key_length = RANDOM_KEY_LENGTH_SHORT;

  if (secure) {
    key_length = RANDOM_KEY_LENGTH_LONG;
  }

  char *key = malloc(key_length + 1);
  key[key_length] = '\0';

  // Naively regenerate the key until it isn't in the trie yet. With enough
  // possible keys, collisions are rare and this loop almost never repeats.
  do {
    for (int pos = 0; pos < key_length; pos++) {
      key[pos] = charset[rand() % charset_len];
    }
  } while (trie_search_node(trie, key).child != NULL);

  TrieExitCode return_value = trie_add(trie, key, data);

  if (return_value != Ok) {
    free(key);
    return return_value;
  }

  *key_ptr = key;

  return return_value;
}
2022-11-15 16:21:27 +01:00
/**
2022-11-29 11:27:28 +01:00
* Remove the given string from a Trie.
2022-11-15 16:21:27 +01:00
*
* @param trie trie to remove string from
* @param string string to remove
* @return true if the string was in the trie and thus removed, false otherwise
*/
/* bool trie_remove(Trie *trie, const char *string) { */
/* pthread_rwlock_wrlock(&trie->lock); */
2022-11-21 12:03:16 +01:00
/* bool return_value = false; */
2022-11-21 12:03:16 +01:00
/* SearchResult res = trie_search_node(trie, string); */
2022-11-15 16:21:27 +01:00
/* if (res.child == NULL) { */
/* goto end; */
/* } */
2022-11-15 16:21:27 +01:00
/* trie->size--; */
/* return_value = true; */
2022-11-15 16:21:27 +01:00
/* if (res.parent != NULL) { */
/* // We're removing a full leaf, so we calculate the offset of the
* character */
/* // to remove from the parent */
/* if (res.child->type == 2) { */
/* size_t str_len = strlen(string); */
/* size_t suffix_len = strlen(res.child->ptr.string); */
2022-11-15 16:21:27 +01:00
/* tnode_remove(res.parent, string[str_len - suffix_len - 1]); */
/* } */
/* // In the other case, the character to remove from the parent is the last
*/
/* // character of the string */
/* else if (res.child->size == 0) { */
/* size_t i = 0; */
2022-11-15 16:21:27 +01:00
/* while (string[i + 1] != DELIMITER) { */
/* i++; */
/* } */
2022-11-15 16:21:27 +01:00
/* tnode_remove(res.parent, string[i]); */
/* } else { */
/* res.child->type = 0; */
2022-11-15 16:21:27 +01:00
/* goto end; */
/* } */
2022-11-15 16:21:27 +01:00
/* tnode_free(res.child); */
/* } */
/* // We're in the root here */
/* else { */
/* res.child->type = 0; */
/* } */
2022-11-15 16:21:27 +01:00
/* end: */
/* pthread_rwlock_unlock(&trie->lock); */
2022-11-21 12:03:16 +01:00
/* return return_value; */
/* } */
2022-11-15 16:21:27 +01:00
/**
* Return the current size of the given trie.
*
* @param trie trie to return size for
* @return size of the trie
*/
2022-12-07 23:20:39 +01:00
uint64_t trie_size(Trie *trie) { return trie->size; }
/**
 * Acquire the trie's lock for reading.
 *
 * @param trie trie to lock
 * @return result of pthread_rwlock_rdlock()
 */
int trie_rlock(Trie *trie) {
  pthread_rwlock_t *rw = &trie->lock;

  return pthread_rwlock_rdlock(rw);
}
/**
 * Acquire the trie's lock for writing.
 *
 * @param trie trie to lock
 * @return result of pthread_rwlock_wrlock()
 */
int trie_wlock(Trie *trie) {
  pthread_rwlock_t *rw = &trie->lock;

  return pthread_rwlock_wrlock(rw);
}
/**
 * Release the trie's lock, whether it was taken for reading or writing.
 *
 * @param trie trie to unlock
 * @return result of pthread_rwlock_unlock()
 */
int trie_unlock(Trie *trie) {
  pthread_rwlock_t *rw = &trie->lock;

  return pthread_rwlock_unlock(rw);
}