diff --git a/TRIE.md b/TRIE.md new file mode 100644 index 0000000..a9f0802 --- /dev/null +++ b/TRIE.md @@ -0,0 +1,16 @@ +# Trie design + +The underlying data structure is based on a combination of a ternary and a +Patricia trie. + +* Nodes are classic ternary trie nodes, meaning each node contains a binary + search tree +* Each node can define a skip, like a Patricia trie, of at most 8 characters. + These skipped characters are stored directly in the structs defining the + nodes. +* While the add function relies on the fact that the input is a NUL-terminated + C string, the trie itself does not store any NUL bytes. + +The goal of this data structure is to be as optimized as possible for search +operations with short (usually < 8 characters) keys, as this is by far the most +common operation for a URL shortener/pastebin. diff --git a/src/main.cpp b/src/main.cpp index 1ece188..3e98a0b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -31,8 +31,8 @@ static const std::string index_page = R"( return crow::response(crow::status::UNAUTHORIZED); \ } -crow::response add_redirect(std::string base_url, Trie *trie, - const char *url, bool secure) { +crow::response add_redirect(std::string base_url, Trie *trie, const char *url, + bool secure) { Entry *new_entry = entry_new(Redirect, url); char *key = trie_add_random(trie, new_entry, secure); @@ -61,8 +61,8 @@ bool store_paste(const char *key, const char *body) { return true; } -crow::response add_paste(std::string base_url, Trie *trie, - const char *body, bool secure) { +crow::response add_paste(std::string base_url, Trie *trie, const char *body, + bool secure) { Entry *new_entry = entry_new(Paste, ""); char *key = trie_add_random(trie, new_entry, secure); diff --git a/src/trie.c b/src/trie.c index e85e9dc..df426c3 100644 --- a/src/trie.c +++ b/src/trie.c @@ -218,8 +218,7 @@ Entry *trie_search(Trie *trie, const char *key) { * @return true if the string wasn't present in the trie and thus added, false * otherwise */ -bool 
trie_add_no_lock(Trie *trie, const char *string, - Entry *entry) { +bool trie_add_no_lock(Trie *trie, const char *string, Entry *entry) { size_t i = 0; uint8_t offset; TrieNode **node_ptr = &(trie->root); @@ -230,11 +229,13 @@ bool trie_add_no_lock(Trie *trie, const char *string, offset = 0; child_node_ptr = tnode_search(*node_ptr, string[i], true); - // We've reached a NULL child, so we add the remaining part of the string here + // We've reached a NULL child, so we add the remaining part of the string + // here if (*child_node_ptr == NULL) { child_node = tnode_init(); - while (offset < TRIE_MAX_SKIP_SIZE && string[i + 1 + offset] != DELIMITER) { + while (offset < TRIE_MAX_SKIP_SIZE && + string[i + 1 + offset] != DELIMITER) { child_node->string[offset] = string[i + 1 + offset]; offset++; } @@ -277,7 +278,8 @@ bool trie_add_no_lock(Trie *trie, const char *string, // split_node replaces child_node as the child of node *child_node_ptr = split_node; - TrieNode **new_node_ptr = tnode_search(split_node, child_node->string[offset], true); + TrieNode **new_node_ptr = + tnode_search(split_node, child_node->string[offset], true); *new_node_ptr = child_node; // child_node has now become a child of split_node, so we update its @@ -317,8 +319,7 @@ bool trie_add_no_lock(Trie *trie, const char *string, return true; } -bool trie_add_persistent(Trie *trie, const char *key, - Entry *entry) { +bool trie_add_persistent(Trie *trie, const char *key, Entry *entry) { bool return_value = false; if (trie->file_path != NULL) { @@ -417,7 +418,8 @@ char *trie_add_random(Trie *trie, Entry *entry, bool secure) { /* return_value = true; */ /* if (res.parent != NULL) { */ -/* // We're removing a full leaf, so we calculate the offset of the character */ +/* // We're removing a full leaf, so we calculate the offset of the + * character */ /* // to remove from the parent */ /* if (res.child->type == 2) { */ /* size_t str_len = strlen(string); */ @@ -425,7 +427,8 @@ char *trie_add_random(Trie *trie, 
Entry *entry, bool secure) { /* tnode_remove(res.parent, string[str_len - suffix_len - 1]); */ /* } */ -/* // In the other case, the character to remove from the parent is the last */ +/* // In the other case, the character to remove from the parent is the last + */ /* // character of the string */ /* else if (res.child->size == 0) { */ /* size_t i = 0; */ diff --git a/src/trie_node.c b/src/trie_node.c index 1dbcb8f..7d70ce9 100644 --- a/src/trie_node.c +++ b/src/trie_node.c @@ -29,12 +29,12 @@ typedef struct tinode { typedef struct tnode { Entry *entry; - TrieInnerNode* tree; - uint8_t tree_size; + TrieInnerNode *tree; + uint8_t tree_size; // Skips are at most 8 characters, and are stored in the nodes char string[TRIE_MAX_SKIP_SIZE]; - uint8_t string_len: 4; + uint8_t string_len : 4; bool represents : 1; } TrieNode; @@ -130,8 +130,7 @@ void tnode_free(TrieNode *node) { * node represents a leaf with a string, because the struct and therefore the * address is created if it doesn't exist yet. */ -TrieNode **tnode_search(TrieNode *node, const char c, - bool create) { +TrieNode **tnode_search(TrieNode *node, const char c, bool create) { // It can happen that the node has no initialized root yet if (node->tree_size == 0) { if (create) { @@ -197,7 +196,8 @@ TrieNode **tnode_search(TrieNode *node, const char c, /* TrieNode *new_node = tnode_init(); */ /* char key = node->ptr.string[0]; */ -/* // There's a chance the remaining string was only 1 character, meaning the new */ +/* // There's a chance the remaining string was only 1 character, meaning the + * new */ /* // node doesn't have to store a string */ /* if (node->ptr.string[1] != DELIMITER) { */ /* tnode_set_string(new_node, node->ptr.string + 1); */