// lander/trie/src/trie.c

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "trie.h"
#include "trie_node.c"

/**
 * A trie together with its bookkeeping state.
 */
typedef struct ttrie {
  // Root node of the trie; created with tnode_init() in trie_init
  TrieNode *root;
  // Number of entries stored; incremented whenever trie_add_no_lock stores a
  // new entry
  uint64_t size;
  // Heap-allocated copy of the path passed to trie_init, or NULL. When
  // non-NULL, trie_add appends every new entry to this file.
  char *file_path;
  // Read-write lock exposed through trie_rlock, trie_wlock and trie_unlock
  pthread_rwlock_t lock;
} Trie;

// Forward declaration: trie_init calls this function before its definition
// further down in this file.
TrieExitCode trie_add_no_lock(Trie *trie, const char *key, void *data);

/**
 * Allocate and initialize a Trie, optionally populating it from a file.
 *
 * If file_path is NULL, an empty in-memory trie is created. Otherwise the path
 * is remembered in the trie and the file is read as a sequence of records,
 * each consisting of a uint64_t key size, the key bytes, a uint64_t data size
 * and the data bytes (the integers are read as raw bytes, so in the byte order
 * of the host that wrote them). Every complete record is added to the trie;
 * the data buffer allocated for a record is handed over to the trie.
 *
 * Reading stops at the end of the file or at the first incomplete record; an
 * incomplete trailing record is discarded and does not count as an error.
 *
 * On any failure everything allocated here is released again and *trie_ptr is
 * left untouched.
 *
 * @param trie_ptr where to store the pointer to the new trie on success
 * @param file_path path of the file to load entries from, or NULL
 * @return Ok on success; FileError if the file could not be opened, if a
 * record announces a size that cannot be allocated, or if memory allocation
 * fails (no dedicated out-of-memory code is visible in this file); otherwise
 * the non-Ok status returned by trie_add_no_lock for a record
 */
TrieExitCode trie_init(Trie **trie_ptr, const char *file_path) {
  Trie *trie = calloc(1, sizeof(*trie));

  if (trie == NULL) {
    return FileError;
  }

  trie->root = tnode_init();

  // Defensive: tnode_init lives outside this file, so guard against it
  // reporting an allocation failure through a NULL return.
  if (trie->root == NULL) {
    free(trie);
    return FileError;
  }

  pthread_rwlock_init(&trie->lock, NULL);

  // Purely in-memory trie; calloc already left file_path set to NULL
  if (file_path == NULL) {
    *trie_ptr = trie;
    return Ok;
  }

  TrieExitCode result = FileError;
  FILE *fp = NULL;
  char *key = NULL;
  void *data = NULL;

  trie->file_path = strdup(file_path);

  if (trie->file_path == NULL) {
    goto fail;
  }

  // The file holds raw binary records, so open it in binary mode
  fp = fopen(file_path, "rb");

  if (fp == NULL) {
    goto fail;
  }

  for (;;) {
    uint64_t key_size;
    uint64_t data_size;

    // A short read here is the regular end-of-file condition
    if (fread(&key_size, sizeof(key_size), 1, fp) < 1) {
      break;
    }

    // key_size + 1 must not wrap around to a tiny allocation
    if (key_size >= SIZE_MAX) {
      goto fail;
    }

    key = malloc((size_t)key_size + 1);

    if (key == NULL) {
      goto fail;
    }

    if (fread(key, 1, (size_t)key_size, fp) < key_size) {
      break;
    }

    key[key_size] = '\0';

    if (fread(&data_size, sizeof(data_size), 1, fp) < 1) {
      break;
    }

    if (data_size > SIZE_MAX) {
      goto fail;
    }

    data = malloc((size_t)data_size);

    // malloc(0) is allowed to return NULL, which is not a failure
    if (data == NULL && data_size > 0) {
      goto fail;
    }

    if (data_size > 0 && fread(data, 1, (size_t)data_size, fp) < data_size) {
      break;
    }

    TrieExitCode status = trie_add_no_lock(trie, key, data);

    if (status != Ok) {
      result = status;
      goto fail;
    }

    // trie_add_no_lock copies the key's bytes into the nodes and never keeps
    // the key pointer, so the key buffer is ours to release. The data pointer
    // is stored in the node and therefore now belongs to the trie.
    free(key);
    key = NULL;
    data = NULL;
  }

  // Only non-NULL when the loop stopped in the middle of a record
  free(key);
  free(data);
  fclose(fp);

  *trie_ptr = trie;

  return Ok;

fail:
  free(key);
  free(data);

  if (fp != NULL) {
    fclose(fp);
  }

  // Release the path here and reset it, so this stays correct whether or not
  // trie_free releases file_path itself.
  free(trie->file_path);
  trie->file_path = NULL;
  trie_free(trie);

  return result;
}

/**
 * De-allocate a Trie: its node structure, its stored file path and its lock.
 *
 * The trie must not be used afterwards, and no thread may still hold or be
 * waiting for its lock. Passing NULL is a no-op.
 *
 * @param trie trie to free, or NULL
 */
void trie_free(Trie *trie) {
  if (trie == NULL) {
    return;
  }

  tnode_free(trie->root);

  // file_path is the strdup'd copy made by trie_init (NULL for in-memory
  // tries, which free() accepts)
  free(trie->file_path);

  // Counterpart of the pthread_rwlock_init call in trie_init
  pthread_rwlock_destroy(&trie->lock);

  free(trie);
}

/**
 * Result of trie_search_node. Both fields are NULL when the key was not found.
 */
typedef struct searchresult {
  // Node that the matching node was looked up in
  TrieNode *parent;
  // Node that stores the entry for the searched key
  TrieNode *child;
} SearchResult;

/**
 * Locate the node that stores the entry for the given key.
 *
 * The key is read up to its first DELIMITER character. This function does not
 * take the trie's lock.
 *
 * @param trie trie to look in
 * @param key key to look up
 * @return the matching node and the node it was looked up in, or a result with
 * both fields set to NULL if the key has no entry in the trie
 */
SearchResult trie_search_node(Trie *trie, const char *key) {
  SearchResult out = {NULL, NULL};

  size_t i = 0;
  TrieNode **node_ptr = &(trie->root);
  TrieNode **child_ptr;

  do {
    child_ptr = tnode_search(*node_ptr, key[i], false);

    // We don't have to check whether *child_ptr is NULL, because if it was
    // NULL, it wouldn't be in the binary tree.
    if (child_ptr == NULL) {
      return out;
    }

    i++;

    // Compare the node's skip string against the key one byte at a time and
    // stop as soon as the key ends. A plain memcmp over string_len bytes could
    // read beyond the end of a key that is shorter than the skip string. Skip
    // strings never contain DELIMITER (trie_add_no_lock stops copying at it),
    // so a key that ends inside the skip string cannot match.
    for (size_t j = 0; j < (*child_ptr)->string_len; j++) {
      if (key[i + j] == DELIMITER ||
          (unsigned char)key[i + j] != (unsigned char)(*child_ptr)->string[j]) {
        return out;
      }
    }

    i += (*child_ptr)->string_len;

    // Only descend if there's more key left, so that node_ptr ends up being
    // the parent of the final child.
    if (key[i] != DELIMITER) {
      node_ptr = child_ptr;
    }
  } while (key[i] != DELIMITER);

  // The entire key has been consumed; the node only counts as a match if it
  // actually stores an entry.
  if ((*child_ptr)->data_size > 0) {
    out.parent = *node_ptr;
    out.child = *child_ptr;
  }

  return out;
}

/**
 * Look up the data pointer stored for the given key.
 *
 * This function does not take the trie's lock itself.
 *
 * @param trie trie to look in
 * @param data_ptr on success, receives the data pointer stored for the key;
 * left untouched when the key is not found
 * @param key key to look up
 * @return Ok if the key has an entry in the trie, NotFound otherwise
 */
TrieExitCode trie_search(Trie *trie, void **data_ptr, const char *key) {
  TrieNode *match = trie_search_node(trie, key).child;

  if (match != NULL) {
    *data_ptr = match->data;

    return Ok;
  }

  return NotFound;
}

/**
 * Add the given key and its data pointer to the Trie, without taking the
 * trie's lock and without touching the backing file.
 *
 * The key is read up to its first DELIMITER character. Its bytes are copied
 * into the nodes; the data pointer is stored as-is, not copied.
 *
 * NOTE(review): the first character is consumed before any DELIMITER check, so
 * a key that starts with DELIMITER makes the loop read string[1]. Callers
 * presumably never pass an empty key -- confirm.
 *
 * @param trie trie to add the key to
 * @param string key to add
 * @param data pointer to store for the key
 * @return Ok if the key was added, AlreadyPresent if the node representing the
 * key already stores an entry
 */
TrieExitCode trie_add_no_lock(Trie *trie, const char *string, void *data) {
  // Index of the next unprocessed character of the key
  size_t i = 0;
  // Number of characters of the current node's skip string handled so far
  uint8_t offset;
  TrieNode **node_ptr = &(trie->root);
  TrieNode **child_node_ptr;
  TrieNode *child_node;

  do {
    offset = 0;
    // The result is dereferenced without a NULL check, so with `true` as last
    // argument tnode_search presumably always returns a slot for the
    // character, creating it if needed -- confirm in trie_node.c.
    child_node_ptr = tnode_search(*node_ptr, string[i], true);

    i++;

    // We've reached a NULL child, so we add the remaining part of the string
    // here
    if (*child_node_ptr == NULL) {
      child_node = tnode_init();

      // Take as many of the remaining characters as fit in one skip string
      while (offset < TRIE_MAX_SKIP_SIZE && string[i + offset] != DELIMITER) {
        offset++;
      }

      memcpy(child_node->string, string + i, offset);

      child_node->string_len = offset;
      *child_node_ptr = child_node;

      // If the remaining part of the string is still longer than the maximum
      // allowed skip length, we continue through the loop. The next iteration
      // will enter this if statement again, and perform the same loop, until
      // the string is fully added to the trie.
      if (string[i + offset] != DELIMITER) {
        node_ptr = child_node_ptr;
        i += offset;

        continue;
      }

      // NOTE: sizeof(data) is the size of the pointer, not of the payload. A
      // non-zero data_size is what marks a node as storing an entry.
      child_node->data_size = sizeof(data);
      child_node->data = data;

      trie->size++;
      return Ok;
    }

    // The child already exists, so we walk along its skip string for as long
    // as it matches the key
    while (offset < (*child_node_ptr)->string_len) {
      // String no longer aligns with edge, so we have to split. This also
      // covers the key ending in the middle of the skip string.
      if (string[i + offset] != (*child_node_ptr)->string[offset]) {
        TrieNode *split_node = tnode_init();
        child_node = *child_node_ptr;

        // New string of the split node is the prefix that we were able
        // to skip
        if (offset > 0) {
          memcpy(split_node->string, child_node->string, offset);
          split_node->string_len = offset;
        }

        // split_node replaces child_node as the child of node
        *child_node_ptr = split_node;
        TrieNode **new_node_ptr =
            tnode_search(split_node, child_node->string[offset], true);
        *new_node_ptr = child_node;

        // child_node has now become a child of split_node, so we update its
        // string accordingly by removing the skipped prefix + the one
        // character that's already stored by being a child of split_node
        uint8_t new_skip_len = child_node->string_len - (offset + 1);

        if (new_skip_len > 0) {
          // Source and destination overlap inside child_node->string, hence
          // the detour through a temporary buffer
          char old_string[TRIE_MAX_SKIP_SIZE];
          memcpy(old_string, child_node->string + offset + 1, new_skip_len);
          memcpy(child_node->string, old_string, new_skip_len);
        }

        child_node->string_len = new_skip_len;

        // The while loop will exit either way after this has happened, as
        // child_node is now split_node and split_node's len is already set to
        // offset.
        break;
      }

      offset++;
    }

    // Descend into the child (or the freshly inserted split node)
    node_ptr = child_node_ptr;

    i += offset;
  } while (string[i] != DELIMITER);

  // The key ends exactly at an existing node. If that node already stores an
  // entry, the key is a duplicate.
  if ((*child_node_ptr)->data_size > 0) {
    return AlreadyPresent;
  }

  // See the note above: data_size only serves as a "has entry" marker here
  (*child_node_ptr)->data_size = sizeof(data);
  (*child_node_ptr)->data = data;
  trie->size++;
  return Ok;
}

/**
 * Add a key with its data to the trie and, if the trie has a backing file,
 * append the entry to that file first.
 *
 * The record appended to the file consists of the key length as a uint64_t,
 * the key bytes, data_len as a uint64_t and data_len bytes read from data. The
 * data pointer itself is what gets stored in the trie.
 *
 * This function does not take the trie's lock; the existing comments assume
 * the caller already holds the write lock.
 *
 * If writing to the file fails, the entry is NOT added to the in-memory trie,
 * so that memory and file don't silently diverge. A partially written record
 * may remain at the end of the file in that case.
 *
 * @param trie trie to add the entry to
 * @param key key to add
 * @param data data to store for the key
 * @param data_len number of bytes of data to write to the backing file
 * @return Ok if the entry was added, AlreadyPresent if the key already has an
 * entry, FileError if the backing file could not be opened or written
 */
TrieExitCode trie_add(Trie *trie, const char *key, void *data,
                      uint64_t data_len) {
  if (trie->file_path != NULL) {
    // Easiest way to make sure we don't add duplicate entries
    // We use an internal function that doesn't require a read lock, as we're
    // already inside a write lock
    if (trie_search_node(trie, key).child != NULL) {
      return AlreadyPresent;
    }

    // Records are raw binary data, so append in binary mode
    FILE *fp = fopen(trie->file_path, "ab");

    if (fp == NULL) {
      return FileError;
    }

    // First we write the key, then the actual data. The chain stops at the
    // first write that comes up short.
    uint64_t key_len = (uint64_t)strlen(key);
    int write_ok = fwrite(&key_len, sizeof(key_len), 1, fp) == 1 &&
                   fwrite(key, 1, key_len, fp) == key_len &&
                   fwrite(&data_len, sizeof(data_len), 1, fp) == 1 &&
                   fwrite(data, 1, data_len, fp) == data_len;

    // fclose flushes buffered output, so a failure here also means the record
    // didn't make it to the file
    if (fclose(fp) != 0) {
      write_ok = 0;
    }

    if (!write_ok) {
      return FileError;
    }
  }

  // This function *should* always return Ok. Otherwise, the function would've
  // exited because the string was found in the trie.
  return trie_add_no_lock(trie, key, data);
}

/**
 * Add data to the trie under a randomly generated key that isn't in use yet.
 *
 * The key consists of characters drawn from charset using rand(). Its length
 * is RANDOM_KEY_LENGTH_LONG if secure is true and RANDOM_KEY_LENGTH_SHORT
 * otherwise. The entry is added through trie_add.
 *
 * @param trie trie to add the entry to
 * @param key_ptr on success, receives the heap-allocated, NUL-terminated key;
 * left untouched on failure
 * @param data data to store for the key
 * @param data_len length of data, passed on to trie_add
 * @param secure whether to generate a long key instead of a short one
 * @return the status returned by trie_add
 */
TrieExitCode trie_add_random(Trie *trie, char **key_ptr, void *data,
                             uint64_t data_len, bool secure) {
  int len = secure ? RANDOM_KEY_LENGTH_LONG : RANDOM_KEY_LENGTH_SHORT;
  char *candidate = malloc(len + 1);
  candidate[len] = '\0';

  // Keep drawing fresh candidates until one isn't present in the trie. The key
  // space is assumed to be large enough for collisions to be very unlikely, so
  // this naive retry loop rarely runs more than once.
  do {
    for (int pos = 0; pos < len; pos++) {
      candidate[pos] = charset[rand() % charset_len];
    }
  } while (trie_search_node(trie, candidate).child != NULL);

  TrieExitCode status = trie_add(trie, candidate, data, data_len);

  // The key is only handed to the caller if it actually ended up in the trie
  if (status != Ok) {
    free(candidate);

    return status;
  }

  *key_ptr = candidate;

  return status;
}

/**
 * Remove the given string from a Trie.
 *
 * @param trie trie to remove string from
 * @param string string to remove
 * @return true if the string was in the trie and thus removed, false otherwise
 */
/* bool trie_remove(Trie *trie, const char *string) { */
/*   pthread_rwlock_wrlock(&trie->lock); */

/*   bool return_value = false; */

/*   SearchResult res = trie_search_node(trie, string); */

/*   if (res.child == NULL) { */
/*     goto end; */
/*   } */

/*   trie->size--; */
/*   return_value = true; */

/*   if (res.parent != NULL) { */
/*     // We're removing a full leaf, so we calculate the offset of the
 * character */
/*     // to remove from the parent */
/*     if (res.child->type == 2) { */
/*       size_t str_len = strlen(string); */
/*       size_t suffix_len = strlen(res.child->ptr.string); */

/*       tnode_remove(res.parent, string[str_len - suffix_len - 1]); */
/*     } */
/*     // In the other case, the character to remove from the parent is the last
 */
/*     // character of the string */
/*     else if (res.child->size == 0) { */
/*       size_t i = 0; */

/*       while (string[i + 1] != DELIMITER) { */
/*         i++; */
/*       } */

/*       tnode_remove(res.parent, string[i]); */
/*     } else { */
/*       res.child->type = 0; */

/*       goto end; */
/*     } */

/*     tnode_free(res.child); */
/*   } */
/*   // We're in the root here */
/*   else { */
/*     res.child->type = 0; */
/*   } */

/* end: */
/*   pthread_rwlock_unlock(&trie->lock); */

/*   return return_value; */
/* } */

/**
 * Report how many entries the given trie currently stores.
 *
 * @param trie trie to query
 * @return value of the trie's entry counter
 */
uint64_t trie_size(Trie *trie) {
  uint64_t entry_count = trie->size;

  return entry_count;
}

/**
 * Acquire the trie's read-write lock for reading.
 *
 * @param trie trie to lock
 * @return return value of pthread_rwlock_rdlock
 */
int trie_rlock(Trie *trie) {
  pthread_rwlock_t *rwlock = &trie->lock;

  return pthread_rwlock_rdlock(rwlock);
}

/**
 * Acquire the trie's read-write lock for writing.
 *
 * @param trie trie to lock
 * @return return value of pthread_rwlock_wrlock
 */
int trie_wlock(Trie *trie) {
  pthread_rwlock_t *rwlock = &trie->lock;

  return pthread_rwlock_wrlock(rwlock);
}

/**
 * Release the trie's read-write lock.
 *
 * @param trie trie to unlock
 * @return return value of pthread_rwlock_unlock
 */
int trie_unlock(Trie *trie) {
  pthread_rwlock_t *rwlock = &trie->lock;

  return pthread_rwlock_unlock(rwlock);
}