// lander/trie/src/trie.c

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "trie.h"
#include "trie_node.c"

/**
 * A trie together with the bookkeeping needed to persist and share it.
 */
typedef struct ttrie {
  // Root node of the trie; allocated with tnode_init() in trie_init()
  TrieNode *root;
  // Number of keys stored; incremented on every successful addition
  size_t size;
  // Copy of the path given to trie_populate(); NULL until that call. When
  // set, trie_add_persistent() appends each new entry to this file.
  char *file_path;
  // Taken as a read lock by trie_search() and as a write lock by trie_add()
  // and trie_add_random()
  pthread_rwlock_t lock;
} Trie;

/**
 * Allocate and initialize an empty Trie.
 *
 * The trie starts out zeroed (size 0, no backing file), with a fresh root
 * node and an initialized read-write lock.
 *
 * @return pointer to the empty Trie, or NULL if any part of the
 * initialization failed
 */
Trie *trie_init() {
  Trie *trie = calloc(1, sizeof *trie);

  if (trie == NULL) {
    return NULL;
  }

  trie->root = tnode_init();

  if (trie->root == NULL) {
    free(trie);
    return NULL;
  }

  // Undo the partial construction if the lock can't be set up, so the caller
  // never receives a trie whose lock is unusable.
  if (pthread_rwlock_init(&trie->lock, NULL) != 0) {
    tnode_free(trie->root);
    free(trie);
    return NULL;
  }

  return trie;
}

/**
 * De-allocate a Trie by freeing its entire underlying structure.
 *
 * Besides the node structure this also releases the stored backing-file path
 * and destroys the read-write lock. No other thread may be using the trie
 * when this is called.
 *
 * @param trie trie to free; NULL is accepted and ignored
 */
void trie_free(Trie *trie) {
  if (trie == NULL) {
    return;
  }

  tnode_free(trie->root);

  // file_path is a strdup'd copy owned by the trie (NULL if never populated)
  free(trie->file_path);
  pthread_rwlock_destroy(&trie->lock);

  free(trie);
}

// Forward declaration: trie_populate() below calls this function before its
// definition further down in this file.
bool trie_add_no_lock(Trie *trie, const char *key, Entry *entry);

/**
 * Decode the single-character type tag used in the backing file.
 *
 * @param c tag character
 * @return Redirect for '0', Paste for '1', Unknown for anything else
 */
EntryType entry_type_from_char(char c) {
  if (c == '0') {
    return Redirect;
  }

  if (c == '1') {
    return Paste;
  }

  return Unknown;
}

/**
 * Encode an entry type as the single-character tag used in the backing file.
 *
 * @param et entry type to encode
 * @return '0' for Redirect, '1' for Paste, '\0' for any other value
 */
char entry_type_to_char(EntryType et) {
  if (et == Redirect) {
    return '0';
  }

  if (et == Paste) {
    return '1';
  }

  return '\0';
}

/**
 * Allocate a new Entry holding its own copy of the given string.
 *
 * @param type type of the entry
 * @param string payload to copy into the entry; may be NULL, in which case
 * the entry's string is NULL as well
 * @return newly allocated entry, or NULL if memory could not be allocated
 */
Entry *entry_new(EntryType type, const char *string) {
  Entry *entry = malloc(sizeof *entry);

  if (entry == NULL) {
    return NULL;
  }

  entry->type = type;
  entry->string = NULL;

  if (string != NULL) {
    entry->string = strdup(string);

    // Don't hand back an entry that silently lost its payload
    if (entry->string == NULL) {
      free(entry);
      return NULL;
    }
  }

  return entry;
}

/**
 * Read one line from fp into buffer, without its trailing newline.
 *
 * Reading character by character gives the exact line length, so the caller
 * can bounds-check its parsing even when the line contains NUL bytes.
 *
 * @param fp stream to read from
 * @param buffer destination; always NUL-terminated on return
 * @param capacity size of buffer in bytes
 * @param length set to the number of bytes stored in buffer
 * @param truncated set to true if the line didn't fit; the excess is dropped
 * @return false if the end of the file was reached without reading anything,
 * true otherwise
 */
static bool trie_read_line(FILE *fp, char *buffer, size_t capacity,
                           size_t *length, bool *truncated) {
  size_t len = 0;
  int c;

  *truncated = false;

  while ((c = fgetc(fp)) != EOF && c != '\n') {
    if (len + 1 < capacity) {
      buffer[len++] = (char)c;
    } else {
      *truncated = true;
    }
  }

  buffer[len] = '\0';
  *length = len;

  return !(c == EOF && len == 0 && !*truncated);
}

/**
 * Load entries from the given file into the trie and remember the file as
 * the trie's backing file.
 *
 * Each line is expected to look like "<key> <type char> <payload>". Lines
 * that are too long for the read buffer, have an empty key or are missing
 * any of the three parts are skipped. No lock is taken.
 *
 * The path is stored even if the file can't be opened, so that entries added
 * afterwards are still appended to it.
 *
 * @param trie trie to populate
 * @param file_path path to the file to read
 * @return number of entries added to the trie, or -1 if the file couldn't be
 * opened or memory ran out
 */
int trie_populate(Trie *trie, const char *file_path) {
  char *path_copy = strdup(file_path);

  if (path_copy == NULL) {
    return -1;
  }

  // Replace (rather than leak) a path stored by an earlier call
  free(trie->file_path);
  trie->file_path = path_copy;

  FILE *fp = fopen(file_path, "r");

  if (fp == NULL) {
    return -1;
  }

  // We read in lines of at most 8191 characters (sounds like enough)
  char buffer[8192];
  size_t length;
  bool truncated;
  int entries = 0;

  while (trie_read_line(fp, buffer, sizeof buffer, &length, &truncated)) {
    if (truncated) {
      continue;
    }

    // The key runs up to the first space character
    char *separator = memchr(buffer, ' ', length);

    if (separator == NULL || separator == buffer) {
      continue;
    }

    // After the key we need at least the space, the type character and the
    // character separating it from the payload.
    size_t key_len = (size_t)(separator - buffer);

    if (key_len + 3 > length) {
      continue;
    }

    // Split the buffer into two strings, the key and the payload
    *separator = '\0';

    Entry *entry =
        entry_new(entry_type_from_char(separator[1]), separator + 3);

    if (entry == NULL) {
      entries = -1;
      break;
    }

    if (trie_add_no_lock(trie, buffer, entry)) {
      entries++;
    } else {
      // The trie didn't take ownership of the entry (e.g. duplicate key)
      free(entry->string);
      free(entry);
    }
  }

  fclose(fp);

  return entries;
}

/**
 * Result of trie_search_node(). Both fields are NULL when the key wasn't
 * found.
 */
typedef struct searchresult {
  // Node whose child lookup yielded `child`
  TrieNode *parent;
  // Node that represents the searched key
  TrieNode *child;
} SearchResult;

/**
 * Look up the node representing the given key, without taking any lock.
 *
 * @param trie trie to look in
 * @param key key to look up, terminated by DELIMITER
 * @return the node representing the key together with the node it was
 * reached from, or {NULL, NULL} if the key isn't in the trie
 */
SearchResult trie_search_node(Trie *trie, const char *key) {
  SearchResult out = {NULL, NULL};

  // An empty key has no first character to branch on; indexing past its
  // terminator below would read out of bounds.
  if (key[0] == DELIMITER) {
    return out;
  }

  size_t i = 0;
  TrieNode **node_ptr = &(trie->root);
  TrieNode **child_ptr;

  do {
    child_ptr = tnode_search(*node_ptr, key[i], false);

    // We don't have to check whether *node_ptr is NULL, because if it was
    // NULL, it wouldn't be in the binary tree.
    if (child_ptr == NULL) {
      return out;
    }

    i++;

    // Compare the key against the characters skipped along this edge. This is
    // done per character so we stop at the key's terminator instead of
    // reading past it when the key ends in the middle of an edge.
    for (size_t offset = 0; offset < (*child_ptr)->string_len; offset++) {
      if (key[i + offset] == DELIMITER ||
          key[i + offset] != (*child_ptr)->string[offset]) {
        return out;
      }
    }

    i += (*child_ptr)->string_len;

    if (key[i] != DELIMITER) {
      node_ptr = child_ptr;
    }
  } while (key[i] != DELIMITER);

  // We've traversed the entire key, so all that's left is to check whether
  // the node we ended up in actually represents a key.
  if ((*child_ptr)->represents) {
    out.parent = *node_ptr;
    out.child = *child_ptr;
  }

  return out;
}

/**
 * Look up the entry stored for the given key.
 *
 * The lookup runs while holding the trie's read lock.
 *
 * @param trie trie to look in
 * @param key key to look up
 * @return the entry stored at the key's node, or NULL if the key isn't
 * present in the trie
 */
Entry *trie_search(Trie *trie, const char *key) {
  pthread_rwlock_rdlock(&trie->lock);

  TrieNode *match = trie_search_node(trie, key).child;
  Entry *found = (match != NULL) ? match->entry : NULL;

  pthread_rwlock_unlock(&trie->lock);

  return found;
}

/**
 * Add the given key to the Trie without taking the trie's lock.
 *
 * The entry pointer itself is stored in the key's node; no copy is made.
 *
 * @param trie trie to add the key to
 * @param string key to add, terminated by DELIMITER
 * @param entry entry to associate with the key
 * @return true if the key wasn't present in the trie and thus added, false
 * if it was already present or is empty
 */
bool trie_add_no_lock(Trie *trie, const char *string, Entry *entry) {
  // An empty key has no first character to branch on; the loop below would
  // read past its terminator.
  if (string[0] == DELIMITER) {
    return false;
  }

  size_t i = 0;
  uint8_t offset;
  TrieNode **node_ptr = &(trie->root);
  TrieNode **child_node_ptr;
  TrieNode *child_node;

  do {
    offset = 0;
    child_node_ptr = tnode_search(*node_ptr, string[i], true);

    i++;

    // We've reached a NULL child, so we add the remaining part of the string
    // here
    if (*child_node_ptr == NULL) {
      child_node = tnode_init();

      while (offset < TRIE_MAX_SKIP_SIZE &&
             string[i + offset] != DELIMITER) {
        offset++;
      }

      memcpy(child_node->string, string + i, offset);

      child_node->string_len = offset;
      *child_node_ptr = child_node;

      // If the remaining part of the string is still longer than the maximum
      // allowed skip length, we continue through the loop. The next iteration
      // will enter this if statement again, and perform the same loop, until
      // the string is fully added to the trie.
      if (string[i + offset] != DELIMITER) {
        node_ptr = child_node_ptr;
        i += offset;

        continue;
      }

      child_node->represents = true;
      child_node->entry = entry;

      trie->size++;
      return true;
    }

    while (offset < (*child_node_ptr)->string_len) {
      // String no longer aligns with edge, so we have to split. This also
      // covers the key ending in the middle of the edge, as the terminator
      // never matches an edge character.
      if (string[i + offset] != (*child_node_ptr)->string[offset]) {
        TrieNode *split_node = tnode_init();
        child_node = *child_node_ptr;

        // New string of the split node is the prefix that we were able
        // to skip
        if (offset > 0) {
          memcpy(split_node->string, child_node->string, offset);
          split_node->string_len = offset;
        }

        // split_node replaces child_node as the child of node
        *child_node_ptr = split_node;
        TrieNode **new_node_ptr =
            tnode_search(split_node, child_node->string[offset], true);
        *new_node_ptr = child_node;

        // child_node has now become a child of split_node, so we update its
        // string accordingly by removing the skipped prefix + the one
        // character that's already stored by being a child of split_node.
        // Source and destination overlap, hence memmove.
        uint8_t new_skip_len = child_node->string_len - (offset + 1);

        memmove(child_node->string, child_node->string + offset + 1,
                new_skip_len);
        child_node->string_len = new_skip_len;

        // The while loop will exit either way after this has happened, as
        // child_node is now split_node and split_node's len is already set to
        // offset.
        break;
      }

      offset++;
    }

    node_ptr = child_node_ptr;

    i += offset;
  } while (string[i] != DELIMITER);

  // The key ends exactly on an existing (or freshly split) node
  if ((*child_node_ptr)->represents) {
    return false;
  }

  (*child_node_ptr)->represents = true;
  // Without this the key would be marked present while searches for it
  // return no entry.
  (*child_node_ptr)->entry = entry;

  trie->size++;
  return true;
}

/**
 * Add an entry to the trie and, if the trie has a backing file, append it to
 * that file first.
 *
 * This function takes no lock itself; its callers in this file hold the
 * trie's write lock around it.
 *
 * The line written is "<key> <type char> <payload>". Keys containing spaces
 * or payloads containing newlines can't be read back correctly by
 * trie_populate().
 *
 * @param trie trie to add the entry to
 * @param key key to store the entry under
 * @param entry entry to store
 * @return true if the entry was added, false if the key was already present
 * or the backing file couldn't be written
 */
bool trie_add_persistent(Trie *trie, const char *key, Entry *entry) {
  if (trie->file_path != NULL) {
    // Easiest way to make sure we don't add duplicate entries
    // We use an internal function that doesn't require a read lock, as we're
    // already inside a write lock
    if (trie_search_node(trie, key).child != NULL) {
      return false;
    }

    FILE *fp = fopen(trie->file_path, "a");

    if (fp == NULL) {
      return false;
    }

    // An entry may have no payload; passing NULL to a "%s" conversion is
    // undefined, so write an empty payload instead.
    const char *payload = entry->string != NULL ? entry->string : "";

    bool written = fprintf(fp, "%s %c %s\n", key,
                           entry_type_to_char(entry->type), payload) >= 0;

    // fclose flushes the buffered line, so a failure here means the entry
    // may not have reached the file.
    if (fclose(fp) != 0) {
      written = false;
    }

    if (!written) {
      return false;
    }
  }

  // This function *should* always return true. Otherwise, the function would've
  // exited because the string was found in the trie.
  return trie_add_no_lock(trie, key, entry);
}

/**
 * Add an entry to the trie under the given key, persisting it to the backing
 * file when one is configured.
 *
 * The whole operation runs while holding the trie's write lock.
 *
 * @param trie trie to add the entry to
 * @param key key to store the entry under
 * @param entry entry to store
 * @return result of trie_add_persistent()
 */
bool trie_add(Trie *trie, const char *key, Entry *entry) {
  pthread_rwlock_t *lock = &trie->lock;

  pthread_rwlock_wrlock(lock);
  bool added = trie_add_persistent(trie, key, entry);
  pthread_rwlock_unlock(lock);

  return added;
}

/**
 * Add an entry to the trie under a newly generated random key.
 *
 * @param trie trie to add the entry to
 * @param entry entry to store
 * @param secure if true the key has RANDOM_KEY_LENGTH_LONG characters,
 * otherwise RANDOM_KEY_LENGTH_SHORT
 * @return the generated key as a heap-allocated string that the caller must
 * free, or NULL if the entry couldn't be added or memory ran out
 */
char *trie_add_random(Trie *trie, Entry *entry, bool secure) {
  int key_length = secure ? RANDOM_KEY_LENGTH_LONG : RANDOM_KEY_LENGTH_SHORT;

  // Allocate before taking the lock: a failure can then simply return, and
  // the write lock is held for a shorter time.
  char *key = malloc((size_t)key_length + 1);

  if (key == NULL) {
    return NULL;
  }

  key[key_length] = '\0';

  pthread_rwlock_wrlock(&trie->lock);

  // We naively generate new keys until we find a key that isn't in the trie
  // yet. With charset_len ** RANDOM_KEY_LENGTH sufficiently large, this isn't a
  // problem, because the chances of collisions are extremely small.
  //
  // NOTE(review): rand() is not a cryptographically secure generator, so
  // "secure" keys are only longer, not harder to predict. The calls stay
  // inside the write lock so they are serialized between users of this trie.
  do {
    for (int i = 0; i < key_length; i++) {
      key[i] = charset[rand() % charset_len];
    }
  } while (trie_search_node(trie, key).child != NULL);

  if (!trie_add_persistent(trie, key, entry)) {
    free(key);
    key = NULL;
  }

  pthread_rwlock_unlock(&trie->lock);

  return key;
}

/**
 * Remove the given string from a Trie.
 *
 * @param trie trie to remove string from
 * @param string string to remove
 * @return true if the string was in the trie and thus removed, false otherwise
 */
/* bool trie_remove(Trie *trie, const char *string) { */
/*   pthread_rwlock_wrlock(&trie->lock); */

/*   bool return_value = false; */

/*   SearchResult res = trie_search_node(trie, string); */

/*   if (res.child == NULL) { */
/*     goto end; */
/*   } */

/*   trie->size--; */
/*   return_value = true; */

/*   if (res.parent != NULL) { */
/*     // We're removing a full leaf, so we calculate the offset of the
 * character */
/*     // to remove from the parent */
/*     if (res.child->type == 2) { */
/*       size_t str_len = strlen(string); */
/*       size_t suffix_len = strlen(res.child->ptr.string); */

/*       tnode_remove(res.parent, string[str_len - suffix_len - 1]); */
/*     } */
/*     // In the other case, the character to remove from the parent is the last
 */
/*     // character of the string */
/*     else if (res.child->size == 0) { */
/*       size_t i = 0; */

/*       while (string[i + 1] != DELIMITER) { */
/*         i++; */
/*       } */

/*       tnode_remove(res.parent, string[i]); */
/*     } else { */
/*       res.child->type = 0; */

/*       goto end; */
/*     } */

/*     tnode_free(res.child); */
/*   } */
/*   // We're in the root here */
/*   else { */
/*     res.child->type = 0; */
/*   } */

/* end: */
/*   pthread_rwlock_unlock(&trie->lock); */

/*   return return_value; */
/* } */

/**
 * Report how many keys the given trie currently holds.
 *
 * The counter is read without taking the trie's lock.
 *
 * @param trie trie to query
 * @return number of keys stored in the trie
 */
size_t trie_size(Trie *trie) {
  return trie->size;
}