Split off utf8 encoding

2020-12-02 16:09:09 +01:00 · 2020-12-02 16:09:09 +01:00 · 010fb66cb6
parent 25af8ba5fc
commit 010fb66cb6
6 changed files with 137 additions and 122 deletions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -8,20 +8,18 @@ endif()
 set(SYSTEM_TYPE x64)


-# =====COMPILER=====
+# =====COMMON SETTINGS=====
 set(CMAKE_C_COMPILER "clang-11")
 set(CMAKE_C_STANDARD 11)

-
 project(stj VERSION 0.1)

-
-# =====COMPILE FLAGS=====
 add_definitions(-DVERSION="${CMAKE_PROJECT_VERSION}" -D_XOPEN_SOURCE=600)

-# Debug flags
-# -g flag gets auto-added by CMake for the debug build
-set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Weverything -O0 -fsanitize=address -fno-omit-frame-pointer")
+
+# =====BUILD TYPES=====
+# Debug
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Wall -O0 -fsanitize=address -fno-omit-frame-pointer")
 set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address -pedantic")

 # Arch doesn't use static libraries
@ -30,13 +28,9 @@ if(NOT EXISTS "/etc/arch-release")
    set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -static-libasan")
 endif()

-# Release flags
+# Release
 set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -Werror -pedantic-errors")

-# MinSizeRel flags
-# This one's just here for fun
-set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} -Oz -Werror -pedantic-errors")
-

 # =====EXECUTABLE=====
 file(GLOB st_SRC "st/*.c" "st/*.h")
--- a/src/st/st.c
+++ b/src/st/st.c
@ -17,6 +17,7 @@
 #include <unistd.h>
 #include <wchar.h>

+#include "utf8.h"
 #include "../win.h"
 #include "macros.h"
 #include "st.h"
@ -30,8 +31,6 @@
 #endif

 /* Arbitrary sizes */
-#define UTF_INVALID 0xFFFD
-#define UTF_SIZE 4
 #define ESC_BUF_SIZ (128 * UTF_SIZE)
 #define ESC_ARG_SIZ 16
 #define STR_BUF_SIZ ESC_BUF_SIZ
@ -205,10 +204,6 @@ static void selnormalize(void);
 static void selscroll(int, int);
 static void selsnap(int *, int *, int);

-static size_t utf8decode(const char *, Rune *, size_t);
-static Rune utf8decodebyte(char, size_t *);
-static char utf8encodebyte(Rune, size_t);
-static size_t utf8validate(Rune *, size_t);

 static char *base64dec(const char *);
 static char base64dec_getc(const char **);
@ -224,11 +219,6 @@ static int iofd = 1;
 static int cmdfd;
 static pid_t pid;

-static uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
-static uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
-static Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
-static Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
-
 /**
 * Same as write, but ensures that all bytes are written to the descriptor
 * (write sometimes isn't able to write all bytes)
@ -299,99 +289,6 @@ char *safe_strdup(char *p_str) {
    return p_str;
 }

-/**
- * Decode a given char array into a utf8 Rune
- *
- * @param p_char char array to decode
- * @param p_rune rune pointer to decode to
- * @param p_char_len length of the char array
- * @return size of the decoded rune
- */
-size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len) {
-    size_t decoded_len, type, i;
-    Rune decoded_rune;
-
-    *p_rune = UTF_INVALID;
-    if (!p_char_len) // p_char_len is 0, so just return 0
-        return 0;
-
-    decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
-    if (!BETWEEN(decoded_len, 1, UTF_SIZE))
-        return 1;
-
-    for (i = 1; i < p_char_len && i < decoded_len; ++i) {
-        decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
-
-        if (type != 0)
-            return i;
-    }
-
-    if (i < decoded_len)
-        return 0;
-
-    *p_rune = decoded_rune;
-    utf8validate(p_rune, decoded_len);
-
-    return decoded_len;
-}
-
-/**
- * Decode a single byte to UTF-8
- *
- * @param p_char char to decode
- * @param p_i counter used in internal for loop
- * @return decoded rune
- */
-Rune utf8decodebyte(char p_char, size_t *p_i) {
-    for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
-        if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i])
-            return (uchar)p_char & ~utfmask[*p_i];
-    }
-
-    return 0;
-}
-
-/**
- * Encode a UTF-8 rune
- *
- * @param p_rune rune to encode
- * @param p_char char array to encode to
- * @return 
- */
-size_t utf8encode(Rune p_rune, char *p_char) {
-    size_t len, i;
-
-    len = utf8validate(&p_rune, 0);
-    if (len > UTF_SIZE)
-        return 0;
-
-    for (i = len - 1; i != 0; --i) {
-        p_char[i] = utf8encodebyte(p_rune, 0);
-        p_rune >>= 6;
-    }
-    p_char[0] = utf8encodebyte(p_rune, len);
-
-    return len;
-}
-
-char utf8encodebyte(Rune u, size_t i) { return utfbyte[i] | (u & ~utfmask[i]); }
-
-/**
- * Check if a given rune is a valid UTF-8 rune
- *
- * @param p_rune rune to validate
- * @param i 
- */
-size_t utf8validate(Rune *p_rune, size_t i) {
-    if (!BETWEEN(*p_rune, utfmin[i], utfmax[i]) || BETWEEN(*p_rune, 0xD800, 0xDFFF))
-        *p_rune = UTF_INVALID;
-
-    // Count up i until you find a utfmax entry that's greater than *p_rune
-    for (i = 1; *p_rune > utfmax[i]; ++i);
-
-    return i;
-}
-
 static const char base64_digits[] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
--- a/src/st/st.h
+++ b/src/st/st.h
@ -7,6 +7,9 @@
 #include <stdint.h>
 #include <sys/types.h>

+#include "types.h"
+#include "utf8.h"
+
 enum glyph_attribute {
    ATTR_NULL = 0,
    ATTR_BOLD = 1 << 0,
@ -29,12 +32,6 @@ enum selection_type { SEL_REGULAR = 1, SEL_RECTANGULAR = 2 };

 enum selection_snap { SNAP_WORD = 1, SNAP_LINE = 2 };

-typedef unsigned char uchar;
-typedef unsigned int uint;
-typedef unsigned long ulong;
-typedef unsigned short ushort;
-
-typedef uint_least32_t Rune;

 #define Glyph Glyph_
 typedef struct {
--- a/src/st/types.h
+++ b/src/st/types.h
@ -0,0 +1,9 @@
+#ifndef TYPES_H
+#define TYPES_H
+
+typedef unsigned char uchar;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef unsigned short ushort;
+
+#endif
--- a/src/st/utf8.c
+++ b/src/st/utf8.c
@ -0,0 +1,101 @@
+#include "utf8.h"
+#include "macros.h"
+#include "types.h"
+
+static uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
+static uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
+static Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
+static Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
+
+/**
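+ * Decode the first UTF-8 sequence of a char array into a Rune
+ *
+ * @param p_char char array to decode
+ * @param p_rune receives the decoded rune; left as UTF_INVALID when decoding or validation fails
+ * @param p_char_len length of the char array
+ * @return bytes consumed: the sequence length on success, 0 if the input is empty or truncated, 1 for an invalid lead byte, or the offset of the first bad continuation byte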
+ */
+size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len) {
+    size_t decoded_len, type, i;
+    Rune decoded_rune;
+
+    *p_rune = UTF_INVALID;
+    if (!p_char_len) // p_char_len is 0, so just return 0
+        return 0;
+
+    decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
+    if (!BETWEEN(decoded_len, 1, UTF_SIZE))
+        return 1;
+
+    for (i = 1; i < p_char_len && i < decoded_len; ++i) {
+        decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
+
+        if (type != 0)
+            return i;
+    }
+
+    if (i < decoded_len)
+        return 0;
+
+    *p_rune = decoded_rune;
+    utf8validate(p_rune, decoded_len);
+
+    return decoded_len;
+}
+
+/**
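+ * Classify a single UTF-8 byte and extract its payload bits
+ *
+ * @param p_char byte to decode
+ * @param p_i receives the matching utfmask/utfbyte index: 0 for a continuation byte, 1-4 for the lead byte of a sequence of that length, LEN(utfmask) if nothing matches
+ * @return payload bits of the byte with the marker bits masked off, or 0 if nothing matches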
+ */
+Rune utf8decodebyte(char p_char, size_t *p_i) {
+    for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
+        if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i])
+            return (uchar)p_char & ~utfmask[*p_i];
+    }
+
+    return 0;
+}
+
+/**
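+ * Encode a rune as a UTF-8 byte sequence
+ *
+ * @param p_rune rune to encode; values that fail utf8validate are replaced by UTF_INVALID first
+ * @param p_char char array to encode to (no terminating NUL is written)
+ * @return number of bytes written to p_char, or 0 if the computed length exceeds UTF_SIZE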
+ */
+size_t utf8encode(Rune p_rune, char *p_char) {
+    size_t len, i;
+
+    len = utf8validate(&p_rune, 0);
+    if (len > UTF_SIZE)
+        return 0;
+
+    for (i = len - 1; i != 0; --i) {
+        p_char[i] = utf8encodebyte(p_rune, 0);
+        p_rune >>= 6;
+    }
+    p_char[0] = utf8encodebyte(p_rune, len);
+
+    return len;
+}
+
+char utf8encodebyte(Rune u, size_t i) { return utfbyte[i] | (u & ~utfmask[i]); }
+
+/**
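+ * Replace an out-of-range or surrogate rune with UTF_INVALID and return the number of bytes needed to encode the result
+ *
+ * @param p_rune rune to validate; overwritten with UTF_INVALID if it fails the checks
+ * @param i sequence length selecting the utfmin/utfmax bounds to check against (0 allows the full 0..0x10FFFF range)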
+ */
+size_t utf8validate(Rune *p_rune, size_t i) {
+    if (!BETWEEN(*p_rune, utfmin[i], utfmax[i]) || BETWEEN(*p_rune, 0xD800, 0xDFFF))
+        *p_rune = UTF_INVALID;
+
+    // Count up i until you find a utfmax entry that's greater than or equal to *p_rune
+    for (i = 1; *p_rune > utfmax[i]; ++i);
+
+    return i;
+}
--- a/src/st/utf8.h
+++ b/src/st/utf8.h
@ -0,0 +1,17 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define UTF_INVALID 0xFFFD
+#define UTF_SIZE 4
+
+typedef uint_least32_t Rune;
+
+size_t utf8decode(const char *, Rune *, size_t);
+Rune utf8decodebyte(char, size_t *);
+char utf8encodebyte(Rune, size_t);
+size_t utf8validate(Rune *, size_t);
+
+#endif