Split off utf8 encoding

develop
Jef Roosens 2020-12-02 16:09:09 +01:00
parent 25af8ba5fc
commit 010fb66cb6
6 changed files with 137 additions and 122 deletions

View File

@ -8,20 +8,18 @@ endif()
set(SYSTEM_TYPE x64)
# =====COMPILER=====
# =====COMMON SETTINGS=====
set(CMAKE_C_COMPILER "clang-11")
set(CMAKE_C_STANDARD 11)
project(stj VERSION 0.1)
# =====COMPILE FLAGS=====
add_definitions(-DVERSION="${CMAKE_PROJECT_VERSION}" -D_XOPEN_SOURCE=600)
# Debug flags
# -g flag gets auto-added by CMake for the debug build
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Weverything -O0 -fsanitize=address -fno-omit-frame-pointer")
# =====BUILD TYPES=====
# Debug
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -g -Wall -O0 -fsanitize=address -fno-omit-frame-pointer")
set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -fno-omit-frame-pointer -fsanitize=address -pedantic")
# Arch doesn't use static libraries
@ -30,13 +28,9 @@ if(NOT EXISTS "/etc/arch-release")
set(CMAKE_EXE_LINKER_FLAGS_DEBUG "${CMAKE_EXE_LINKER_FLAGS_DEBUG} -static-libasan")
endif()
# Release flags
# Release
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -Werror -pedantic-errors")
# MinSizeRel flags
# This one's just here for fun
set(CMAKE_C_FLAGS_MINSIZEREL "${CMAKE_C_FLAGS_MINSIZEREL} -Oz -Werror -pedantic-errors")
# =====EXECUTABLE=====
file(GLOB st_SRC "st/*.c" "st/*.h")

View File

@ -17,6 +17,7 @@
#include <unistd.h>
#include <wchar.h>
#include "utf8.h"
#include "../win.h"
#include "macros.h"
#include "st.h"
@ -30,8 +31,6 @@
#endif
/* Arbitrary sizes */
#define UTF_INVALID 0xFFFD
#define UTF_SIZE 4
#define ESC_BUF_SIZ (128 * UTF_SIZE)
#define ESC_ARG_SIZ 16
#define STR_BUF_SIZ ESC_BUF_SIZ
@ -205,10 +204,6 @@ static void selnormalize(void);
static void selscroll(int, int);
static void selsnap(int *, int *, int);
static size_t utf8decode(const char *, Rune *, size_t);
static Rune utf8decodebyte(char, size_t *);
static char utf8encodebyte(Rune, size_t);
static size_t utf8validate(Rune *, size_t);
static char *base64dec(const char *);
static char base64dec_getc(const char **);
@ -224,11 +219,6 @@ static int iofd = 1;
static int cmdfd;
static pid_t pid;
static uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
static Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
static Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
/**
* Same as write, but ensures that all bytes are written to the descriptor
* (write sometimes isn't able to write all bytes)
@ -299,99 +289,6 @@ char *safe_strdup(char *p_str) {
return p_str;
}
/**
* Decode a given char array into a utf8 Rune
*
* @param p_char char array to decode
* @param p_rune rune pointer to decode to
* @param p_char_len length of the char array
* @return size of the decoded rune
*/
size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len) {
size_t decoded_len, type, i;
Rune decoded_rune;
*p_rune = UTF_INVALID;
if (!p_char_len) // p_char_len is 0, so just return 0
return 0;
decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
if (!BETWEEN(decoded_len, 1, UTF_SIZE))
return 1;
for (i = 1; i < p_char_len && i < decoded_len; ++i) {
decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
if (type != 0)
return i;
}
if (i < decoded_len)
return 0;
*p_rune = decoded_rune;
utf8validate(p_rune, decoded_len);
return decoded_len;
}
/**
* Decode a single byte to UTF-8
*
* @param p_char char to decode
* @param p_i counter used in internal for loop
* @return decoded rune
*/
Rune utf8decodebyte(char p_char, size_t *p_i) {
for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i])
return (uchar)p_char & ~utfmask[*p_i];
}
return 0;
}
/**
* Encode a UTF-8 rune
*
* @param p_rune rune to encode
* @param p_char char array to encode to
* @return
*/
size_t utf8encode(Rune p_rune, char *p_char) {
size_t len, i;
len = utf8validate(&p_rune, 0);
if (len > UTF_SIZE)
return 0;
for (i = len - 1; i != 0; --i) {
p_char[i] = utf8encodebyte(p_rune, 0);
p_rune >>= 6;
}
p_char[0] = utf8encodebyte(p_rune, len);
return len;
}
char utf8encodebyte(Rune u, size_t i) { return utfbyte[i] | (u & ~utfmask[i]); }
/**
* Check if a given rune is a valid UTF-8 rune
*
* @param p_rune rune to validate
* @param i
*/
size_t utf8validate(Rune *p_rune, size_t i) {
if (!BETWEEN(*p_rune, utfmin[i], utfmax[i]) || BETWEEN(*p_rune, 0xD800, 0xDFFF))
*p_rune = UTF_INVALID;
// Count up i until you find a utfmax entry that's greater than *p_rune
for (i = 1; *p_rune > utfmax[i]; ++i);
return i;
}
static const char base64_digits[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

View File

@ -7,6 +7,9 @@
#include <stdint.h>
#include <sys/types.h>
#include "types.h"
#include "utf8.h"
enum glyph_attribute {
ATTR_NULL = 0,
ATTR_BOLD = 1 << 0,
@ -29,12 +32,6 @@ enum selection_type { SEL_REGULAR = 1, SEL_RECTANGULAR = 2 };
enum selection_snap { SNAP_WORD = 1, SNAP_LINE = 2 };
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned short ushort;
typedef uint_least32_t Rune;
#define Glyph Glyph_
typedef struct {

9
src/st/types.h 100644
View File

@ -0,0 +1,9 @@
#ifndef TYPES_H
#define TYPES_H
typedef unsigned char uchar;
typedef unsigned int uint;
typedef unsigned long ulong;
typedef unsigned short ushort;
#endif

101
src/st/utf8.c 100644
View File

@ -0,0 +1,101 @@
#include "utf8.h"
#include "macros.h"
#include "types.h"
static uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
static Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
static Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
/**
* Decode a given char array into a utf8 Rune
*
* @param p_char char array to decode
* @param p_rune rune pointer to decode to
* @param p_char_len length of the char array
* @return size of the decoded rune
*/
size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len) {
size_t decoded_len, type, i;
Rune decoded_rune;
*p_rune = UTF_INVALID;
if (!p_char_len) // p_char_len is 0, so just return 0
return 0;
decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
if (!BETWEEN(decoded_len, 1, UTF_SIZE))
return 1;
for (i = 1; i < p_char_len && i < decoded_len; ++i) {
decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
if (type != 0)
return i;
}
if (i < decoded_len)
return 0;
*p_rune = decoded_rune;
utf8validate(p_rune, decoded_len);
return decoded_len;
}
/**
* Decode a single byte to UTF-8
*
* @param p_char char to decode
* @param p_i counter used in internal for loop
* @return decoded rune
*/
Rune utf8decodebyte(char p_char, size_t *p_i) {
for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i])
return (uchar)p_char & ~utfmask[*p_i];
}
return 0;
}
/**
* Encode a UTF-8 rune
*
* @param p_rune rune to encode
* @param p_char char array to encode to
* @return
*/
size_t utf8encode(Rune p_rune, char *p_char) {
size_t len, i;
len = utf8validate(&p_rune, 0);
if (len > UTF_SIZE)
return 0;
for (i = len - 1; i != 0; --i) {
p_char[i] = utf8encodebyte(p_rune, 0);
p_rune >>= 6;
}
p_char[0] = utf8encodebyte(p_rune, len);
return len;
}
char utf8encodebyte(Rune u, size_t i) { return utfbyte[i] | (u & ~utfmask[i]); }
/**
* Check if a given rune is a valid UTF-8 rune
*
* @param p_rune rune to validate
* @param i
*/
size_t utf8validate(Rune *p_rune, size_t i) {
if (!BETWEEN(*p_rune, utfmin[i], utfmax[i]) || BETWEEN(*p_rune, 0xD800, 0xDFFF))
*p_rune = UTF_INVALID;
// Count up i until you find a utfmax entry that's greater than *p_rune
for (i = 1; *p_rune > utfmax[i]; ++i);
return i;
}

17
src/st/utf8.h 100644
View File

@ -0,0 +1,17 @@
#ifndef UTF8_H
#define UTF8_H
#include <stdlib.h>
#include <stdint.h>
#define UTF_INVALID 0xFFFD
#define UTF_SIZE 4
typedef uint_least32_t Rune;
size_t utf8decode(const char *, Rune *, size_t);
Rune utf8decodebyte(char, size_t *);
char utf8encodebyte(Rune, size_t);
size_t utf8validate(Rune *, size_t);
#endif