This repository has been archived on 2021-04-22. You can view files and clone it, but cannot push or open issues/pull-requests.
stj/src/st/utf8.c

104 lines
2.5 KiB
C

#include "utf8.h"
#include "macros.h"
#include "types.h"
static uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
static Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
static Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};
/**
* Decode a given char array into a utf8 Rune
*
* @param p_char char array to decode
* @param p_rune rune pointer to decode to
* @param p_char_len length of the char array
* @return size of the decoded rune
*/
size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len) {
size_t decoded_len, type, i;
Rune decoded_rune;
*p_rune = UTF_INVALID;
if (!p_char_len) // p_char_len is 0, so just return 0
return 0;
decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
if (!BETWEEN(decoded_len, 1, UTF_SIZE))
return 1;
for (i = 1; i < p_char_len && i < decoded_len; ++i) {
decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
if (type != 0)
return i;
}
if (i < decoded_len)
return 0;
*p_rune = decoded_rune;
utf8validate(p_rune, decoded_len);
return decoded_len;
}
/**
* Decode a single byte to UTF-8
*
* @param p_char char to decode
* @param p_i counter used in internal for loop
* @return decoded rune
*/
Rune utf8decodebyte(char p_char, size_t *p_i) {
for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i])
return (uchar)p_char & ~utfmask[*p_i];
}
return 0;
}
/**
* Encode a UTF-8 rune
*
* @param p_rune rune to encode
* @param p_char char array to encode to
* @return
*/
size_t utf8encode(Rune p_rune, char *p_char) {
size_t len, i;
len = utf8validate(&p_rune, 0);
if (len > UTF_SIZE)
return 0;
for (i = len - 1; i != 0; --i) {
p_char[i] = utf8encodebyte(p_rune, 0);
p_rune >>= 6;
}
p_char[0] = utf8encodebyte(p_rune, len);
return len;
}
char utf8encodebyte(Rune u, size_t i) { return utfbyte[i] | (u & ~utfmask[i]); }
/**
* Check if a given rune is a valid UTF-8 rune
*
* @param p_rune rune to validate
* @param i
*/
size_t utf8validate(Rune *p_rune, size_t i) {
if (!BETWEEN(*p_rune, utfmin[i], utfmax[i]) ||
BETWEEN(*p_rune, 0xD800, 0xDFFF))
*p_rune = UTF_INVALID;
// Count up i until you find a utfmax entry that's greater than *p_rune
for (i = 1; *p_rune > utfmax[i]; ++i)
;
return i;
}