#include "utf8.h"
#include "macros.h"
#include "types.h"

/*
 * Lookup tables indexed by byte class:
 *   index 0      - continuation byte (bit pattern 10xxxxxx)
 *   index 1..4   - lead byte of a sequence that is 1..4 bytes long
 *
 * utfbyte[n] is the fixed marker-bit pattern of class n and utfmask[n]
 * selects those marker bits; the bits outside the mask are payload.
 * utfmin[n]/utfmax[n] bound the code points a sequence of n bytes may
 * carry (index 0 spans the whole accepted range 0..0x10FFFF).
 *
 * The tables are read-only, hence const.
 */
static const uchar utfbyte[UTF_SIZE + 1] = {0x80, 0, 0xC0, 0xE0, 0xF0};
static const uchar utfmask[UTF_SIZE + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
static const Rune utfmin[UTF_SIZE + 1] = {0, 0, 0x80, 0x800, 0x10000};
static const Rune utfmax[UTF_SIZE + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

/**
 * Decode the first UTF-8 sequence of a byte buffer into a Rune.
 *
 * *p_rune is set to UTF_INVALID up front and is only overwritten once a
 * complete sequence has been read; the decoded value is then passed
 * through utf8validate(), which may turn it back into UTF_INVALID.
 *
 * @param p_char     bytes to decode
 * @param p_rune     receives the decoded rune, or UTF_INVALID
 * @param p_char_len number of bytes available in p_char
 * @return number of bytes consumed:
 *         - 0 if p_char_len is 0 or the buffer ends before the sequence
 *           announced by the lead byte is complete,
 *         - 1 if the first byte is not a valid lead byte,
 *         - the count of bytes preceding the first byte that is not a
 *           continuation byte, if the sequence is broken,
 *         - otherwise the sequence length announced by the lead byte.
 */
size_t utf8decode(const char *p_char, Rune *p_rune, size_t p_char_len)
{
	size_t decoded_len, type, i;
	Rune decoded_rune;

	*p_rune = UTF_INVALID;

	/* Nothing to decode. */
	if (!p_char_len) {
		return 0;
	}

	/* The class of the lead byte is the expected sequence length. */
	decoded_rune = utf8decodebyte(p_char[0], &decoded_len);
	if (!BETWEEN(decoded_len, 1, UTF_SIZE)) {
		return 1;
	}

	/* Every following byte contributes six payload bits. */
	for (i = 1; i < p_char_len && i < decoded_len; ++i) {
		decoded_rune = (decoded_rune << 6) | utf8decodebyte(p_char[i], &type);
		/* Class 0 is the continuation class; anything else breaks the sequence. */
		if (type != 0) {
			return i;
		}
	}

	/* The buffer ran out before the sequence was complete. */
	if (i < decoded_len) {
		return 0;
	}

	*p_rune = decoded_rune;
	utf8validate(p_rune, decoded_len);

	return decoded_len;
}

/**
 * Classify a single byte and extract its payload bits.
 *
 * The byte is compared against each class in utfmask/utfbyte, starting
 * with class 0, and the first matching class wins.
 *
 * @param p_char byte to classify
 * @param p_i    output: index of the matching class (0 = continuation
 *               byte, 1..4 = lead byte of a sequence of that length),
 *               or LEN(utfmask) when no class matches
 * @return payload bits of the byte (marker bits cleared), or 0 when no
 *         class matches
 */
Rune utf8decodebyte(char p_char, size_t *p_i)
{
	for (*p_i = 0; *p_i < LEN(utfmask); ++(*p_i)) {
		if (((uchar)p_char & utfmask[*p_i]) == utfbyte[*p_i]) {
			return (uchar)p_char & ~utfmask[*p_i];
		}
	}

	return 0;
}

/**
 * Encode a rune as a UTF-8 byte sequence.
 *
 * The rune is first passed through utf8validate() with class 0, so a
 * rune that is rejected there is encoded as UTF_INVALID instead.
 * No terminating NUL byte is written.
 *
 * @param p_rune rune to encode
 * @param p_char output buffer; up to UTF_SIZE bytes are written
 * @return number of bytes written, or 0 if the rune does not fit into
 *         UTF_SIZE bytes
 */
size_t utf8encode(Rune p_rune, char *p_char)
{
	size_t len, i;

	len = utf8validate(&p_rune, 0);
	if (len > UTF_SIZE) {
		return 0;
	}

	/* Fill the continuation bytes back to front, six bits at a time. */
	for (i = len - 1; i != 0; --i) {
		p_char[i] = utf8encodebyte(p_rune, 0);
		p_rune >>= 6;
	}
	/* What is left of the rune goes into the lead byte. */
	p_char[0] = utf8encodebyte(p_rune, len);

	return len;
}

/**
 * Build one byte of a UTF-8 sequence.
 *
 * @param u rune whose low-order bits provide the payload
 * @param i byte class to produce (0 = continuation byte, 1..4 = lead
 *          byte of a sequence of that length); must be a valid index
 *          into utfbyte/utfmask
 * @return marker bits of class i combined with the payload bits of u
 */
char utf8encodebyte(Rune u, size_t i)
{
	/* The narrowing to char keeps only the low byte; make it explicit. */
	return (char)(utfbyte[i] | (u & ~utfmask[i]));
}

/**
 * Validate a rune and compute its encoded length.
 *
 * The rune is replaced by UTF_INVALID when it lies outside
 * [utfmin[i], utfmax[i]], when it falls into the surrogate range
 * 0xD800..0xDFFF, or when i is not a valid table index.
 *
 * @param p_rune rune to validate; overwritten with UTF_INVALID when it
 *               is rejected
 * @param i      byte class whose range the rune is checked against
 *               (a sequence length 1..4, or 0 for the overall range
 *               0..0x10FFFF)
 * @return number of bytes needed to encode the (possibly replaced)
 *         rune; LEN(utfmax) if it exceeds every entry of utfmax
 */
size_t utf8validate(Rune *p_rune, size_t i)
{
	/* Check i first so the table lookups below never read out of bounds. */
	if (i >= LEN(utfmax)
			|| !BETWEEN(*p_rune, utfmin[i], utfmax[i])
			|| BETWEEN(*p_rune, 0xD800, 0xDFFF)) {
		*p_rune = UTF_INVALID;
	}

	/*
	 * Find the smallest sequence length whose upper bound covers the
	 * rune. The index is bounded so the search stays inside utfmax even
	 * if the rune is larger than every entry.
	 */
	for (i = 1; i < LEN(utfmax) && *p_rune > utfmax[i]; ++i) {
		/* nothing */
	}

	return i;
}