166 lines
5.5 KiB
C
166 lines
5.5 KiB
C
|
/*
|
||
|
Copyright (c) 2015, Andreas Fett
|
||
|
All rights reserved.
|
||
|
Redistribution and use in source and binary forms, with or without
|
||
|
modification, are permitted provided that the following conditions are met:
|
||
|
* Redistributions of source code must retain the above copyright notice, this
|
||
|
list of conditions and the following disclaimer.
|
||
|
* Redistributions in binary form must reproduce the above copyright notice,
|
||
|
this list of conditions and the following disclaimer in the documentation
|
||
|
and/or other materials provided with the distribution.
|
||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
|
||
|
|
||
|
/*
 * Validator state. 0 is the start state (no multi-byte sequence pending),
 * -1 is the error state; any other value encodes a position inside a
 * multi-byte sequence.
 */
typedef int utf8_state;
|
||
|
|
||
|
/* Forward declaration of the state transition function defined further below. */
static utf8_state next_state(utf8_state state, unsigned char c);
|
||
|
|
||
|
// Public API: see utf8-validate.h for documentation of the following functions
|
||
|
|
||
|
/*
 * Feed one byte to the validator.
 *
 * state: validator state, must not be NULL; it is overwritten with the
 *        state reached after consuming the byte.
 * c:     the byte to consume (converted to unsigned char).
 *
 * Returns true if the byte was accepted, false if it moved the validator
 * into the error state (-1).
 */
bool utf8_validate(utf8_state *const state, int c)
{
	assert(state);

	const utf8_state following = next_state(*state, (unsigned char)c);
	*state = following;

	if (following == -1) {
		return false;
	}
	return true;
}
|
||
|
|
||
|
/*
 * Feed a buffer of bytes to the validator, continuing from *state.
 *
 * state: validator state, must not be NULL; updated after every byte.
 * src:   start of the buffer, must not be NULL.
 * len:   number of bytes to consume.
 *
 * Returns false as soon as a byte drives the validator into the error
 * state (-1); *state then holds -1. Returns true otherwise, even when the
 * buffer ends in the middle of a multi-byte sequence: the pending position
 * is kept in *state so a later call can continue from there.
 */
bool utf8_validate_some(utf8_state *const state, const void * const src, size_t len)
{
	assert(state);
	assert(src);

	const unsigned char *cursor = src;
	size_t remaining = len;

	while (remaining > 0) {
		*state = next_state(*state, *cursor);
		if (*state == -1) {
			return false;
		}
		++cursor;
		--remaining;
	}

	return true;
}
|
||
|
|
||
|
bool utf8_validate_mem(const void * const src, size_t len)
|
||
|
{
|
||
|
assert(src);
|
||
|
utf8_state state = 0;
|
||
|
for (size_t i = 0; i < len; ++i) {
|
||
|
state = next_state(state, *((unsigned char *)src + i));
|
||
|
if (state == -1) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// detect unterminated sequence
|
||
|
return state == 0;
|
||
|
}
|
||
|
|
||
|
bool utf8_validate_str(const char * const str)
|
||
|
{
|
||
|
assert(str);
|
||
|
utf8_state state = 0;
|
||
|
for (size_t i = 0; str[i] != 0; ++i) {
|
||
|
state = next_state(state, str[i]);
|
||
|
if (state == -1) {
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
// detect unterminated sequence
|
||
|
return state == 0;
|
||
|
}
|
||
|
|
||
|
/* Private state engine
 *
 * The macros below assemble the cases of a switch statement
 * matching the language of the ABNF grammar given in RFC 3629.
 *
 * Each SEQ# macro adds the states needed to match a #-byte long sequence.
 *
 * Every SEQ#_HELPER 'falls through' to the next sequence when its first
 * byte does not match; this is done with an explicit goto to the label
 * that closes the sequence.
 */
|
||
|
|
||
|
/* Builds the label name SEQ_<n>_END that marks the end of sequence n's cases. */
#define SEQ_END(n) SEQ_ ## n ## _END
|
||
|
|
||
|
/*
 * Cases for the one-byte sequence number s.
 * If matcher r0 accepts the byte the character is complete (state 0);
 * otherwise control jumps to the end label and continues with the cases
 * that follow.
 */
#define SEQ1_HELPER(s, r0) \
	case ((s) * 4) + 0: \
		if (!(r0)) goto SEQ_END(s); \
		return 0; \
	SEQ_END(s):
|
||
|
|
||
|
/*
 * Cases for the two-byte sequence number s.
 *
 * (s * 4) + 0: if r0 accepts the lead byte, move on to (s * 4) + 1;
 *              otherwise jump to the end label and try the next sequence.
 * (s * 4) + 1: if r1 accepts the second byte the character is complete
 *              (state 0), otherwise the input is invalid (-1).
 *
 * The expansion must not produce any output; it only computes the next state.
 */
#define SEQ2_HELPER(s, r0, r1) \
	case (s * 4) + 0: if (r0) return (s * 4) + 1; goto SEQ_END(s); \
	case (s * 4) + 1: if (r1) return 0; return -1; \
	SEQ_END(s):
|
||
|
|
||
|
/*
 * Cases for the three-byte sequence number s.
 * r0 matches the lead byte; on a mismatch control jumps to the end label
 * and continues with the cases that follow. r1 and r2 match the second and
 * third byte; a mismatch there yields the error state (-1). A match of the
 * last byte completes the character (state 0).
 */
#define SEQ3_HELPER(s, r0, r1, r2) \
	case ((s) * 4) + 0: \
		if (!(r0)) goto SEQ_END(s); \
		return ((s) * 4) + 1; \
	case ((s) * 4) + 1: \
		if (!(r1)) return -1; \
		return ((s) * 4) + 2; \
	case ((s) * 4) + 2: \
		if (!(r2)) return -1; \
		return 0; \
	SEQ_END(s):
|
||
|
|
||
|
/*
 * Cases for the four-byte sequence number s.
 * r0 matches the lead byte; on a mismatch control jumps to the end label
 * and continues with the cases that follow. r1, r2 and r3 match the
 * remaining bytes; a mismatch there yields the error state (-1). A match of
 * the last byte completes the character (state 0).
 */
#define SEQ4_HELPER(s, r0, r1, r2, r3) \
	case ((s) * 4) + 0: \
		if (!(r0)) goto SEQ_END(s); \
		return ((s) * 4) + 1; \
	case ((s) * 4) + 1: \
		if (!(r1)) return -1; \
		return ((s) * 4) + 2; \
	case ((s) * 4) + 2: \
		if (!(r2)) return -1; \
		return ((s) * 4) + 3; \
	case ((s) * 4) + 3: \
		if (!(r3)) return -1; \
		return 0; \
	SEQ_END(s):
|
||
|
|
||
|
/*
 * Entry points used inside the switch of next_state(); each one simply
 * forwards its arguments to the helper of the same arity.
 */
#define SEQ1(seq, m0) SEQ1_HELPER(seq, m0)
#define SEQ2(seq, m0, m1) SEQ2_HELPER(seq, m0, m1)
#define SEQ3(seq, m0, m1, m2) SEQ3_HELPER(seq, m0, m1, m2)
#define SEQ4(seq, m0, m1, m2, m3) SEQ4_HELPER(seq, m0, m1, m2, m3)
|
||
|
|
||
|
// Matcher macros
|
||
|
|
||
|
/*
 * Byte matchers. Each expands to a boolean expression that tests the
 * variable `c`, which must be in scope at the point of use.
 * Arguments are parenthesised so that expression arguments bind as a unit.
 */

/* true if c equals v */
#define VALUE(v) (c == (v))
/* true if c lies in the inclusive range s..e */
#define RANGE(s, e) (c >= (s) && c <= (e))
/* Same result as RANGE. Workaround for "-Wtype-limits": with an unsigned
 * char c and s == 0 the comparison c >= s is always true, so the lower
 * bound is tested as (c >= s + 1 || c == s) instead. */
#define EGNAR(s, e) ((c >= (s) + 1 && c <= (e)) || c == (s))
|
||
|
|
||
|
/* from rfc3629
|
||
|
*
|
||
|
* UTF8-octets = *( UTF8-char )
|
||
|
* UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
|
||
|
* UTF8-1 = %x00-7F
|
||
|
* UTF8-2 = %xC2-DF UTF8-tail
|
||
|
* UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
|
||
|
* %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
|
||
|
* UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
|
||
|
* %xF4 %x80-8F 2( UTF8-tail )
|
||
|
* UTF8-tail = %x80-BF
|
||
|
*/
|
||
|
|
||
|
/* UTF8-tail = %x80-BF: matches a continuation byte */
#define TAIL RANGE(0x80, 0xBF)
|
||
|
|
||
|
/*
 * Compute the state reached from `state` after consuming the byte `c`.
 *
 * States are numbered (s * 4) + k, where s is the index of the sequence
 * (the first argument of the SEQ# lines below) and k is the number of bytes
 * of that sequence already matched. State 0 is the start state: from there
 * every sequence start is tried in turn, each falling through to the next
 * one when its first byte does not match.
 *
 * Returns 0 when `c` completes a character, a positive state when more
 * bytes are expected, and -1 when `c` is not valid in `state`.
 *
 * This function produces no output and has no side effects. Passing a
 * value that is not a known state (including the error state -1) trips the
 * assertion; when assertions are compiled out, -1 is returned.
 */
static inline utf8_state next_state(utf8_state state, unsigned char c)
{
	switch (state) {
	SEQ1(0, EGNAR(0x00, 0x7F))
	SEQ2(1, RANGE(0xC2, 0xDF), TAIL)
	SEQ3(2, VALUE(0xE0), RANGE(0xA0, 0xBF), TAIL)
	SEQ3(3, RANGE(0xE1, 0xEC), TAIL, TAIL)
	SEQ3(4, VALUE(0xED), RANGE(0x80, 0x9F), TAIL)
	SEQ3(5, RANGE(0xEE, 0xEF), TAIL, TAIL)
	SEQ4(6, VALUE(0xF0), RANGE(0x90, 0xBF), TAIL, TAIL)
	SEQ4(7, RANGE(0xF1, 0xF3), TAIL, TAIL, TAIL)
	SEQ4(8, VALUE(0xF4), RANGE(0x80, 0x8F), TAIL, TAIL)
		// no sequence start matched
		break;
	default:
		/*
		 * This should not happen, unless you feed an error
		 * state or an uninitialized utf8_state to this function.
		 */
		assert(false && "invalid utf8 state");
		break;
	}

	return -1;
}
|