|
|
|
/* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */
|
|
|
|
|
|
|
|
#include "utf8.h"
|
|
|
|
#include <errno.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
|
|
|
/* I loved this table, so I stole it: */
|
|
|
|
/*
|
|
|
|
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
|
|
|
|
* <https://github.com/chansen/c-utf8-valid>
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions are met:
|
|
|
|
*
|
|
|
|
* 1. Redistributions of source code must retain the above copyright notice, this
|
|
|
|
* list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
* this list of conditions and the following disclaimer in the documentation
|
|
|
|
* and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
|
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
|
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* UTF-8 Encoding Form
|
|
|
|
*
|
|
|
|
* U+0000..U+007F 0xxxxxxx <= 7 bits
|
|
|
|
* U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits
|
|
|
|
* U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits
|
|
|
|
* U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits
|
|
|
|
*
|
|
|
|
*
|
|
|
|
* U+0000..U+007F 00..7F
|
|
|
|
* N C0..C1 80..BF 1100000x 10xxxxxx
|
|
|
|
* U+0080..U+07FF C2..DF 80..BF
|
|
|
|
* N E0 80..9F 80..BF 11100000 100xxxxx
|
|
|
|
* U+0800..U+0FFF E0 A0..BF 80..BF
|
|
|
|
* U+1000..U+CFFF E1..EC 80..BF 80..BF
|
|
|
|
* U+D000..U+D7FF ED 80..9F 80..BF
|
|
|
|
* S ED A0..BF 80..BF 11101101 101xxxxx
|
|
|
|
* U+E000..U+FFFF EE..EF 80..BF 80..BF
|
|
|
|
* N F0 80..8F 80..BF 80..BF 11110000 1000xxxx
|
|
|
|
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
|
|
|
|
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
|
|
|
|
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx
|
|
|
|
*
|
|
|
|
* Legend:
|
|
|
|
* N = Non-shortest form
|
|
|
|
* S = Surrogates
|
|
|
|
*/
|
|
|
|
bool utf8_decode(struct utf8_state *utf8_state, char c)
|
|
|
|
{
|
|
|
|
if (utf8_state->used_len == utf8_state->total_len) {
|
|
|
|
utf8_state->used_len = 1;
|
|
|
|
/* First character in sequence. */
|
|
|
|
if (((unsigned char)c & 0x80) == 0) {
|
|
|
|
/* ASCII, easy. */
|
|
|
|
if (c == 0)
|
|
|
|
goto bad_encoding;
|
|
|
|
utf8_state->total_len = 1;
|
|
|
|
utf8_state->c = c;
|
|
|
|
goto finished_decoding;
|
|
|
|
} else if (((unsigned char)c & 0xE0) == 0xC0) {
|
|
|
|
utf8_state->total_len = 2;
|
|
|
|
utf8_state->c = ((unsigned char)c & 0x1F);
|
|
|
|
return false;
|
|
|
|
} else if (((unsigned char)c & 0xF0) == 0xE0) {
|
|
|
|
utf8_state->total_len = 3;
|
|
|
|
utf8_state->c = ((unsigned char)c & 0x0F);
|
|
|
|
return false;
|
|
|
|
} else if (((unsigned char)c & 0xF8) == 0xF0) {
|
|
|
|
utf8_state->total_len = 4;
|
|
|
|
utf8_state->c = ((unsigned char)c & 0x07);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
goto bad_encoding;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (((unsigned char)c & 0xC0) != 0x80)
|
|
|
|
goto bad_encoding;
|
|
|
|
|
|
|
|
utf8_state->c <<= 6;
|
|
|
|
utf8_state->c |= ((unsigned char)c & 0x3F);
|
|
|
|
|
|
|
|
utf8_state->used_len++;
|
|
|
|
if (utf8_state->used_len == utf8_state->total_len)
|
|
|
|
goto finished_decoding;
|
|
|
|
return false;
|
|
|
|
|
|
|
|
finished_decoding:
|
|
|
|
if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
|
|
|
|
errno = ERANGE;
|
|
|
|
/* The UTF-16 "surrogate range": illegal in UTF-8 */
|
|
|
|
else if (utf8_state->total_len == 3
|
|
|
|
&& (utf8_state->c & 0xFFFFF800) == 0x0000D800)
|
|
|
|
errno = ERANGE;
|
|
|
|
else {
|
|
|
|
int min_bits;
|
|
|
|
switch (utf8_state->total_len) {
|
|
|
|
case 1:
|
|
|
|
min_bits = 0;
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
min_bits = 7;
|
|
|
|
break;
|
|
|
|
case 3:
|
|
|
|
min_bits = 11;
|
|
|
|
break;
|
|
|
|
case 4:
|
|
|
|
min_bits = 16;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
abort();
|
|
|
|
}
|
|
|
|
if ((utf8_state->c >> min_bits) == 0)
|
|
|
|
errno = EFBIG;
|
|
|
|
else
|
|
|
|
errno = 0;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
|
|
|
|
bad_encoding:
|
|
|
|
utf8_state->total_len = utf8_state->used_len;
|
|
|
|
errno = EINVAL;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Encode a single code point as UTF-8 into dest.
 *
 * Returns the number of bytes written (1 to 4).  Returns 0 and sets errno to
 * ERANGE when the point is zero, lies in the surrogate range 0xD800..0xDFFF,
 * or is above 0x10FFFF; dest is not written in that case.  errno is left
 * untouched on success, and no terminator is appended.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
{
	/* Reject everything that has no UTF-8 representation up front. */
	if (point == 0
	    || (point >= 0xD800 && point <= 0xDFFF)
	    || point > 0x10FFFF) {
		errno = ERANGE;
		return 0;
	}

	if (point < 0x80) {
		/* 0xxxxxxx */
		dest[0] = (char)point;
		return 1;
	}

	if (point < 0x800) {
		/* 110xxxxx 10xxxxxx */
		dest[0] = (char)(0xC0 | (point >> 6));
		dest[1] = (char)(0x80 | (point & 0x3F));
		return 2;
	}

	if (point < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		dest[0] = (char)(0xE0 | (point >> 12));
		dest[1] = (char)(0x80 | ((point >> 6) & 0x3F));
		dest[2] = (char)(0x80 | (point & 0x3F));
		return 3;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	dest[0] = (char)(0xF0 | (point >> 18));
	dest[1] = (char)(0x80 | ((point >> 12) & 0x3F));
	dest[2] = (char)(0x80 | ((point >> 6) & 0x3F));
	dest[3] = (char)(0x80 | (point & 0x3F));
	return 4;
}
|
|
|
|
|
|
|
|
/* Check for valid UTF-8 */
|
|
|
|
bool utf8_check(const void *vbuf, size_t buflen)
|
|
|
|
{
|
|
|
|
const unsigned char *buf = vbuf;
|
|
|
|
struct utf8_state utf8_state = UTF8_STATE_INIT;
|
|
|
|
bool need_more = false;
|
|
|
|
|
|
|
|
for (size_t i = 0; i < buflen; i++) {
|
|
|
|
if (!utf8_decode(&utf8_state, buf[i])) {
|
|
|
|
need_more = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
need_more = false;
|
|
|
|
if (errno != 0)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return !need_more;
|
|
|
|
}
|
|
|
|
|