Rusty Russell
7 years ago
committed by
Christian Decker
11 changed files with 624 additions and 5 deletions
@ -1,3 +1,3 @@ |
|||
CCAN imported from http://ccodearchive.net. |
|||
|
|||
CCAN version: init-2432-gd830ca0e |
|||
CCAN version: init-2434-gac8694de |
|||
|
@ -0,0 +1 @@ |
|||
../../licenses/BSD-MIT |
@ -0,0 +1,48 @@ |
|||
#include "config.h" |
|||
#include <stdio.h> |
|||
#include <string.h> |
|||
|
|||
/** |
|||
* utf8 - Simple routines to encode/decode valid UTF-8. |
|||
* |
|||
* This code contains routines to encode and decode UTF-8 characters. |
|||
* Table and test code stolen entirely from: |
|||
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org> |
|||
* <https://github.com/chansen/c-utf8-valid> |
|||
* |
|||
* Example: |
|||
* int main(int argc, char *argv[]) |
|||
* { |
|||
* size_t i; |
|||
* struct utf8_state utf8_state = UTF8_STATE_INIT; |
|||
* bool decoded = true; |
|||
* |
|||
* for (i = 0; i < strlen(argv[1]); i++) { |
|||
* decoded = utf8_decode(&utf8_state, argv[1][i]); |
|||
* if (decoded) { |
|||
* if (errno != 0) |
|||
* err(1, "Invalid UTF8 char %zu-%zu", |
|||
* i - utf8_state.used_len, i); |
|||
* printf("Character %u\n", utf8_state.c); |
|||
* } |
|||
* } |
|||
* |
|||
* if (!decoded) |
|||
* errx(1, "Incomplete UTF8"); |
|||
* return 0; |
|||
* } |
|||
* |
|||
* License: BSD-MIT |
|||
*/ |
|||
int main(int argc, char *argv[]) |
|||
{ |
|||
/* Expect exactly one argument */ |
|||
if (argc != 2) |
|||
return 1; |
|||
|
|||
if (strcmp(argv[1], "depends") == 0) { |
|||
return 0; |
|||
} |
|||
|
|||
return 1; |
|||
} |
@ -0,0 +1,266 @@ |
|||
#include <ccan/utf8/utf8.h> |
|||
/* Include the C files directly. */ |
|||
#include <ccan/utf8/utf8.c> |
|||
#include <ccan/tap/tap.h> |
|||
#include <assert.h> |
|||
|
|||
/* Stolen from https://github.com/chansen/c-utf8-valid/blob/master/test.c */ |
|||
|
|||
/*
|
|||
* UTF-8 |
|||
* |
|||
* U+0000..U+007F 00..7F |
|||
* n C0..C1 80..BF |
|||
* U+0080..U+07FF C2..DF 80..BF |
|||
* n E0 80..9F 80..BF |
|||
* U+0800..U+D7FF E0..ED A0..9F 80..BF |
|||
* U+D800..U+DFFF s ED A0..BF 80..BF |
|||
* U+E000..U+FFFF EE..EF 80..BF 80..BF |
|||
* n F0 80..8F 80..BF 80..BF |
|||
* U+0800..U+FFFF F0 80..8F A0..BF 80..BF |
|||
* U+10000..U+10FFFF F0..F4 90..8F 80..BF 80..BF |
|||
* |
|||
* U-110000..U-1FFFFF x F4..F7 90..BF 80..BF 80..BF |
|||
* xn F8 80..87 80..BF 80..BF 80..BF |
|||
* U-200000..U-3FFFFFF x F8..FB 88..BF 80..BF 80..BF 80..BF |
|||
* xn FC 80..83 80..BF 80..BF 80..BF 80..BF |
|||
* U-4000000..U-7FFFFFFF x FC..FD 84..BF 80..BF 80..BF 80..BF 80..BF |
|||
* |
|||
* Legend: |
|||
* n = Non-shortest form |
|||
* s = Surrogates |
|||
* x = Codepoints outside Unicode codespace |
|||
*/ |
|||
|
|||
/*
|
|||
* Encodes the given ordinal [0, 7FFFFFFF] using the UTF-8 encoding scheme |
|||
* to the given sequence length [1, 6]. This routine can be used to |
|||
* produce well-formed and ill-formed UTF-8. |
|||
* |
|||
* To encode a Unicode scalar value to a well-formed representation: |
|||
* |
|||
* [U+0000, U+007F] should be encoded to a sequence length of 1 |
|||
* [U+0080, U+07FF] should be encoded to a sequence length of 2 |
|||
* [U+0800, U+D7FF] should be encoded to a sequence length of 3 |
|||
* [U+E000, U+FFFF] should be encoded to a sequence length of 3 |
|||
* [U+10000, U+10FFFF] should be encoded to a sequence length of 4 |
|||
* |
|||
* To encode a Unicode scalar value to non-shortest form representation: |
|||
* |
|||
* [U+0000, U+007F] can be encoded to a sequence length of [2, 6] |
|||
* [U+0080, U+07FF] can be encoded to a sequence length of [3, 6] |
|||
* [U+0800, U+FFFF] can be encoded to a sequence length of [4, 6] |
|||
* |
|||
* To encode an ordinal outside of Unicode codespace: |
|||
* |
|||
* [110000, 1FFFFF] can be encoded to a sequence length of 4 |
|||
* [200000, 3FFFFFF] can be encoded to a sequence length of 5 |
|||
* [4000000, 7FFFFFFF] can be encoded to a sequence length of 6 |
|||
*/ |
|||
|
|||
static char * |
|||
encode_ord(uint32_t ord, size_t len, char *dst) { |
|||
static const uint32_t kMask[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
|||
static const uint32_t kMax[6] = { 1 << 7, 1 << 11, 1 << 16, |
|||
1 << 21, 1 << 26, 1 << 31 }; |
|||
size_t i; |
|||
|
|||
assert(len >= 1); |
|||
assert(len <= 6); |
|||
assert(ord < kMax[len - 1]); |
|||
|
|||
for (i = len - 1; i > 0; i--) { |
|||
dst[i] = (ord & 0x3F) | 0x80; |
|||
ord >>= 6; |
|||
} |
|||
dst[0] = ord | kMask[len - 1]; |
|||
return dst; |
|||
} |
|||
|
|||
static int utf8_check(const char *src, size_t len) |
|||
{ |
|||
bool decoded = false; |
|||
struct utf8_state utf8_state = UTF8_STATE_INIT; |
|||
size_t i; |
|||
|
|||
for (i = 0; i < len; i++) { |
|||
decoded = utf8_decode(&utf8_state, src[i]); |
|||
if (decoded) { |
|||
if (errno != 0) |
|||
return errno; |
|||
} |
|||
} |
|||
if (!decoded) |
|||
return EMLINK; |
|||
return 0; |
|||
} |
|||
|
|||
static void |
|||
test_utf8(const char *src, size_t len, int exp_err, unsigned line) { |
|||
int got_err; |
|||
|
|||
assert(len <= 255); |
|||
|
|||
got_err = utf8_check(src, len); |
|||
|
|||
ok(got_err == exp_err, "Got result %i, expected %i at line %u", |
|||
got_err, exp_err, line); |
|||
} |
|||
|
|||
#define TEST_UTF8(src, len, exp) \ |
|||
test_utf8(src, len, exp, __LINE__) |
|||
|
|||
|
|||
static void |
|||
test_unicode_scalar_value(void) { |
|||
uint32_t ord; |
|||
char src[4]; |
|||
|
|||
/* Unicode scalar value [U+0000, U+007F] */ |
|||
for (ord = 0x0000; ord <= 0x007F; ord++) { |
|||
encode_ord(ord, 1, src); |
|||
TEST_UTF8(src, 1, ord ? 0 : ERANGE); |
|||
} |
|||
|
|||
/*
|
|||
* Unicode scalar value [U+0080, U+07FF] |
|||
* The maximal subpart is the length of the truncated sequence |
|||
*/ |
|||
for (ord = 0x0080; ord <= 0x07FF; ord++) { |
|||
encode_ord(ord, 2, src); |
|||
TEST_UTF8(src, 2, 0); |
|||
} |
|||
|
|||
/*
|
|||
* Unicode scalar value [U+0800, U+D7FF] and [U+E000, U+FFFF] |
|||
* The maximal subpart is the length of the truncated sequence |
|||
*/ |
|||
for (ord = 0x0800; ord <= 0xFFFF && (ord & 0xF800) != 0xD800; ord++) { |
|||
encode_ord(ord, 3, src); |
|||
|
|||
TEST_UTF8(src, 3, 0); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
|
|||
/*
|
|||
* Unicode scalar value [U+10000, U+10FFF] |
|||
* The maximal subpart is the length of the truncated sequence |
|||
*/ |
|||
for (ord = 0x10000; ord <= 0x10FFFF; ord++) { |
|||
encode_ord(ord, 4, src); |
|||
|
|||
TEST_UTF8(src, 4, 0); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 3, EMLINK); |
|||
if ((ord % (1 << 12)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
} |
|||
|
|||
static void |
|||
test_non_shortest_form(void) { |
|||
uint32_t ord; |
|||
char src[4]; |
|||
|
|||
/*
|
|||
* Non-shortest form 2-byte sequence [U+0000, U+007F] |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0x0001; ord <= 0x007F; ord++) { |
|||
encode_ord(ord, 2, src); |
|||
TEST_UTF8(src, 2, EFBIG); |
|||
} |
|||
|
|||
/*
|
|||
* Non-shortest form 3-byte sequence [U+0000, U+07FF] |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0x0001; ord <= 0x07FF; ord++) { |
|||
encode_ord(ord, 3, src); |
|||
|
|||
TEST_UTF8(src, 3, EFBIG); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
|
|||
/*
|
|||
* Non-shortest form 4-byte sequence [U+0000, U+FFFF] |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0x0001; ord <= 0xFFFF; ord++) { |
|||
encode_ord(ord, 4, src); |
|||
|
|||
TEST_UTF8(src, 4, EFBIG); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 3, EMLINK); |
|||
if ((ord % (1 << 12)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
} |
|||
|
|||
static void |
|||
test_non_unicode(void) { |
|||
uint32_t ord; |
|||
char src[4]; |
|||
|
|||
/*
|
|||
* Code point outside Unicode codespace |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0x110000; ord <= 0x1FFFFF; ord++) { |
|||
encode_ord(ord, 4, src); |
|||
|
|||
TEST_UTF8(src, 4, ERANGE); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 3, EMLINK); |
|||
if ((ord % (1 << 12)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
} |
|||
|
|||
static void |
|||
test_surrogates(void) { |
|||
uint32_t ord; |
|||
char src[4]; |
|||
|
|||
/*
|
|||
* Surrogates [U+D800, U+DFFF] |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0xD800; ord <= 0xDFFF; ord++) { |
|||
encode_ord(ord, 3, src); |
|||
|
|||
TEST_UTF8(src, 3, ERANGE); |
|||
if ((ord % (1 << 6)) == 0) |
|||
TEST_UTF8(src, 2, EMLINK); |
|||
} |
|||
} |
|||
|
|||
static void |
|||
test_continuations(void) { |
|||
uint8_t ord; |
|||
char src[4]; |
|||
|
|||
/*
|
|||
* Missplaced continuation [\x80, \xBF] |
|||
* The maximal subpart is 1-byte |
|||
*/ |
|||
for (ord = 0x80; ord <= 0xBF; ord++) { |
|||
src[0] = ord; |
|||
TEST_UTF8(src, 1, EINVAL); |
|||
} |
|||
} |
|||
|
|||
int |
|||
main(int argc, char **argv) |
|||
{ |
|||
plan_tests(2190906); |
|||
test_unicode_scalar_value(); |
|||
test_surrogates(); |
|||
test_non_shortest_form(); |
|||
test_non_unicode(); |
|||
test_continuations(); |
|||
|
|||
return exit_status(); |
|||
} |
@ -0,0 +1,42 @@ |
|||
#include <ccan/utf8/utf8.h> |
|||
/* Include the C files directly. */ |
|||
#include <ccan/utf8/utf8.c> |
|||
#include <ccan/tap/tap.h> |
|||
#include <assert.h> |
|||
|
|||
static bool utf8_check(const char *src, size_t len) |
|||
{ |
|||
bool decoded = false; |
|||
struct utf8_state utf8_state = UTF8_STATE_INIT; |
|||
size_t i; |
|||
|
|||
for (i = 0; i < len; i++) { |
|||
decoded = utf8_decode(&utf8_state, src[i]); |
|||
if (decoded) { |
|||
if (errno != 0) |
|||
return false; |
|||
} |
|||
} |
|||
if (!decoded) |
|||
return false; |
|||
return true; |
|||
} |
|||
|
|||
int main(int argc, char **argv) |
|||
{ |
|||
int i; |
|||
char dest[UTF8_MAX_LEN]; |
|||
|
|||
plan_tests(0x10FFFF - (0xDFFF - 0xD7FF + 2)); |
|||
|
|||
for (i = 1; i < 0x10FFFF; i++) { |
|||
int len; |
|||
if (i >= 0xD7FF && i <= 0xDFFF) |
|||
continue; |
|||
len = utf8_encode(i, dest); |
|||
assert(len != 0); |
|||
ok1(utf8_check(dest, len)); |
|||
} |
|||
|
|||
return exit_status(); |
|||
} |
@ -0,0 +1,30 @@ |
|||
#include <ccan/utf8/utf8.h> |
|||
/* Include the C files directly. */ |
|||
#include <ccan/utf8/utf8.c> |
|||
#include <ccan/tap/tap.h> |
|||
|
|||
int main(int argc, char **argv) |
|||
{ |
|||
int i; |
|||
char dest[UTF8_MAX_LEN]; |
|||
|
|||
plan_tests(1 + 0x10FFFF + 1); |
|||
|
|||
for (i = 0; i < 1; i++) |
|||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); |
|||
for (; i <= 0x7F; i++) |
|||
ok1(utf8_encode(i, dest) == 1); |
|||
for (; i <= 0x7FF; i++) |
|||
ok1(utf8_encode(i, dest) == 2); |
|||
for (; i <= 0xD7FF; i++) |
|||
ok1(utf8_encode(i, dest) == 3); |
|||
for (; i <= 0xDFFF; i++) |
|||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); |
|||
for (; i <= 0xFFFF; i++) |
|||
ok1(utf8_encode(i, dest) == 3); |
|||
for (; i <= 0x10FFFF; i++) |
|||
ok1(utf8_encode(i, dest) == 4); |
|||
ok1(utf8_encode(i, dest) == 0 && errno == ERANGE); |
|||
|
|||
return exit_status(); |
|||
} |
@ -0,0 +1,176 @@ |
|||
/* MIT (BSD) license - see LICENSE file for details */ |
|||
#include <ccan/utf8/utf8.h> |
|||
#include <errno.h> |
|||
#include <stdlib.h> |
|||
|
|||
/* I loved this table, so I stole it: */ |
|||
/*
|
|||
* Copyright (c) 2017 Christian Hansen <chansen@cpan.org> |
|||
* <https://github.com/chansen/c-utf8-valid>
|
|||
* All rights reserved. |
|||
* |
|||
* Redistribution and use in source and binary forms, with or without |
|||
* modification, are permitted provided that the following conditions are met: |
|||
* |
|||
* 1. Redistributions of source code must retain the above copyright notice, this |
|||
* list of conditions and the following disclaimer. |
|||
* 2. Redistributions in binary form must reproduce the above copyright notice, |
|||
* this list of conditions and the following disclaimer in the documentation |
|||
* and/or other materials provided with the distribution. |
|||
* |
|||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
|||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
|||
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
|||
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR |
|||
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
|||
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
|||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
|||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|||
*/ |
|||
/*
|
|||
* UTF-8 Encoding Form |
|||
* |
|||
* U+0000..U+007F 0xxxxxxx <= 7 bits |
|||
* U+0080..U+07FF 110xxxxx 10xxxxxx <= 11 bits |
|||
* U+0800..U+FFFF 1110xxxx 10xxxxxx 10xxxxxx <= 16 bits |
|||
* U+10000..U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <= 21 bits |
|||
* |
|||
* |
|||
* U+0000..U+007F 00..7F |
|||
* N C0..C1 80..BF 1100000x 10xxxxxx |
|||
* U+0080..U+07FF C2..DF 80..BF |
|||
* N E0 80..9F 80..BF 11100000 100xxxxx |
|||
* U+0800..U+0FFF E0 A0..BF 80..BF |
|||
* U+1000..U+CFFF E1..EC 80..BF 80..BF |
|||
* U+D000..U+D7FF ED 80..9F 80..BF |
|||
* S ED A0..BF 80..BF 11101101 101xxxxx |
|||
* U+E000..U+FFFF EE..EF 80..BF 80..BF |
|||
* N F0 80..8F 80..BF 80..BF 11110000 1000xxxx |
|||
* U+10000..U+3FFFF F0 90..BF 80..BF 80..BF |
|||
* U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF |
|||
* U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 11110100 1000xxxx |
|||
* |
|||
* Legend: |
|||
* N = Non-shortest form |
|||
* S = Surrogates |
|||
*/ |
|||
bool utf8_decode(struct utf8_state *utf8_state, char c) |
|||
{ |
|||
if (utf8_state->used_len == utf8_state->total_len) { |
|||
utf8_state->used_len = 1; |
|||
/* First character in sequence. */ |
|||
if (((unsigned char)c & 0x80) == 0) { |
|||
/* ASCII, easy. */ |
|||
utf8_state->total_len = 1; |
|||
utf8_state->c = c; |
|||
goto finished_decoding; |
|||
} else if (((unsigned char)c & 0xE0) == 0xC0) { |
|||
utf8_state->total_len = 2; |
|||
utf8_state->c = ((unsigned char)c & 0x1F); |
|||
return false; |
|||
} else if (((unsigned char)c & 0xF0) == 0xE0) { |
|||
utf8_state->total_len = 3; |
|||
utf8_state->c = ((unsigned char)c & 0x0F); |
|||
return false; |
|||
} else if (((unsigned char)c & 0xF8) == 0xF0) { |
|||
utf8_state->total_len = 4; |
|||
utf8_state->c = ((unsigned char)c & 0x07); |
|||
return false; |
|||
} |
|||
goto bad_encoding; |
|||
} |
|||
|
|||
if (((unsigned char)c & 0xC0) != 0x80) |
|||
goto bad_encoding; |
|||
|
|||
utf8_state->c <<= 6; |
|||
utf8_state->c |= ((unsigned char)c & 0x3F); |
|||
|
|||
utf8_state->used_len++; |
|||
if (utf8_state->used_len == utf8_state->total_len) |
|||
goto finished_decoding; |
|||
return false; |
|||
|
|||
finished_decoding: |
|||
if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF) |
|||
errno = ERANGE; |
|||
/* The UTF-16 "surrogate range": illegal in UTF-8 */ |
|||
else if (utf8_state->total_len == 3 |
|||
&& (utf8_state->c & 0xFFFFF800) == 0x0000D800) |
|||
errno = ERANGE; |
|||
else { |
|||
int min_bits; |
|||
switch (utf8_state->total_len) { |
|||
case 1: |
|||
min_bits = 0; |
|||
break; |
|||
case 2: |
|||
min_bits = 7; |
|||
break; |
|||
case 3: |
|||
min_bits = 11; |
|||
break; |
|||
case 4: |
|||
min_bits = 16; |
|||
break; |
|||
default: |
|||
abort(); |
|||
} |
|||
if ((utf8_state->c >> min_bits) == 0) |
|||
errno = EFBIG; |
|||
else |
|||
errno = 0; |
|||
} |
|||
return true; |
|||
|
|||
bad_encoding: |
|||
utf8_state->total_len = utf8_state->used_len; |
|||
errno = EINVAL; |
|||
return true; |
|||
} |
|||
|
|||
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]) |
|||
{ |
|||
if ((point >> 7) == 0) { |
|||
if (point == 0) { |
|||
errno = ERANGE; |
|||
return 0; |
|||
} |
|||
/* 0xxxxxxx */ |
|||
dest[0] = point; |
|||
return 1; |
|||
} |
|||
|
|||
if ((point >> 11) == 0) { |
|||
/* 110xxxxx 10xxxxxx */ |
|||
dest[1] = 0x80 | (point & 0x3F); |
|||
dest[0] = 0xC0 | (point >> 6); |
|||
return 2; |
|||
} |
|||
|
|||
if ((point >> 16) == 0) { |
|||
if (point >= 0xD800 && point <= 0xDFFF) { |
|||
errno = ERANGE; |
|||
return 0; |
|||
} |
|||
/* 1110xxxx 10xxxxxx 10xxxxxx */ |
|||
dest[2] = 0x80 | (point & 0x3F); |
|||
dest[1] = 0x80 | ((point >> 6) & 0x3F); |
|||
dest[0] = 0xE0 | (point >> 12); |
|||
return 3; |
|||
} |
|||
|
|||
if (point > 0x10FFFF) { |
|||
errno = ERANGE; |
|||
return 0; |
|||
} |
|||
|
|||
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|||
dest[3] = 0x80 | (point & 0x3F); |
|||
dest[2] = 0x80 | ((point >> 6) & 0x3F); |
|||
dest[1] = 0x80 | ((point >> 12) & 0x3F); |
|||
dest[0] = 0xF0 | (point >> 18); |
|||
return 4; |
|||
} |
@ -0,0 +1,54 @@ |
|||
/* MIT (BSD) license - see LICENSE file for details */ |
|||
#ifndef CCAN_UTF8_H |
|||
#define CCAN_UTF8_H |
|||
#include <inttypes.h> |
|||
#include <stdbool.h> |
|||
#include <string.h> |
|||
|
|||
/* Unicode is limited to 21 bits. */ |
|||
#define UTF8_MAX_LEN 4 |
|||
|
|||
struct utf8_state { |
|||
/* How many characters we are expecting as part of this Unicode point */ |
|||
uint16_t total_len; |
|||
/* How many characters we've already seen. */ |
|||
uint16_t used_len; |
|||
/* Compound character, aka Unicode point. */ |
|||
uint32_t c; |
|||
}; |
|||
|
|||
#define UTF8_STATE_INIT { 0, 0, 0 } |
|||
|
|||
static inline void utf8_state_init(struct utf8_state *utf8_state) |
|||
{ |
|||
memset(utf8_state, 0, sizeof(*utf8_state)); |
|||
} |
|||
|
|||
/**
|
|||
* utf8_decode - continue UTF8 decoding with this character. |
|||
* @utf8_state - initialized UTF8 state. |
|||
* @c - the character. |
|||
* |
|||
* Returns false if it needs another character to give results. |
|||
* Otherwise returns true, @utf8_state can be reused without initializeation, |
|||
* and sets errno: |
|||
* 0: success |
|||
* EINVAL: bad encoding. |
|||
* EFBIG: not a minimal encoding. |
|||
* ERANGE: encoding of invalid character. |
|||
* |
|||
* You can extract the character from @utf8_state->c; @utf8_state->used_len |
|||
* indicates how many characters have been consumed. |
|||
*/ |
|||
bool utf8_decode(struct utf8_state *utf8_state, char c); |
|||
|
|||
/**
|
|||
* utf8_encode - encode a point into UTF8. |
|||
* @point - Unicode point to include. |
|||
* @dest - buffer to fill. |
|||
* |
|||
* Returns 0 if point was invalid, otherwise bytes of dest used. |
|||
* Sets errno to ERANGE if point was invalid. |
|||
*/ |
|||
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN]); |
|||
#endif /* CCAN_UTF8_H */ |
Loading…
Reference in new issue