/* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */
#include "utf8.h"
#include <errno.h>
#include <stdlib.h>

/* I loved this table, so I stole it: */

/*
 * Copyright (c) 2017 Christian Hansen
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 *    UTF-8 Encoding Form
 *
 *    U+0000..U+007F       0xxxxxxx                             <= 7 bits
 *    U+0080..U+07FF       110xxxxx 10xxxxxx                    <= 11 bits
 *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx           <= 16 bits
 *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  <= 21 bits
 *
 *
 *    U+0000..U+007F       00..7F
 *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
 *    U+0080..U+07FF       C2..DF  80..BF
 *                      N  E0      80..9F  80..BF           11100000 100xxxxx
 *    U+0800..U+0FFF       E0      A0..BF  80..BF
 *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
 *    U+D000..U+D7FF       ED      80..9F  80..BF
 *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
 *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
 *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
 *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
 *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
 *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
 *
 *  Legend:
 *    N = Non-shortest form
 *    S = Surrogates
 */

/*
 * utf8_decode - feed one byte into an incremental UTF-8 decoder.
 * @utf8_state: decoder state carried between calls.
 * @c: the next input byte.
 *
 * Returns false while the current sequence still needs more bytes.
 * Returns true once a sequence has ended, with errno describing the outcome:
 *   0       a valid code point is available in utf8_state->c
 *   EINVAL  malformed input: a NUL byte, an invalid lead byte, or a byte
 *           that is not a continuation byte where one was required
 *   ERANGE  decoded value is 0, above 0x10FFFF, or a UTF-16 surrogate
 *   EFBIG   non-shortest ("overlong") encoding
 *
 * After any true return the state is ready for the start of a new sequence.
 */
bool utf8_decode(struct utf8_state *utf8_state, char c)
{
	const unsigned char byte = (unsigned char)c;

	if (utf8_state->used_len == utf8_state->total_len) {
		utf8_state->used_len = 1;
		/* First character in sequence. */
		if ((byte & 0x80) == 0) {
			/* ASCII, easy. */
			if (byte == 0)
				goto bad_encoding;
			utf8_state->total_len = 1;
			utf8_state->c = byte;
			goto finished_decoding;
		} else if ((byte & 0xE0) == 0xC0) {
			/* 110xxxxx: lead byte of a two-byte sequence. */
			utf8_state->total_len = 2;
			utf8_state->c = (byte & 0x1F);
			return false;
		} else if ((byte & 0xF0) == 0xE0) {
			/* 1110xxxx: lead byte of a three-byte sequence. */
			utf8_state->total_len = 3;
			utf8_state->c = (byte & 0x0F);
			return false;
		} else if ((byte & 0xF8) == 0xF0) {
			/* 11110xxx: lead byte of a four-byte sequence. */
			utf8_state->total_len = 4;
			utf8_state->c = (byte & 0x07);
			return false;
		}
		/* Stray continuation byte or 11111xxx. */
		goto bad_encoding;
	}

	/* Mid-sequence: only 10xxxxxx continuation bytes are acceptable. */
	if ((byte & 0xC0) != 0x80)
		goto bad_encoding;

	/* Append the six payload bits. */
	utf8_state->c <<= 6;
	utf8_state->c |= (byte & 0x3F);

	utf8_state->used_len++;
	if (utf8_state->used_len == utf8_state->total_len)
		goto finished_decoding;
	return false;

finished_decoding:
	if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF) {
		errno = ERANGE;
	} else if (utf8_state->total_len == 3
		   && (utf8_state->c & 0xFFFFF800) == 0x0000D800) {
		/* The UTF-16 "surrogate range": illegal in UTF-8 */
		errno = ERANGE;
	} else {
		/*
		 * A sequence of total_len bytes must carry a value that needs
		 * more than min_bits bits; otherwise a shorter encoding exists
		 * and this one is a non-shortest form.
		 */
		int min_bits;

		switch (utf8_state->total_len) {
		case 1:
			min_bits = 0;
			break;
		case 2:
			min_bits = 7;
			break;
		case 3:
			min_bits = 11;
			break;
		case 4:
			min_bits = 16;
			break;
		default:
			/* total_len is only ever set to 1..4 on this path. */
			abort();
		}
		if ((utf8_state->c >> min_bits) == 0)
			errno = EFBIG;
		else
			errno = 0;
	}
	return true;

bad_encoding:
	/* Make used_len == total_len so the next byte starts a new sequence. */
	utf8_state->total_len = utf8_state->used_len;
	errno = EINVAL;
	return true;
}

/*
 * utf8_encode - encode a single code point as UTF-8.
 * @point: the code point to encode.
 * @dest: output buffer of at least UTF8_MAX_LEN bytes; no terminator is
 *        written.
 *
 * Returns the number of bytes stored in @dest (1 to 4).  Returns 0 and sets
 * errno to ERANGE if @point is 0, lies in the surrogate range
 * 0xD800..0xDFFF, or exceeds 0x10FFFF.  errno is not touched on success.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
{
	if ((point >> 7) == 0) {
		if (point == 0) {
			errno = ERANGE;
			return 0;
		}
		/* 0xxxxxxx */
		dest[0] = point;
		return 1;
	}

	if ((point >> 11) == 0) {
		/* 110xxxxx 10xxxxxx */
		dest[1] = 0x80 | (point & 0x3F);
		dest[0] = 0xC0 | (point >> 6);
		return 2;
	}

	if ((point >> 16) == 0) {
		if (point >= 0xD800 && point <= 0xDFFF) {
			errno = ERANGE;
			return 0;
		}
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		dest[2] = 0x80 | (point & 0x3F);
		dest[1] = 0x80 | ((point >> 6) & 0x3F);
		dest[0] = 0xE0 | (point >> 12);
		return 3;
	}

	if (point > 0x10FFFF) {
		errno = ERANGE;
		return 0;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	dest[3] = 0x80 | (point & 0x3F);
	dest[2] = 0x80 | ((point >> 6) & 0x3F);
	dest[1] = 0x80 | ((point >> 12) & 0x3F);
	dest[0] = 0xF0 | (point >> 18);
	return 4;
}

/*
 * utf8_check - check for valid UTF-8.
 * @vbuf: bytes to validate.
 * @buflen: number of bytes in @vbuf.
 *
 * Returns true if every sequence in the buffer decodes with errno == 0 and
 * the buffer does not end part-way through a multi-byte sequence.  An empty
 * buffer is valid.  errno is modified as a side effect of decoding.
 */
bool utf8_check(const void *vbuf, size_t buflen)
{
	/*
	 * Read the bytes as plain char: utf8_decode() takes a char and casts
	 * to unsigned char itself, so this avoids an implementation-defined
	 * unsigned-to-signed conversion for bytes above CHAR_MAX.
	 */
	const char *buf = vbuf;
	struct utf8_state utf8_state = UTF8_STATE_INIT;
	bool need_more = false;

	for (size_t i = 0; i < buflen; i++) {
		if (!utf8_decode(&utf8_state, buf[i])) {
			/* Sequence still open; invalid if the input stops here. */
			need_more = true;
			continue;
		}
		need_more = false;
		if (errno != 0)
			return false;
	}
	return !need_more;
}