/* damus/damus-c/utf8.c */

/* MIT (BSD) license - see LICENSE file for details - taken from ccan. thanks rusty! */

#include "utf8.h"
#include <errno.h>
#include <stdlib.h>

/* I loved this table, so I stole it: */
/*
 * Copyright (c) 2017 Christian Hansen <chansen@cpan.org>
 * <https://github.com/chansen/c-utf8-valid>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 *    UTF-8 Encoding Form
 *
 *    U+0000..U+007F       0xxxxxxx                <= 7 bits
 *    U+0080..U+07FF       110xxxxx 10xxxxxx            <= 11 bits
 *    U+0800..U+FFFF       1110xxxx 10xxxxxx 10xxxxxx        <= 16 bits
 *   U+10000..U+10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx    <= 21 bits
 *
 *
 *    U+0000..U+007F       00..7F
 *                      N  C0..C1  80..BF                   1100000x 10xxxxxx
 *    U+0080..U+07FF       C2..DF  80..BF
 *                      N  E0      80..9F  80..BF           11100000 100xxxxx
 *    U+0800..U+0FFF       E0      A0..BF  80..BF
 *    U+1000..U+CFFF       E1..EC  80..BF  80..BF
 *    U+D000..U+D7FF       ED      80..9F  80..BF
 *                      S  ED      A0..BF  80..BF           11101101 101xxxxx
 *    U+E000..U+FFFF       EE..EF  80..BF  80..BF
 *                      N  F0      80..8F  80..BF  80..BF   11110000 1000xxxx
 *   U+10000..U+3FFFF      F0      90..BF  80..BF  80..BF
 *   U+40000..U+FFFFF      F1..F3  80..BF  80..BF  80..BF
 *  U+100000..U+10FFFF     F4      80..8F  80..BF  80..BF   11110100 1000xxxx
 *
 *  Legend:
 *    N = Non-shortest form
 *    S = Surrogates
 */
bool utf8_decode(struct utf8_state *utf8_state, char c)
{
    if (utf8_state->used_len == utf8_state->total_len) {
        utf8_state->used_len = 1;
        /* First character in sequence. */
        if (((unsigned char)c & 0x80) == 0) {
            /* ASCII, easy. */
            if (c == 0)
                goto bad_encoding;
            utf8_state->total_len = 1;
            utf8_state->c = c;
            goto finished_decoding;
        } else if (((unsigned char)c & 0xE0) == 0xC0) {
            utf8_state->total_len = 2;
            utf8_state->c = ((unsigned char)c & 0x1F);
            return false;
        } else if (((unsigned char)c & 0xF0) == 0xE0) {
            utf8_state->total_len = 3;
            utf8_state->c = ((unsigned char)c & 0x0F);
            return false;
        } else if (((unsigned char)c & 0xF8) == 0xF0) {
            utf8_state->total_len = 4;
            utf8_state->c = ((unsigned char)c & 0x07);
            return false;
        }
        goto bad_encoding;
    }

    if (((unsigned char)c & 0xC0) != 0x80)
        goto bad_encoding;

    utf8_state->c <<= 6;
    utf8_state->c |= ((unsigned char)c & 0x3F);

    utf8_state->used_len++;
    if (utf8_state->used_len == utf8_state->total_len)
        goto finished_decoding;
    return false;

finished_decoding:
    if (utf8_state->c == 0 || utf8_state->c > 0x10FFFF)
        errno = ERANGE;
    /* The UTF-16 "surrogate range": illegal in UTF-8 */
    else if (utf8_state->total_len == 3
         && (utf8_state->c & 0xFFFFF800) == 0x0000D800)
        errno = ERANGE;
    else {
        int min_bits;
        switch (utf8_state->total_len) {
        case 1:
            min_bits = 0;
            break;
        case 2:
            min_bits = 7;
            break;
        case 3:
            min_bits = 11;
            break;
        case 4:
            min_bits = 16;
            break;
        default:
            abort();
        }
        if ((utf8_state->c >> min_bits) == 0)
            errno = EFBIG;
        else
            errno = 0;
    }
    return true;

bad_encoding:
    utf8_state->total_len = utf8_state->used_len;
    errno = EINVAL;
    return true;
}

/*
 * Encode a single code point as UTF-8 into dest.
 *
 * Returns the number of bytes written (1 to 4); no terminator is appended.
 * For code point zero, the surrogate range U+D800..U+DFFF, and anything above
 * U+10FFFF it returns 0 with errno set to ERANGE and leaves dest untouched.
 * errno is not modified on success.
 */
size_t utf8_encode(uint32_t point, char dest[UTF8_MAX_LEN])
{
    size_t len;
    uint32_t lead;

    if (point == 0
        || (point >= 0xD800 && point <= 0xDFFF)
        || point > 0x10FFFF) {
        errno = ERANGE;
        return 0;
    }

    /* Choose the sequence length and the marker bits of the first byte. */
    if (point < 0x80) {
        len = 1;        /* 0xxxxxxx */
        lead = 0x00;
    } else if (point < 0x800) {
        len = 2;        /* 110xxxxx 10xxxxxx */
        lead = 0xC0;
    } else if (point < 0x10000) {
        len = 3;        /* 1110xxxx 10xxxxxx 10xxxxxx */
        lead = 0xE0;
    } else {
        len = 4;        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        lead = 0xF0;
    }

    /* Fill continuation bytes from the tail, six payload bits at a time. */
    for (size_t i = len - 1; i > 0; i--) {
        dest[i] = 0x80 | (point & 0x3F);
        point >>= 6;
    }

    /* Whatever bits remain belong in the lead byte. */
    dest[0] = lead | point;
    return len;
}

/* Check for valid UTF-8 */
bool utf8_check(const void *vbuf, size_t buflen)
{
    const unsigned char *buf = vbuf;
    struct utf8_state utf8_state = UTF8_STATE_INIT;
    bool need_more = false;

    for (size_t i = 0; i < buflen; i++) {
        if (!utf8_decode(&utf8_state, buf[i])) {
            need_more = true;
            continue;
        }
        need_more = false;
        if (errno != 0)
            return false;
    }
    return !need_more;
}