unicode.c - cscg24-guacamole - CSCG 2024 Challenge 'Guacamole Mashup'

	cscg24-guacamole CSCG 2024 Challenge 'Guacamole Mashup'
	git clone https://git.sinitax.com/sinitax/cscg24-guacamole
	Log \| Files \| Refs \| sfeed.txt
unicode.c (4110B)
      1/*
      2 * Licensed to the Apache Software Foundation (ASF) under one
      3 * or more contributor license agreements.  See the NOTICE file
      4 * distributed with this work for additional information
      5 * regarding copyright ownership.  The ASF licenses this file
      6 * to you under the Apache License, Version 2.0 (the
      7 * "License"); you may not use this file except in compliance
      8 * with the License.  You may obtain a copy of the License at
      9 *
     10 *   http://www.apache.org/licenses/LICENSE-2.0
     11 *
     12 * Unless required by applicable law or agreed to in writing,
     13 * software distributed under the License is distributed on an
     14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
     15 * KIND, either express or implied.  See the License for the
     16 * specific language governing permissions and limitations
     17 * under the License.
     18 */
     19
     20#include "config.h"
     21
     22#include "guacamole/unicode.h"
     23
     24#include <stddef.h>
     25
     26size_t guac_utf8_charsize(unsigned char c) {
     27
     28    /* Determine size in bytes of character */
     29    if ((c | 0x7F) == 0x7F) return 1;
     30    if ((c | 0x1F) == 0xDF) return 2;
     31    if ((c | 0x0F) == 0xEF) return 3;
     32    if ((c | 0x07) == 0xF7) return 4;
     33
     34    /* Default to one character */
     35    return 1;
     36
     37}
     38
     39size_t guac_utf8_strlen(const char* str) {
     40
     41    /* The current length of the string */
     42    int length = 0;
     43
     44    /* Number of characters before start of next character */
     45    int skip = 0;
     46
     47    while (*str != 0) {
     48
     49        /* If skipping, then skip */
     50        if (skip > 0) skip--;
     51
     52        /* Otherwise, determine next skip value, and increment length */
     53        else {
     54
     55            /* Get next character */
     56            unsigned char c = (unsigned char) *str;
     57
     58            /* Determine skip value (size in bytes of rest of character) */
     59            skip = guac_utf8_charsize(c) - 1;
     60
     61            length++;
     62        }
     63
     64        str++;
     65    }
     66
     67    return length;
     68
     69}
     70
     71int guac_utf8_write(int codepoint, char* utf8, int length) {
     72
     73    int i;
     74    int mask, bytes;
     75
     76    /* If not even one byte, cannot write */
     77    if (length <= 0)
     78        return 0;
     79
     80    /* Determine size and initial byte mask */
     81    if (codepoint <= 0x007F) {
     82        mask  = 0x00;
     83        bytes = 1;
     84    }
     85    else if (codepoint <= 0x7FF) {
     86        mask  = 0xC0;
     87        bytes = 2;
     88    }
     89    else if (codepoint <= 0xFFFF) {
     90        mask  = 0xE0;
     91        bytes = 3;
     92    }
     93    else if (codepoint <= 0x1FFFFF) {
     94        mask  = 0xF0;
     95        bytes = 4;
     96    }
     97
     98    /* Otherwise, invalid codepoint */
     99    else {
    100        *(utf8++) = '?';
    101        return 1;
    102    }
    103
    104    /* If not enough room, don't write anything */
    105    if (bytes > length)
    106        return 0;
    107
    108    /* Offset buffer by size */
    109    utf8 += bytes - 1;
    110
    111    /* Add trailing bytes, if any */
    112    for (i=1; i<bytes; i++) {
    113        *(utf8--) = 0x80 | (codepoint & 0x3F);
    114        codepoint >>= 6;
    115    }
    116
    117    /* Set initial byte */
    118    *utf8 = mask | codepoint;
    119
    120    /* Done */
    121    return bytes;
    122
    123}
    124
    125int guac_utf8_read(const char* utf8, int length, int* codepoint) {
    126
    127    unsigned char initial;
    128    int bytes;
    129    int result;
    130    int i;
    131
    132    /* If not even one byte, cannot read */
    133    if (length <= 0)
    134        return 0;
    135
    136    /* Read initial byte */
    137    initial = (unsigned char) *(utf8++);
    138
    139    /* 0xxxxxxx */
    140    if ((initial | 0x7F) == 0x7F) {
    141        result = initial;
    142        bytes  = 1;
    143    }
    144
    145    /* 110xxxxx 10xxxxxx */
    146    else if ((initial | 0x1F) == 0xDF) {
    147        result = initial & 0x1F;
    148        bytes  = 2;
    149    }
    150
    151    /* 1110xxxx 10xxxxxx 10xxxxxx */
    152    else if ((initial | 0x0F) == 0xEF) {
    153        result = initial & 0x0F;
    154        bytes  = 3;
    155    }
    156
    157    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    158    else if ((initial | 0x07) == 0xF7) {
    159        result = initial & 0x07;
    160        bytes  = 4;
    161    }
    162
    163    /* Otherwise, invalid codepoint */
    164    else {
    165        *codepoint = 0xFFFD; /* Replacement character */
    166        return 1;
    167    }
    168
    169    /* If not enough room, don't read anything */
    170    if (bytes > length)
    171        return 0;
    172
    173    /* Read trailing bytes, if any */
    174    for (i=1; i<bytes; i++) {
    175        result <<= 6;
    176        result |= *(utf8++) & 0x3F;
    177    }
    178
    179    *codepoint = result;
    180    return bytes;
    181
    182}
    183