unicode.c (4110B)
1/* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 20#include "config.h" 21 22#include "guacamole/unicode.h" 23 24#include <stddef.h> 25 26size_t guac_utf8_charsize(unsigned char c) { 27 28 /* Determine size in bytes of character */ 29 if ((c | 0x7F) == 0x7F) return 1; 30 if ((c | 0x1F) == 0xDF) return 2; 31 if ((c | 0x0F) == 0xEF) return 3; 32 if ((c | 0x07) == 0xF7) return 4; 33 34 /* Default to one character */ 35 return 1; 36 37} 38 39size_t guac_utf8_strlen(const char* str) { 40 41 /* The current length of the string */ 42 int length = 0; 43 44 /* Number of characters before start of next character */ 45 int skip = 0; 46 47 while (*str != 0) { 48 49 /* If skipping, then skip */ 50 if (skip > 0) skip--; 51 52 /* Otherwise, determine next skip value, and increment length */ 53 else { 54 55 /* Get next character */ 56 unsigned char c = (unsigned char) *str; 57 58 /* Determine skip value (size in bytes of rest of character) */ 59 skip = guac_utf8_charsize(c) - 1; 60 61 length++; 62 } 63 64 str++; 65 } 66 67 return length; 68 69} 70 71int guac_utf8_write(int codepoint, char* utf8, int length) { 72 73 int i; 74 int mask, bytes; 75 76 /* If not even one byte, cannot write */ 77 if (length <= 0) 78 return 0; 79 80 /* Determine size and initial byte mask */ 81 if (codepoint <= 0x007F) { 82 mask = 0x00; 83 bytes = 1; 84 } 85 else if (codepoint <= 0x7FF) { 86 mask = 0xC0; 87 bytes = 2; 88 } 89 else if (codepoint <= 0xFFFF) { 90 mask = 0xE0; 91 bytes = 3; 92 } 93 else if (codepoint <= 0x1FFFFF) { 94 mask = 0xF0; 95 bytes = 4; 96 } 97 98 /* Otherwise, invalid codepoint */ 99 else { 100 *(utf8++) = '?'; 101 return 1; 102 } 103 104 /* If not enough room, don't write anything */ 105 if (bytes > length) 106 return 0; 107 108 /* Offset buffer by size */ 109 utf8 += bytes - 1; 110 111 /* Add trailing bytes, if any */ 112 for (i=1; i<bytes; i++) { 113 *(utf8--) = 0x80 | (codepoint & 0x3F); 114 codepoint >>= 6; 115 } 116 117 /* Set initial byte */ 118 *utf8 = mask | codepoint; 119 120 /* Done */ 121 return bytes; 122 123} 124 125int guac_utf8_read(const char* utf8, int length, int* codepoint) { 126 127 unsigned char initial; 128 int bytes; 129 int result; 130 int i; 131 132 /* If not even one byte, cannot read */ 133 if (length <= 0) 134 return 0; 135 136 /* Read initial byte */ 137 initial = (unsigned char) *(utf8++); 138 139 /* 0xxxxxxx */ 140 if ((initial | 0x7F) == 0x7F) { 141 result = initial; 142 bytes = 1; 143 } 144 145 /* 110xxxxx 10xxxxxx */ 146 else if ((initial | 0x1F) == 0xDF) { 147 result = initial & 0x1F; 148 bytes = 2; 149 } 150 151 /* 1110xxxx 10xxxxxx 10xxxxxx */ 152 else if ((initial | 0x0F) == 0xEF) { 153 result = initial & 0x0F; 154 bytes = 3; 155 } 156 157 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 158 else if ((initial | 0x07) == 0xF7) { 159 result = initial & 0x07; 160 bytes = 4; 161 } 162 163 /* Otherwise, invalid codepoint */ 164 else { 165 *codepoint = 0xFFFD; /* Replacement character */ 166 return 1; 167 } 168 169 /* If not enough room, don't read anything */ 170 if (bytes > length) 171 return 0; 172 173 /* Read trailing bytes, if any */ 174 for (i=1; i<bytes; i++) { 175 result <<= 6; 176 result |= *(utf8++) & 0x3F; 177 } 178 179 *codepoint = result; 180 return bytes; 181 182} 183