character.c (8932B)
1/* See LICENSE file for copyright and license details. */ 2#include <limits.h> 3#include <stdbool.h> 4#include <stddef.h> 5 6#include "../gen/character.h" 7#include "../grapheme.h" 8#include "util.h" 9 10struct character_break_state { 11 uint_least8_t prop; 12 bool prop_set; 13 bool gb11_flag; 14 bool gb12_13_flag; 15}; 16 17static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { 18 [CHAR_BREAK_PROP_OTHER] = 19 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 20 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 21 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 22 [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ 23 [CHAR_BREAK_PROP_EXTEND] = 24 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 25 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 26 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 27 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 28 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 29 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 30 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 31 [CHAR_BREAK_PROP_HANGUL_L] = 32 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ 33 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ 34 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ 35 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ 36 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 37 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 38 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 39 [CHAR_BREAK_PROP_HANGUL_V] = 40 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ 41 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ 42 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 43 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 44 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 45 [CHAR_BREAK_PROP_HANGUL_T] = 46 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ 47 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 48 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 49 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 50 [CHAR_BREAK_PROP_HANGUL_LV] = 51 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ 52 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ 53 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 54 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 55 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 56 [CHAR_BREAK_PROP_HANGUL_LVT] = 57 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ 58 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 59 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 60 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 61 [CHAR_BREAK_PROP_PREPEND] = 62 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 63 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 64 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ 65 (UINT16_C(0xFFFF) & 66 ~(UINT16_C(1) << CHAR_BREAK_PROP_CR | 67 UINT16_C(1) << CHAR_BREAK_PROP_LF | 68 UINT16_C(1) << CHAR_BREAK_PROP_CONTROL)), /* GB9b */ 69 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 70 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 71 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 72 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 73 [CHAR_BREAK_PROP_SPACINGMARK] = 74 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 75 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 76 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 77 [CHAR_BREAK_PROP_ZWJ] = 78 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ 79 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ 80 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 81}; 82static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 83 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = 84 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | 85 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, 86 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 87 UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 88 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = 89 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | 90 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ, 91 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = 92 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | 93 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, 94}; 95static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { 96 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = 97 UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, 98}; 99static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 100 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = 101 UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, 102}; 103static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { 104 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = 105 UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, 106}; 107 108static inline enum char_break_property 109get_break_prop(uint_least32_t cp) 110{ 111 if (likely(cp <= UINT32_C(0x10FFFF))) { 112 return (enum char_break_property) 113 char_break_minor[char_break_major[cp >> 8] + 114 (cp & 0xFF)]; 115 } else { 116 return CHAR_BREAK_PROP_OTHER; 117 } 118} 119 120static inline void 121state_serialize(const struct character_break_state *in, uint_least16_t *out) 122{ 123 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */ 124 (uint_least16_t)(((uint_least16_t)(in->prop_set)) 125 << 8) | /* 9th bit */ 126 (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) 127 << 9) | /* 10th bit */ 128 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) 129 << 10); /* 11th bit */ 130} 131 132static inline void 133state_deserialize(uint_least16_t in, struct character_break_state *out) 134{ 135 out->prop = in & UINT8_C(0xFF); 136 out->prop_set = in & (UINT16_C(1) << 8); 137 out->gb11_flag = in & (UINT16_C(1) << 9); 138 out->gb12_13_flag = in & (UINT16_C(1) << 10); 139} 140 141bool 142grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, 143 uint_least16_t *s) 144{ 145 struct character_break_state state; 146 enum char_break_property cp0_prop, cp1_prop; 147 bool notbreak = false; 148 149 if (likely(s)) { 150 state_deserialize(*s, &state); 151 152 if (likely(state.prop_set)) { 153 cp0_prop = state.prop; 154 } else { 155 cp0_prop = get_break_prop(cp0); 156 } 157 cp1_prop = get_break_prop(cp1); 158 159 /* preserve prop of right codepoint for next iteration */ 160 state.prop = (uint_least8_t)cp1_prop; 161 state.prop_set = true; 162 163 /* update flags */ 164 state.gb11_flag = 165 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * 166 state.gb11_flag] & 167 UINT16_C(1) << cp1_prop; 168 state.gb12_13_flag = 169 flag_update_gb12_13[cp0_prop + 170 NUM_CHAR_BREAK_PROPS * 171 state.gb12_13_flag] & 172 UINT16_C(1) << cp1_prop; 173 174 /* 175 * Apply grapheme cluster breaking algorithm (UAX #29), see 176 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 177 */ 178 notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || 179 (dont_break_gb11[cp0_prop + 180 state.gb11_flag * 181 NUM_CHAR_BREAK_PROPS] & 182 (UINT16_C(1) << cp1_prop)) || 183 (dont_break_gb12_13[cp0_prop + 184 state.gb12_13_flag * 185 NUM_CHAR_BREAK_PROPS] & 186 (UINT16_C(1) << cp1_prop)); 187 188 /* update or reset flags (when we have a break) */ 189 if (likely(!notbreak)) { 190 state.gb11_flag = state.gb12_13_flag = false; 191 } 192 193 state_serialize(&state, s); 194 } else { 195 cp0_prop = get_break_prop(cp0); 196 cp1_prop = get_break_prop(cp1); 197 198 /* 199 * Apply grapheme cluster breaking algorithm (UAX #29), see 200 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 201 * 202 * Given we have no state, this behaves as if the state-booleans 203 * were all set to false 204 */ 205 notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || 206 (dont_break_gb11[cp0_prop] & 207 (UINT16_C(1) << cp1_prop)) || 208 (dont_break_gb12_13[cp0_prop] & 209 (UINT16_C(1) << cp1_prop)); 210 } 211 212 return !notbreak; 213} 214 215static size_t 216next_character_break(HERODOTUS_READER *r) 217{ 218 uint_least16_t state = 0; 219 uint_least32_t cp0 = 0, cp1 = 0; 220 221 for (herodotus_read_codepoint(r, true, &cp0); 222 herodotus_read_codepoint(r, false, &cp1) == 223 HERODOTUS_STATUS_SUCCESS; 224 herodotus_read_codepoint(r, true, &cp0)) { 225 if (grapheme_is_character_break(cp0, cp1, &state)) { 226 break; 227 } 228 } 229 230 return herodotus_reader_number_read(r); 231} 232 233size_t 234grapheme_next_character_break(const uint_least32_t *str, size_t len) 235{ 236 HERODOTUS_READER r; 237 238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); 239 240 return next_character_break(&r); 241} 242 243size_t 244grapheme_next_character_break_utf8(const char *str, size_t len) 245{ 246 HERODOTUS_READER r; 247 248 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); 249 250 return next_character_break(&r); 251}