libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit f13f39a84bf411748a956f066743a18ec73ab7e6
parent dcf877fee95f52f4d60c8b0976f9ced82c608406
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri,  7 Jan 2022 17:41:48 +0100

Refactor data-table-generation, use enum-minor-table

For the special case where the property-struct only contains a single
element it's a bit redundant to store offsets in the minor-array which
only point into a property-struct. We might as well store the enums
themselves in the minor-array, saving us one table-lookup.

Given our enum is way below 255 entries, we can store those enums
uint_least8_t. Previously, the offset-array would at least be an
uint_least16_t, which was a small oversight on my behalf.

The end-goal is to have separate data-tables for the other
break-detection-algorithms (word, sentence, line). This is a stark
contrast to libutf8proc for instance, which has one data-table for
everything. However, preliminary experiments showed, of course, that the
larger the data-table gets, the longer the accesses take as the CPU is
more likely to offload the table from the cache.
Another factor is static linking: It's unlikely that you use all of the
algorithms at once. For a minimal overall data-overhead (just a few KB)
each object file in the library (character.o, etc.) will be
self-sufficient, allowing easy LTO for the linker.

The aforementioned change leads to a speedup of around 6% and
data-table-reduction of around 40% (!) to 35K, which easily fits in the
L1-cache. We are now around half an order of magnitude faster than
libutf8proc.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mgen/properties.c | 104++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Msrc/character.c | 186++++++++++++++++++++++++++++++++++++++++----------------------------------------
2 files changed, 167 insertions(+), 123 deletions(-)

diff --git a/gen/properties.c b/gen/properties.c @@ -12,7 +12,7 @@ #define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" struct properties { - uint_least8_t break_property; + uint_least8_t char_break_property; }; struct property_spec { @@ -21,7 +21,13 @@ struct property_spec { const char *ucdname; }; -static const struct property_spec break_property[] = { +struct property_payload { + struct properties *prop; + const struct property_spec *spec; + size_t speclen; +}; + +static const struct property_spec char_break_property[] = { { .enumname = "OTHER", .file = NULL, @@ -104,7 +110,7 @@ break_property_callback(char *file, char **field, size_t nfields, char *comment, void *payload) { /* prop always has the length 0x110000 */ - struct properties *prop = (struct properties *)payload; + struct property_payload *p = (struct property_payload *)payload; struct range r; size_t i; uint_least32_t cp; @@ -115,11 +121,11 @@ break_property_callback(char *file, char **field, size_t nfields, return 1; } - for (i = 0; i < LEN(break_property); i++) { + for (i = 0; i < p->speclen; i++) { /* identify fitting file and identifier */ - if (break_property[i].file && - !strcmp(break_property[i].file, file) && - !strcmp(break_property[i].ucdname, field[1])) { + if (p->spec[i].file && + !strcmp(p->spec[i].file, file) && + !strcmp(p->spec[i].ucdname, field[1])) { /* parse range in first field */ if (range_parse(field[0], &r)) { return 1; @@ -127,13 +133,18 @@ break_property_callback(char *file, char **field, size_t nfields, /* apply to all codepoints in the range */ for (cp = r.lower; cp <= r.upper; cp++) { - if (prop[cp].break_property != 0) { + if (p->spec == char_break_property) { + if (p->prop[cp].char_break_property != 0) { + fprintf(stderr, "break_property_callback: " + "Character break property overlap.\n"); + exit(1); + } + p->prop[cp].char_break_property = i; + } else { fprintf(stderr, "break_property_callback: " - "property overlap\n"); + "Unknown specification.\n"); exit(1); } - - prop[cp].break_property = i; } break; @@ -292,13 +303,54 @@ print_lookup_table(char *name, size_t *data, size_t datalen, size_t maxval) printf("};\n"); } +/* + * this function is for the special case where struct property + * only contains a single enum value. + */ +static void +print_enum_lookup_table(char *name, size_t *offset, size_t offsetlen, + const struct properties *prop, + const struct property_spec *spec, + const char *enumprefix) +{ + size_t i; + + printf("const uint8_t %s[] = {\n\t", name); + for (i = 0; i < offsetlen; i++) { + printf("%s_%s", enumprefix, + spec[*((uint_least8_t *)&(prop[offset[i]]))].enumname); + if (i + 1 == offsetlen) { + printf("\n"); + } else if ((i + 1) % 8 != 0) { + printf(", "); + } else { + printf(",\n\t"); + } + + } + printf("};\n"); +} + +static void +print_enum(const struct property_spec *spec, size_t speclen, + const char *enumname, const char *enumprefix) +{ + size_t i; + + printf("enum %s {\n", enumname); + for (i = 0; i < speclen; i++) { + printf("\t%s_%s,\n", enumprefix, spec[i].enumname); + } + printf("\tNUM_%sS,\n};\n\n", enumprefix); +} + int main(int argc, char *argv[]) { struct compressed_properties comp; struct major_minor_properties mm; + struct property_payload payload; struct properties *prop; - size_t i; (void)argc; @@ -309,8 +361,12 @@ main(int argc, char *argv[]) } /* extract properties */ - parse_file_with_callback(FILE_EMOJI, break_property_callback, prop); - parse_file_with_callback(FILE_GRAPHEME, break_property_callback, prop); + payload.prop = prop; + + payload.spec = char_break_property; + payload.speclen = LEN(char_break_property); + parse_file_with_callback(FILE_EMOJI, break_property_callback, &payload); + parse_file_with_callback(FILE_GRAPHEME, break_property_callback, &payload); /* * deduplicate by generating an array of offsets into prop where @@ -323,29 +379,17 @@ main(int argc, char *argv[]) get_major_minor_properties(&comp, &mm)); /* print data */ + print_enum(char_break_property, LEN(char_break_property), + "char_break_property", "CHAR_BREAK_PROP"); + if (mm.minorlen < 0x100) { fprintf(stderr, "minor-array is too short.\n"); } print_lookup_table("major", mm.major, 0x1100, mm.minorlen - 0x100); printf("\n"); - print_lookup_table("minor", mm.minor, mm.minorlen, comp.datalen); + print_enum_lookup_table("minor", mm.minor, mm.minorlen, comp.data, char_break_property, "CHAR_BREAK_PROP"); printf("\n"); - printf("enum break_property {\n"); - for (i = 0; i < LEN(break_property); i++) { - printf("\tBREAK_PROP_%s,\n", break_property[i].enumname); - } - printf("\tNUM_BREAK_PROPS,\n};\n\n"); - printf("struct properties {\n\tenum break_property break_property;\n};\n\n"); - - /* properties table */ - printf("const struct properties prop[] = {\n"); - for (i = 0; i < comp.datalen; i++) { - printf("\t{ BREAK_PROP_%s },\n", - break_property[comp.data[i].break_property].enumname); - } - printf("};\n"); - /* free data */ free(prop); free(comp.data); diff --git a/src/character.c b/src/character.c @@ -8,114 +8,114 @@ #include "../grapheme.h" #include "util.h" -static const uint_least16_t dont_break[NUM_BREAK_PROPS] = { - [BREAK_PROP_OTHER] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_CR] = - UINT16_C(1 << BREAK_PROP_LF), /* GB3 */ - [BREAK_PROP_EXTEND] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_HANGUL_L] = - UINT16_C(1 << BREAK_PROP_HANGUL_L) | /* GB6 */ - UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB6 */ - UINT16_C(1 << BREAK_PROP_HANGUL_LV) | /* GB6 */ - UINT16_C(1 << BREAK_PROP_HANGUL_LVT) | /* GB6 */ - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_HANGUL_V] = - UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */ - UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */ - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_HANGUL_T] = - UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */ - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_HANGUL_LV] = - UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */ - UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */ - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_HANGUL_LVT] = - UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */ - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_PREPEND] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK) | /* GB9a */ +static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { + [CHAR_BREAK_PROP_OTHER] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_CR] = + UINT16_C(1 << CHAR_BREAK_PROP_LF), /* GB3 */ + [CHAR_BREAK_PROP_EXTEND] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_HANGUL_L] = + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L) | /* GB6 */ + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB6 */ + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV) | /* GB6 */ + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */ + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_HANGUL_V] = + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_HANGUL_T] = + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_HANGUL_LV] = + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_HANGUL_LVT] = + UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_PREPEND] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK) | /* GB9a */ (UINT16_C(0xFFFF) & - ~(UINT16_C(1 << BREAK_PROP_CR) | - UINT16_C(1 << BREAK_PROP_LF) | - UINT16_C(1 << BREAK_PROP_CONTROL) + ~(UINT16_C(1 << CHAR_BREAK_PROP_CR) | + UINT16_C(1 << CHAR_BREAK_PROP_LF) | + UINT16_C(1 << CHAR_BREAK_PROP_CONTROL) ) ), /* GB9b */ - [BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_SPACINGMARK] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ - [BREAK_PROP_ZWJ] = - UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_SPACINGMARK] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + [CHAR_BREAK_PROP_ZWJ] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ + UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ }; -static const uint_least16_t flag_update_gb11[2 * NUM_BREAK_PROPS] = { - [BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1 << BREAK_PROP_ZWJ) | - UINT16_C(1 << BREAK_PROP_EXTEND), - [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] = - UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC), - [BREAK_PROP_EXTEND + NUM_BREAK_PROPS] = - UINT16_C(1 << BREAK_PROP_EXTEND) | - UINT16_C(1 << BREAK_PROP_ZWJ), - [BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_BREAK_PROPS] = - UINT16_C(1 << BREAK_PROP_ZWJ) | - UINT16_C(1 << BREAK_PROP_EXTEND), +static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { + [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), + [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), + [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ), + [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = + UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | + UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), }; -static const uint_least16_t dont_break_gb11[2 * NUM_BREAK_PROPS] = { - [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] = - UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC), +static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { + [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = + UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), }; -static const uint_least16_t flag_update_gb12_13[2 * NUM_BREAK_PROPS] = { - [BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR), +static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { + [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = + UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), }; -static const uint_least16_t dont_break_gb12_13[2 * NUM_BREAK_PROPS] = { - [BREAK_PROP_REGIONAL_INDICATOR + NUM_BREAK_PROPS] = - UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR), +static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { + [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = + UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), }; -static enum break_property +static enum char_break_property get_break_prop(uint_least32_t cp) { if (likely(cp <= 0x10FFFF)) { - return prop[minor[major[cp >> 8] + (cp & 0xff)]].break_property; + return (enum char_break_property)minor[major[cp >> 8] + (cp & 0xff)]; } else { - return BREAK_PROP_OTHER; + return CHAR_BREAK_PROP_OTHER; } } bool grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state) { - enum break_property cp0_prop, cp1_prop; + enum char_break_property cp0_prop, cp1_prop; bool notbreak = false; if (likely(state)) { @@ -132,11 +132,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA /* update flags */ state->gb11_flag = flag_update_gb11[cp0_prop + - NUM_BREAK_PROPS * + NUM_CHAR_BREAK_PROPS * state->gb11_flag] & UINT16_C(1 << cp1_prop); state->gb12_13_flag = flag_update_gb12_13[cp0_prop + - NUM_BREAK_PROPS * + NUM_CHAR_BREAK_PROPS * state->gb12_13_flag] & UINT16_C(1 << cp1_prop); @@ -146,10 +146,10 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA */ notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) || (dont_break_gb11[cp0_prop + state->gb11_flag * - NUM_BREAK_PROPS] & + NUM_CHAR_BREAK_PROPS] & UINT16_C(1 << cp1_prop)) || (dont_break_gb12_13[cp0_prop + state->gb12_13_flag * - NUM_BREAK_PROPS] & + NUM_CHAR_BREAK_PROPS] & UINT16_C(1 << cp1_prop)); /* update or reset flags (when we have a break) */