libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 759f02fbe74f11f64ba451b3a18090cbbb10add2
parent 55597f406bb21be8264ac88a7dec4e96994a5507
Author: Laslo Hunhold <dev@frign.de>
Date:   Fri, 31 Dec 2021 15:41:41 +0100

Add generator for compressed O(1)-lookup-table

Up to this point, the idea behind property-checks was to have a sorted
list and run something like binary search on it. Even though binary
search works really well and is O(log(n)), with our multiple tables
we suffer from the effect that

   sum_{i=1..N} O(log(i)) > O(log(sum_{i=1..N} i))

In simpler terms, the asymptotic gains are largest if you have one
large sorted list that you run binary search on. So the first thing I
tried out was putting all into one long range-list and add the break
property to each array.

While this worked and was much faster, it still was not satisfactory.

Preliminary experiments showed that it is possible to go with a really
crazy approach: Actually have a mapping of each of the 0x110000
codepoints separated into two separate tables, where the compression
works such that the first table indexes into the highest 2 bytes
(shifted 1 byte to the right) and then you have a subtable indexing
into the lowest byte.
The main advantage is that you can compress it by pointing to identical
sub-sequences. This might sound very rudimentary, but it gives a
compression ratio of 97.91%, which means that the lookup-table is only
around 55K, which is totally acceptable.

Really crazy are the speed-gains, because you end up with a branchless
lookup. utf8proc employs a similar method using hash-tables during
processing, but it has a bug that makes the compression around 20% worse
than what we have here. Additionally, their generator is written in
Ruby.

Stay tuned for the subsequent additions in the grapheme break detection
algorithm. I'm not exaggerating when saying that libgrapheme will by far
be the fastest implementation out there, but the code needs a bit of
cleanup I won't be able to finish today.

Until then I wish you a happy new year!

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
MMakefile | 4++++
Agen/properties.c | 357+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mgen/util.c | 7++++---
Mgen/util.h | 5+++++
4 files changed, 370 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile @@ -15,6 +15,7 @@ DATA =\ GEN =\ gen/character-prop\ gen/character-test\ + gen/properties\ SRC =\ src/character\ @@ -40,6 +41,7 @@ benchmark/character.o: benchmark/character.c config.mk gen/character-test.h grap benchmark/util.o: benchmark/util.c config.mk benchmark/util.h gen/character-prop.o: gen/character-prop.c config.mk gen/util.h gen/character-test.o: gen/character-test.c config.mk gen/util.h +gen/properties.o: gen/properties.c config.mk gen/util.h gen/util.o: gen/util.c config.mk gen/util.h src/character.o: src/character.c config.mk gen/character-prop.h grapheme.h src/util.h src/utf8.o: src/utf8.c config.mk grapheme.h @@ -52,12 +54,14 @@ test/util.o: test/util.c config.mk test/util.h benchmark/character: benchmark/character.o benchmark/util.o libgrapheme.a gen/character-prop: gen/character-prop.o gen/util.o gen/character-test: gen/character-test.o gen/util.o +gen/properties: gen/properties.o gen/util.o test/character: test/character.o test/util.o libgrapheme.a test/utf8-encode: test/utf8-encode.o test/util.o libgrapheme.a test/utf8-decode: test/utf8-decode.o test/util.o libgrapheme.a gen/character-prop.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/character-prop gen/character-test.h: data/GraphemeBreakTest.txt gen/character-test +gen/properties.h: data/emoji-data.txt data/GraphemeBreakProperty.txt gen/properties data/emoji-data.txt: wget -O $@ https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt diff --git a/gen/properties.c b/gen/properties.c @@ -0,0 +1,357 @@ +/* See LICENSE file for copyright and license details. */ +#include <errno.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "util.h" + +#define FILE_EMOJI "data/emoji-data.txt" +#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt" + +struct properties { + uint_least8_t break_property; +}; + +struct property_spec { + const char *enumname; + const char *file; + const char *ucdname; +}; + +static const struct property_spec break_property[] = { + { + .enumname = "OTHER", + .file = NULL, + .ucdname = NULL, + }, + { + .enumname = "CONTROL", + .file = FILE_GRAPHEME, + .ucdname = "Control", + }, + { + .enumname = "CR", + .file = FILE_GRAPHEME, + .ucdname = "CR", + }, + { + .enumname = "EXTEND", + .file = FILE_GRAPHEME, + .ucdname = "Extend", + }, + { + .enumname = "EXTENDED_PICTOGRAPHIC", + .file = FILE_EMOJI, + .ucdname = "Extended_Pictographic", + }, + { + .enumname = "HANGUL_L", + .file = FILE_GRAPHEME, + .ucdname = "L", + }, + { + .enumname = "HANGUL_V", + .file = FILE_GRAPHEME, + .ucdname = "V", + }, + { + .enumname = "HANGUL_T", + .file = FILE_GRAPHEME, + .ucdname = "T", + }, + { + .enumname = "HANGUL_LV", + .file = FILE_GRAPHEME, + .ucdname = "LV", + }, + { + .enumname = "HANGUL_LVT", + .file = FILE_GRAPHEME, + .ucdname = "LVT", + }, + { + .enumname = "LF", + .file = FILE_GRAPHEME, + .ucdname = "LF", + }, + { + .enumname = "PREPEND", + .file = FILE_GRAPHEME, + .ucdname = "Prepend", + }, + { + .enumname = "REGIONAL_INDICATOR", + .file = FILE_GRAPHEME, + .ucdname = "Regional_Indicator", + }, + { + .enumname = "SPACINGMARK", + .file = FILE_GRAPHEME, + .ucdname = "SpacingMark", + }, + { + .enumname = "ZWJ", + .file = FILE_GRAPHEME, + .ucdname = "ZWJ", + }, +}; + +static int +break_property_callback(char *file, char **field, size_t nfields, + char *comment, void *payload) +{ + /* prop always has the length 0x110000 */ + struct properties *prop = (struct properties *)payload; + struct range r; + size_t i; + uint_least32_t cp; + + (void)comment; + + if (nfields < 2) { + return 1; + } + + for (i = 0; i < LEN(break_property); i++) { + /* identify fitting file and identifier */ + if (break_property[i].file && + !strcmp(break_property[i].file, file) && + !strcmp(break_property[i].ucdname, field[1])) { + /* parse range in first field */ + if (range_parse(field[0], &r)) { + return 1; + } + + /* apply to all codepoints in the range */ + for (cp = r.lower; cp <= r.upper; cp++) { + if (prop[cp].break_property != 0) { + fprintf(stderr, "break_property_callback: " + "property overlap\n"); + exit(1); + } + + prop[cp].break_property = i; + } + + break; + } + } + + return 0; +} + +struct compressed_properties { + size_t *offset; + struct properties *data; + size_t datalen; +}; + +void +compress_properties(const struct properties *prop, + struct compressed_properties *comp) +{ + uint_least32_t cp, i; + + /* initialization */ + if (!(comp->offset = malloc((size_t)0x110000 * sizeof(*(comp->offset))))) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + comp->data = NULL; + comp->datalen = 0; + + for (cp = 0; cp < 0x110000; cp++) { + for (i = 0; i < comp->datalen; i++) { + if (!memcmp(&(prop[cp]), &(comp->data[i]), sizeof(*prop))) { + /* found a match! */ + comp->offset[cp] = i; + break; + } + } + if (i == comp->datalen) { + /* + * found no matching properties-struct, so + * add current properties to data and add the + * offset in the offset-table + */ + if (!(comp->data = reallocarray(comp->data, + ++(comp->datalen), + sizeof(*(comp->data))))) { + fprintf(stderr, "reallocarray: %s\n", + strerror(errno)); + exit(1); + } + memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]), + sizeof(*prop)); + comp->offset[cp] = comp->datalen - 1; + } + } +} + +struct major_minor_properties { + size_t *major; + size_t *minor; + size_t minorlen; +}; + +static double +get_major_minor_properties(const struct compressed_properties *comp, + struct major_minor_properties *mm) +{ + size_t i, j, compression_count = 0; + + /* + * we currently have an array comp->offset which maps the + * codepoints 0..0x110000 to offsets into comp->data. + * To improve cache-locality instead and allow a bit of + * compressing, instead of directly mapping a codepoint + * 0xAAAABB with comp->offset, we generate two arrays major + * and minor such that + * comp->offset(0xAAAABB) == minor[major[0xAAAA] + 0xBB] + * This yields a major-array of length 2^16 and a minor array + * of variable length depending on how many common subsequences + * can be filtered out. + */ + + /* initialize */ + if (!(mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major))))) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + mm->minor = NULL; + mm->minorlen = 0; + + printf("#include <stdint.h>\n\n"); + + for (i = 0; i < (size_t)0x1100; i++) { + /* + * we now look at the cp-range (i << 8)..(i << 8 + 0xFF) + * and check if its corresponding offset-data already + * exists in minor (because then we just point there + * and need less storage) + */ + for (j = 0; j + 0xFF < mm->minorlen; j++) { + if (!memcmp(&(comp->offset[i << 8]), + &(mm->minor[j]), + sizeof(*(comp->offset)) * 0x100)) { + break; + } + } + if (j + 0xFF < mm->minorlen) { + /* found an index */ + compression_count++; + mm->major[i] = j; + } else { + /* + * add "new" sequence to minor and point to it + * in major + */ + mm->minorlen += 0x100; + if (!(mm->minor = reallocarray(mm->minor, + mm->minorlen, + sizeof(*(mm->minor))))) { + fprintf(stderr, "reallocarray: %s\n", + strerror(errno)); + exit(1); + } + memcpy(&(mm->minor[mm->minorlen - 0x100]), + &(comp->offset[i << 8]), + sizeof(*(mm->minor)) * 0x100); + mm->major[i] = mm->minorlen - 0x100; + } + } + + /* return compression ratio */ + return (double)compression_count / 0x1100 * 100; +} + +static void +print_lookup_table(char *name, size_t *data, size_t datalen, size_t maxval) +{ + char *type; + size_t i; + + type = (maxval <= (((1 << 16) - 1)) + 0xFF) ? "uint_least16_t" : + "uint_least32_t"; + + printf("const %s %s[] = {\n\t", type, name); + for (i = 0; i < datalen; i++) { + printf("%zu", data[i]); + if (i + 1 == datalen) { + printf("\n"); + } else if ((i + 1) % 8 != 0) { + printf(", "); + } else { + printf(",\n\t"); + } + + } + printf("};\n"); +} + +int +main(int argc, char *argv[]) +{ + struct compressed_properties comp; + struct major_minor_properties mm; + struct properties *prop; + size_t i; + + (void)argc; + + /* allocate property buffer for all codepoints */ + if (!(prop = calloc(0x110000, sizeof(*prop)))) { + fprintf(stderr, "calloc: %s\n", strerror(errno)); + exit(1); + } + + /* extract properties */ + parse_file_with_callback(FILE_EMOJI, break_property_callback, prop); + parse_file_with_callback(FILE_GRAPHEME, break_property_callback, prop); + + /* + * deduplicate by generating an array of offsets into prop where + * common data points at the same offset + */ + compress_properties(prop, &comp); + + /* generate major-minor-offset-tables */ + fprintf(stderr, "%s: compression-ratio: %.2f%%\n", argv[0], + get_major_minor_properties(&comp, &mm)); + + /* print data */ + if (mm.minorlen < 0x100) { + fprintf(stderr, "minor-array is too short.\n"); + } + print_lookup_table("major", mm.major, 0x1100, mm.minorlen - 0x100); + printf("\n"); + print_lookup_table("minor", mm.minor, mm.minorlen, comp.datalen); + printf("\n"); + + printf("enum break_property {\n"); + for (i = 0; i < LEN(break_property); i++) { + printf("\tBREAK_PROP_%s,\n", break_property[i].enumname); + } + printf("\tNUM_BREAK_PROPS,\n};\n\n"); + printf("struct properties {\n\tenum break_property break_property;\n};\n\n"); + + /* properties table */ + printf("const struct properties prop[] = {\n"); + for (i = 0; i < comp.datalen; i++) { + printf("\t{ BREAK_PROP_%s },\n", + break_property[comp.data[i].break_property].enumname); + } + printf("};\n"); + + /* free data */ + free(prop); + free(comp.data); + free(comp.offset); + free(mm.major); + free(mm.minor); + + return 0; +} diff --git a/gen/util.c b/gen/util.c @@ -1,9 +1,10 @@ /* See LICENSE file for copyright and license details. */ +#include <errno.h> +#include <stdbool.h> #include <stdint.h> #include <stdlib.h> #include <stdio.h> #include <string.h> -#include <errno.h> #include "util.h" @@ -62,7 +63,7 @@ hextocp(const char *str, size_t len, uint_least32_t *cp) return 0; } -static int +int range_parse(const char *str, struct range *range) { char *p; @@ -103,7 +104,7 @@ range_list_append(struct range **range, size_t *nranges, const struct range *new } } -static void +void parse_file_with_callback(char *fname, int (*callback)(char *, char **, size_t, char *, void *), void *payload) { diff --git a/gen/util.h b/gen/util.h @@ -28,6 +28,11 @@ struct segment_test { char *descr; }; +int range_parse(const char *, struct range *); + +void parse_file_with_callback(char *, int (*callback)(char *, char **, + size_t, char *, void *), void *payload); + void property_list_parse(struct property *, size_t); void property_list_print(const struct property *, size_t, const char *, const char *);