commit f13f39a84bf411748a956f066743a18ec73ab7e6
parent dcf877fee95f52f4d60c8b0976f9ced82c608406
Author: Laslo Hunhold <dev@frign.de>
Date: Fri, 7 Jan 2022 17:41:48 +0100
Refactor data-table-generation, use enum-minor-table
For the special case where the property-struct only contains a single
element it's a bit redundant to store offsets in the minor-array which
only point into a property-struct. We might as well store the enums
themselves in the minor-array, saving us one table-lookup.
Given our enum is way below 255 entries, we can store those enums
uint_least8_t. Previously, the offset-array would at least be an
uint_least16_t, which was a small oversight on my behalf.
The end-goal is to have separate data-tables for the other
break-detection-algorithms (word, sentence, line). This is a stark
contrast to libutf8proc for instance, which has one data-table for
everything. However, preliminary experiments showed, of course, that the
larger the data-table gets, the longer the accesses take as the CPU is
more likely to offload the table from the cache.
Another factor is static linking: It's unlikely that you use all of the
algorithms at once. For a minimal overall data-overhead (just a few KB)
each object file in the library (character.o, etc.) will be
self-sufficient, allowing easy LTO for the linker.
The aforementioned change leads to a speedup of around 6% and
data-table-reduction of around 40% (!) to 35K, which easily fits in the
L1-cache. We are now around half an order of magnitude faster than
libutf8proc.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
M | gen/properties.c | | | 104 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------- |
M | src/character.c | | | 186 | ++++++++++++++++++++++++++++++++++++++++---------------------------------------- |
2 files changed, 167 insertions(+), 123 deletions(-)
diff --git a/gen/properties.c b/gen/properties.c
@@ -12,7 +12,7 @@
#define FILE_GRAPHEME "data/GraphemeBreakProperty.txt"
struct properties {
- uint_least8_t break_property;
+ uint_least8_t char_break_property;
};
struct property_spec {
@@ -21,7 +21,13 @@ struct property_spec {
const char *ucdname;
};
-static const struct property_spec break_property[] = {
+struct property_payload {
+ struct properties *prop;
+ const struct property_spec *spec;
+ size_t speclen;
+};
+
+static const struct property_spec char_break_property[] = {
{
.enumname = "OTHER",
.file = NULL,
@@ -104,7 +110,7 @@ break_property_callback(char *file, char **field, size_t nfields,
char *comment, void *payload)
{
/* prop always has the length 0x110000 */
- struct properties *prop = (struct properties *)payload;
+ struct property_payload *p = (struct property_payload *)payload;
struct range r;
size_t i;
uint_least32_t cp;
@@ -115,11 +121,11 @@ break_property_callback(char *file, char **field, size_t nfields,
return 1;
}
- for (i = 0; i < LEN(break_property); i++) {
+ for (i = 0; i < p->speclen; i++) {
/* identify fitting file and identifier */
- if (break_property[i].file &&
- !strcmp(break_property[i].file, file) &&
- !strcmp(break_property[i].ucdname, field[1])) {
+ if (p->spec[i].file &&
+ !strcmp(p->spec[i].file, file) &&
+ !strcmp(p->spec[i].ucdname, field[1])) {
/* parse range in first field */
if (range_parse(field[0], &r)) {
return 1;
@@ -127,13 +133,18 @@ break_property_callback(char *file, char **field, size_t nfields,
/* apply to all codepoints in the range */
for (cp = r.lower; cp <= r.upper; cp++) {
- if (prop[cp].break_property != 0) {
+ if (p->spec == char_break_property) {
+ if (p->prop[cp].char_break_property != 0) {
+ fprintf(stderr, "break_property_callback: "
+ "Character break property overlap.\n");
+ exit(1);
+ }
+ p->prop[cp].char_break_property = i;
+ } else {
fprintf(stderr, "break_property_callback: "
- "property overlap\n");
+ "Unknown specification.\n");
exit(1);
}
-
- prop[cp].break_property = i;
}
break;
@@ -292,13 +303,54 @@ print_lookup_table(char *name, size_t *data, size_t datalen, size_t maxval)
printf("};\n");
}
+/*
+ * this function is for the special case where struct property
+ * only contains a single enum value.
+ */
+static void
+print_enum_lookup_table(char *name, size_t *offset, size_t offsetlen,
+ const struct properties *prop,
+ const struct property_spec *spec,
+ const char *enumprefix)
+{
+ size_t i;
+
+ printf("const uint8_t %s[] = {\n\t", name);
+ for (i = 0; i < offsetlen; i++) {
+ printf("%s_%s", enumprefix,
+ spec[*((uint_least8_t *)&(prop[offset[i]]))].enumname);
+ if (i + 1 == offsetlen) {
+ printf("\n");
+ } else if ((i + 1) % 8 != 0) {
+ printf(", ");
+ } else {
+ printf(",\n\t");
+ }
+
+ }
+ printf("};\n");
+}
+
+static void
+print_enum(const struct property_spec *spec, size_t speclen,
+ const char *enumname, const char *enumprefix)
+{
+ size_t i;
+
+ printf("enum %s {\n", enumname);
+ for (i = 0; i < speclen; i++) {
+ printf("\t%s_%s,\n", enumprefix, spec[i].enumname);
+ }
+ printf("\tNUM_%sS,\n};\n\n", enumprefix);
+}
+
int
main(int argc, char *argv[])
{
struct compressed_properties comp;
struct major_minor_properties mm;
+ struct property_payload payload;
struct properties *prop;
- size_t i;
(void)argc;
@@ -309,8 +361,12 @@ main(int argc, char *argv[])
}
/* extract properties */
- parse_file_with_callback(FILE_EMOJI, break_property_callback, prop);
- parse_file_with_callback(FILE_GRAPHEME, break_property_callback, prop);
+ payload.prop = prop;
+
+ payload.spec = char_break_property;
+ payload.speclen = LEN(char_break_property);
+ parse_file_with_callback(FILE_EMOJI, break_property_callback, &payload);
+ parse_file_with_callback(FILE_GRAPHEME, break_property_callback, &payload);
/*
* deduplicate by generating an array of offsets into prop where
@@ -323,29 +379,17 @@ main(int argc, char *argv[])
get_major_minor_properties(&comp, &mm));
/* print data */
+ print_enum(char_break_property, LEN(char_break_property),
+ "char_break_property", "CHAR_BREAK_PROP");
+
if (mm.minorlen < 0x100) {
fprintf(stderr, "minor-array is too short.\n");
}
print_lookup_table("major", mm.major, 0x1100, mm.minorlen - 0x100);
printf("\n");
- print_lookup_table("minor", mm.minor, mm.minorlen, comp.datalen);
+ print_enum_lookup_table("minor", mm.minor, mm.minorlen, comp.data, char_break_property, "CHAR_BREAK_PROP");
printf("\n");
- printf("enum break_property {\n");
- for (i = 0; i < LEN(break_property); i++) {
- printf("\tBREAK_PROP_%s,\n", break_property[i].enumname);
- }
- printf("\tNUM_BREAK_PROPS,\n};\n\n");
- printf("struct properties {\n\tenum break_property break_property;\n};\n\n");
-
- /* properties table */
- printf("const struct properties prop[] = {\n");
- for (i = 0; i < comp.datalen; i++) {
- printf("\t{ BREAK_PROP_%s },\n",
- break_property[comp.data[i].break_property].enumname);
- }
- printf("};\n");
-
/* free data */
free(prop);
free(comp.data);
diff --git a/src/character.c b/src/character.c
@@ -8,114 +8,114 @@
#include "../grapheme.h"
#include "util.h"
-static const uint_least16_t dont_break[NUM_BREAK_PROPS] = {
- [BREAK_PROP_OTHER] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_CR] =
- UINT16_C(1 << BREAK_PROP_LF), /* GB3 */
- [BREAK_PROP_EXTEND] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_HANGUL_L] =
- UINT16_C(1 << BREAK_PROP_HANGUL_L) | /* GB6 */
- UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB6 */
- UINT16_C(1 << BREAK_PROP_HANGUL_LV) | /* GB6 */
- UINT16_C(1 << BREAK_PROP_HANGUL_LVT) | /* GB6 */
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_HANGUL_V] =
- UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */
- UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_HANGUL_T] =
- UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_HANGUL_LV] =
- UINT16_C(1 << BREAK_PROP_HANGUL_V) | /* GB7 */
- UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB7 */
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_HANGUL_LVT] =
- UINT16_C(1 << BREAK_PROP_HANGUL_T) | /* GB8 */
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_PREPEND] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK) | /* GB9a */
+static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
+ [CHAR_BREAK_PROP_OTHER] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_CR] =
+ UINT16_C(1 << CHAR_BREAK_PROP_LF), /* GB3 */
+ [CHAR_BREAK_PROP_EXTEND] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_HANGUL_L] =
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L) | /* GB6 */
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB6 */
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LV) | /* GB6 */
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_HANGUL_V] =
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_HANGUL_T] =
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_HANGUL_LV] =
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_HANGUL_LVT] =
+ UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_PREPEND] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK) | /* GB9a */
(UINT16_C(0xFFFF) &
- ~(UINT16_C(1 << BREAK_PROP_CR) |
- UINT16_C(1 << BREAK_PROP_LF) |
- UINT16_C(1 << BREAK_PROP_CONTROL)
+ ~(UINT16_C(1 << CHAR_BREAK_PROP_CR) |
+ UINT16_C(1 << CHAR_BREAK_PROP_LF) |
+ UINT16_C(1 << CHAR_BREAK_PROP_CONTROL)
)
), /* GB9b */
- [BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_SPACINGMARK] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
- [BREAK_PROP_ZWJ] =
- UINT16_C(1 << BREAK_PROP_EXTEND) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_ZWJ) | /* GB9 */
- UINT16_C(1 << BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_SPACINGMARK] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
+ [CHAR_BREAK_PROP_ZWJ] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */
+ UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */
};
-static const uint_least16_t flag_update_gb11[2 * NUM_BREAK_PROPS] = {
- [BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
- UINT16_C(1 << BREAK_PROP_ZWJ) |
- UINT16_C(1 << BREAK_PROP_EXTEND),
- [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] =
- UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC),
- [BREAK_PROP_EXTEND + NUM_BREAK_PROPS] =
- UINT16_C(1 << BREAK_PROP_EXTEND) |
- UINT16_C(1 << BREAK_PROP_ZWJ),
- [BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_BREAK_PROPS] =
- UINT16_C(1 << BREAK_PROP_ZWJ) |
- UINT16_C(1 << BREAK_PROP_EXTEND),
+static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
+ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) |
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
+ [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+ [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) |
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ),
+ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
+ UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) |
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTEND),
};
-static const uint_least16_t dont_break_gb11[2 * NUM_BREAK_PROPS] = {
- [BREAK_PROP_ZWJ + NUM_BREAK_PROPS] =
- UINT16_C(1 << BREAK_PROP_EXTENDED_PICTOGRAPHIC),
+static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
+ [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
+ UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC),
};
-static const uint_least16_t flag_update_gb12_13[2 * NUM_BREAK_PROPS] = {
- [BREAK_PROP_REGIONAL_INDICATOR] =
- UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR),
+static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
+ [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
+ UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
-static const uint_least16_t dont_break_gb12_13[2 * NUM_BREAK_PROPS] = {
- [BREAK_PROP_REGIONAL_INDICATOR + NUM_BREAK_PROPS] =
- UINT16_C(1 << BREAK_PROP_REGIONAL_INDICATOR),
+static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
+ [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
+ UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR),
};
-static enum break_property
+static enum char_break_property
get_break_prop(uint_least32_t cp)
{
if (likely(cp <= 0x10FFFF)) {
- return prop[minor[major[cp >> 8] + (cp & 0xff)]].break_property;
+ return (enum char_break_property)minor[major[cp >> 8] + (cp & 0xff)];
} else {
- return BREAK_PROP_OTHER;
+ return CHAR_BREAK_PROP_OTHER;
}
}
bool
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STATE *state)
{
- enum break_property cp0_prop, cp1_prop;
+ enum char_break_property cp0_prop, cp1_prop;
bool notbreak = false;
if (likely(state)) {
@@ -132,11 +132,11 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
/* update flags */
state->gb11_flag = flag_update_gb11[cp0_prop +
- NUM_BREAK_PROPS *
+ NUM_CHAR_BREAK_PROPS *
state->gb11_flag] &
UINT16_C(1 << cp1_prop);
state->gb12_13_flag = flag_update_gb12_13[cp0_prop +
- NUM_BREAK_PROPS *
+ NUM_CHAR_BREAK_PROPS *
state->gb12_13_flag] &
UINT16_C(1 << cp1_prop);
@@ -146,10 +146,10 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA
*/
notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) ||
(dont_break_gb11[cp0_prop + state->gb11_flag *
- NUM_BREAK_PROPS] &
+ NUM_CHAR_BREAK_PROPS] &
UINT16_C(1 << cp1_prop)) ||
(dont_break_gb12_13[cp0_prop + state->gb12_13_flag *
- NUM_BREAK_PROPS] &
+ NUM_CHAR_BREAK_PROPS] &
UINT16_C(1 << cp1_prop));
/* update or reset flags (when we have a break) */