Replace awk-scripts with C programs for data-parsing - libgrapheme - Freestanding C library for unicode string handling

	libgrapheme Freestanding C library for unicode string handling
	git clone https://git.sinitax.com/suckless/libgrapheme
	Log \| Files \| Refs \| README \| LICENSE \| sfeed.txt

commit d74e91e355c37eff0ac64b8ce0e18ef587a1d333
parent fc071310eecb27fe2a469a64a3154c8db514a779
Author: Laslo Hunhold <dev@frign.de>
Date:   Sun, 18 Oct 2020 19:07:17 +0200

Replace awk-scripts with C programs for data-parsing

Even though one can expect POSIX awk(1) to be present on almost all
conceivable systems, I personally must admit that I was never
comfortable with it and had to really bend it to support the features
necessary for the Unicode data table parsing (most prominently,
parsing hexadecimal numbers).

It is common to write short awk-invocations to parse line-oriented
data, but it hits its limits at the given scale. Much finer-grained
control is possible in C, with the added benefit that code-reuse is
possible and people familiar with C can now also debug the data parsing.
All in all, it adds a few lines overall, but only marginally if you
consider the fact that C is such a low-level language.

As a result, libgrapheme now only needs POSIX make(1) and a C99
compiler, while simplifying the Makefile a bit as well.

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
M Makefile  | 61 +++++++++++++++++++++++++++++--------------------------------
D data/emo.awk  | 77 -----------------------------------------------------------------------------
A data/emo.c  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D data/gbp.awk  | 101 -------------------------------------------------------------------------------
A data/gbp.c  | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D data/gbt.awk  | 68 --------------------------------------------------------------------
A data/gbt.c  | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A data/util.c  | 159 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A data/util.h  | 20 ++++++++++++++++++++

9 files changed, 531 insertions(+), 278 deletions(-)
diff --git a/Makefile b/Makefile
@@ -13,18 +13,41 @@ MAN7 = man/libgrapheme.7
 
 all: libgrapheme.a libgrapheme.so $(TEST)
 
+data/gbp.h: data/gbp.txt data/gbp
+data/emo.h: data/emo.txt data/emo
+data/gbt.h: data/gbt.txt data/gbt
+
+data/gbp.o: data/gbp.c config.mk data/util.h
+data/emo.o: data/emo.c config.mk data/util.h
+data/gbt.o: data/gbt.c config.mk data/util.h
+data/util.o: data/util.c config.mk data/util.h
 src/boundary.o: src/boundary.c config.mk data/emo.h data/gbp.h grapheme.h
 src/codepoint.o: src/codepoint.c config.mk grapheme.h
 src/grapheme.o: src/grapheme.c config.mk grapheme.h
 test/test.o: test/test.c config.mk data/gbt.h grapheme.h
 
+data/gbp: data/gbp.o data/util.o
+data/emo: data/emo.o data/util.o
+data/gbt: data/gbt.o data/util.o
 test/test: test/test.o $(LIB:=.o)
 
-test: $(TEST)
-	for m in $(TEST); do ./$$m; done
+# Download the Unicode 13.0.0 data files. wget -O creates the output file
+# before the transfer completes, so a failed download would otherwise
+# leave an empty or partial target that make considers up to date;
+# remove it and fail instead.
+data/gbp.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt || { rm -f $@; exit 1; }
+
+data/emo.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt || { rm -f $@; exit 1; }
+
+data/gbt.txt:
+	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt || { rm -f $@; exit 1; }
+
+# Generate each header by feeding the matching .txt file to the generator
+# program of the same base name. The shell creates $@ before the generator
+# runs, so remove it again if the generator fails; otherwise a truncated
+# header would look up to date on the next run.
+$(DATA:=.h):
+	$(@:.h=) < $(@:.h=.txt) > $@ || { rm -f $@; exit 1; }
+
+# Link every program listed in DATA from its own object file ($@.o) and
+# the shared helper object data/util.o.
+$(DATA):
+	$(CC) -o $@ $(LDFLAGS) $@.o data/util.o
 
 $(TEST):
-	$(CC) -o $@ $(LDFLAGS) $< $(LIB:=.o)
+	$(CC) -o $@ $(LDFLAGS) $@.o $(LIB:=.o)
 
 .c.o:
 	$(CC) -c -o $@ $(CPPFLAGS) $(CFLAGS) $<
@@ -36,34 +59,8 @@ libgrapheme.a: $(LIB:=.o)
 libgrapheme.so: $(LIB:=.o)
 	$(CC) -o $@ -shared $?
 
-data/gbp.h: data/gbp.awk data/gbp.txt
-	printf "/* Automatically generated by gbp.awk */\n" > $@
-	printf "#include <stdint.h>\n\n" >> $@
-	awk -f data/gbp.awk data/gbp.txt >> $@
-	printf "\n" >> $@
-
-data/emo.h: data/emo.awk data/emo.txt
-	printf "/* Automatically generated by emo.awk */\n" > $@
-	printf "#include <stdint.h>\n\n" >> $@
-	awk -f data/emo.awk data/emo.txt >> $@
-	printf "\n" >> $@
-
-data/gbt.h: data/gbt.awk data/gbt.txt
-	printf "/* Automatically generated by gbt.awk */\n" > $@
-	printf "#include <stddef.h>\n" >> $@
-	printf "#include <stdint.h>\n\n" >> $@
-	printf "#include \"../grapheme.h\"\n\n" >> $@
-	awk -f data/gbt.awk data/gbt.txt >> $@
-	printf "\n" >> $@
-
-data/gbp.txt:
-	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakProperty.txt
-
-data/emo.txt:
-	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/emoji/emoji-data.txt
-
-data/gbt.txt:
-	wget -O $@ https://www.unicode.org/Public/13.0.0/ucd/auxiliary/GraphemeBreakTest.txt
+# Run every test program; stop at the first failure so that its exit
+# status is not masked by a later, passing program in the loop.
+test: $(TEST)
+	for m in $(TEST); do ./$$m || exit 1; done
 
 install: all
 	mkdir -p "$(DESTDIR)$(LIBPREFIX)"
@@ -84,7 +81,7 @@ uninstall:
 	rm -f "$(DESTDIR)$(INCPREFIX)/grapheme.h"
 
 clean:
-	rm -f $(DATA:=.h) $(LIB:=.o) $(TEST:=.o) $(TEST) libgrapheme.a libgrapheme.so
+	rm -f $(DATA:=.h) $(DATA:=.o) data/util.o $(LIB:=.o) $(TEST:=.o) $(DATA) $(TEST) libgrapheme.a libgrapheme.so
 
 clean-data:
 	rm -f $(DATA:=.txt)
diff --git a/data/emo.awk b/data/emo.awk
@@ -1,77 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# https://www.unicode.org/Public/emoji/latest/emoji-data.txt
-BEGIN {
-	FS = "[ ;]+"
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/      { next }
-$2 == "Extended_Pictographic#" { extpicts[nextpicts++] = $1 }
-
-END {
-	mktable("extpict", extpicts, nextpicts);
-}
-
-function hextonum(str) {
-	str = tolower(str);
-	if (substr(str, 1, 2) != "0x") {
-		return -1;
-	}
-	str = substr(str, 3);
-
-	val = 0;
-	for (i = 0; i < length(str); i++) {
-		dig = index("0123456789abcdef", substr(str, i + 1, 1));
-
-		if (!dig) {
-			return -1;
-		}
-
-		val = (16 * val) + (dig - 1);
-	}
-
-	return val;
-}
-
-function mktable(name, array, arrlen) {
-	printf("\nstatic const uint32_t "name"_table[][2] = {\n");
-
-	for (j = 0; j < arrlen; j++) {
-		if (ind = index(array[j], "..")) {
-			lower = tolower(substr(array[j], 1, ind - 1));
-			upper = tolower(substr(array[j], ind + 2));
-		} else {
-			lower = upper = tolower(array[j]);
-		}
-		lower = sprintf("0x%s", lower);
-		upper = sprintf("0x%s", upper);
-
-		# print lower bound
-		printf("\t{ UINT32_C(%s), ", lower);
-
-		for (; j < arrlen - 1; j++) {
-			# look ahead and check if we have adjacent arrays
-			if (ind = index(array[j + 1], "..")) {
-				nextlower = tolower(substr(array[j + 1],
-				                    1, ind - 1));
-				nextupper = tolower(substr(array[j + 1],
-				                    ind + 2));
-			} else {
-				nextlower = nextupper = tolower(array[j + 1]);
-			}
-			nextlower = sprintf("0x%s", nextlower);
-			nextupper = sprintf("0x%s", nextupper);
-
-			if ((hextonum(nextlower) * 1) != (hextonum(upper) + 1)) {
-				break;
-			} else {
-				upper = nextupper;
-			}
-		}
-
-		# print upper bound
-		printf("UINT32_C(%s) },\n", upper);
-	}
-
-	printf("};\n");
-}
diff --git a/data/emo.c b/data/emo.c
@@ -0,0 +1,68 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "util.h"
+
+/*
+ * Properties collected from the input. process_line() appends a range to
+ * .table (growing .tablelen) whenever the second field of a line equals
+ * .identifier; main() prints each table as an array named .tablename.
+ * .table and .tablelen are not initialized explicitly and thus start out
+ * as NULL and 0.
+ */
+static struct {
+	char         *identifier;
+	char         *tablename;
+	struct range *table;
+	size_t        tablelen;
+} properties[] = {
+	{
+		.identifier = "Extended_Pictographic",
+		.tablename  = "extpict_table",
+	},
+};
+
+/*
+ * Line callback for parse_input(): if the second field names one of the
+ * known properties, parse the first field as a code point range and
+ * append it to that property's table. The comment is not used.
+ *
+ * Returns 0 on success (including lines with an unknown property) and
+ * 1 if the line has fewer than two fields or the range cannot be parsed.
+ */
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+	struct range rng;
+	size_t idx;
+
+	(void)comment;
+
+	if (nfields < 2) {
+		return 1;
+	}
+
+	/* look up the first property matching the second field */
+	for (idx = 0; idx < LEN(properties); idx++) {
+		if (strcmp(field[1], properties[idx].identifier) == 0) {
+			break;
+		}
+	}
+	if (idx == LEN(properties)) {
+		/* not a property we are interested in */
+		return 0;
+	}
+
+	if (range_parse(field[0], &rng)) {
+		return 1;
+	}
+	range_list_append(&properties[idx].table,
+	                  &properties[idx].tablelen, &rng);
+
+	return 0;
+}
+
+/*
+ * Read the data file from stdin via parse_input() and write a C header
+ * to stdout containing one two-column uint32_t array (lower and upper
+ * bound per row) for every entry in properties.
+ */
+int
+main(void)
+{
+	const struct range *tbl;
+	size_t p, n;
+
+	printf("/* Automatically generated by data/emo */\n"
+	       "#include <stdint.h>\n");
+
+	parse_input(process_line);
+
+	for (p = 0; p < LEN(properties); p++) {
+		tbl = properties[p].table;
+
+		printf("\nstatic const uint32_t %s[][2] = {\n",
+		       properties[p].tablename);
+		for (n = 0; n < properties[p].tablelen; n++) {
+			printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+			       tbl[n].lower, tbl[n].upper);
+		}
+		puts("};");
+	}
+
+	return 0;
+}
diff --git a/data/gbp.awk b/data/gbp.awk
@@ -1,101 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
-BEGIN {
-	FS = "[ ;]+"
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/  { next }
-$2 == "CR"                 { crs[ncrs++] = $1 }
-$2 == "LF"                 { lfs[nlfs++] = $1 }
-$2 == "Control"            { controls[ncontrols++] = $1 }
-$2 == "Extend"             { extends[nextends++] = $1 }
-$2 == "ZWJ"                { zwj[nzwj++] = $1 }
-$2 == "Regional_Indicator" { ris[nris++] = $1 }
-$2 == "Prepend"            { prepends[nprepends++] = $1 }
-$2 == "SpacingMark"        { spacingmarks[nspacingmarks++] = $1 }
-$2 == "L"                  { ls[nls++] = $1 }
-$2 == "V"                  { vs[nvs++] = $1 }
-$2 == "T"                  { ts[nts++] = $1 }
-$2 == "LV"                 { lvs[nlvs++] = $1 }
-$2 == "LVT"                { lvts[nlvts++] = $1 }
-
-END {
-	mktable("cr", crs, ncrs);
-	mktable("lf", lfs, nlfs);
-	mktable("control", controls, ncontrols);
-	mktable("extend", extends, nextends);
-	mktable("zwj", zwj, nzwj);
-	mktable("ri", ris, nris);
-	mktable("prepend", prepends, nprepends);
-	mktable("spacingmark", spacingmarks, nspacingmarks);
-	mktable("l", ls, nls);
-	mktable("v", vs, nvs);
-	mktable("t", ts, nts);
-	mktable("lv", lvs, nlvs);
-	mktable("lvt", lvts, nlvts);
-}
-
-function hextonum(str) {
-	str = tolower(str);
-	if (substr(str, 1, 2) != "0x") {
-		return -1;
-	}
-	str = substr(str, 3);
-
-	val = 0;
-	for (i = 0; i < length(str); i++) {
-		dig = index("0123456789abcdef", substr(str, i + 1, 1));
-
-		if (!dig) {
-			return -1;
-		}
-
-		val = (16 * val) + (dig - 1);
-	}
-
-	return val;
-}
-
-function mktable(name, array, arrlen) {
-	printf("static const uint32_t "name"_table[][2] = {\n");
-
-	for (j = 0; j < arrlen; j++) {
-		if (ind = index(array[j], "..")) {
-			lower = tolower(substr(array[j], 1, ind - 1));
-			upper = tolower(substr(array[j], ind + 2));
-		} else {
-			lower = upper = tolower(array[j]);
-		}
-		lower = sprintf("0x%s", lower);
-		upper = sprintf("0x%s", upper);
-
-		# print lower bound
-		printf("\t{ UINT32_C(%s), ", lower);
-
-		for (; j < arrlen - 1; j++) {
-			# look ahead and check if we have adjacent arrays
-			if (ind = index(array[j + 1], "..")) {
-				nextlower = tolower(substr(array[j + 1],
-				                    1, ind - 1));
-				nextupper = tolower(substr(array[j + 1],
-				                    ind + 2));
-			} else {
-				nextlower = nextupper = tolower(array[j + 1]);
-			}
-			nextlower = sprintf("0x%s", nextlower);
-			nextupper = sprintf("0x%s", nextupper);
-
-			if ((hextonum(nextlower) * 1) != (hextonum(upper) + 1)) {
-				break;
-			} else {
-				upper = nextupper;
-			}
-		}
-
-		# print upper bound
-		printf("UINT32_C(%s) },\n", upper);
-	}
-
-	printf("};\n");
-}
diff --git a/data/gbp.c b/data/gbp.c
@@ -0,0 +1,116 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "util.h"
+
+/*
+ * Properties collected from the input. process_line() appends a range to
+ * .table (growing .tablelen) whenever the second field of a line equals
+ * .identifier; main() prints each table as an array named .tablename.
+ * .table and .tablelen are not initialized explicitly and thus start out
+ * as NULL and 0.
+ */
+static struct {
+	char         *identifier;
+	char         *tablename;
+	struct range *table;
+	size_t        tablelen;
+} properties[] = {
+	{
+		.identifier = "CR",
+		.tablename  = "cr_table",
+	},
+	{
+		.identifier = "LF",
+		.tablename  = "lf_table",
+	},
+	{
+		.identifier = "Control",
+		.tablename  = "control_table",
+	},
+	{
+		.identifier = "Extend",
+		.tablename  = "extend_table",
+	},
+	{
+		.identifier = "ZWJ",
+		.tablename  = "zwj_table",
+	},
+	{
+		.identifier = "Regional_Indicator",
+		.tablename  = "ri_table",
+	},
+	{
+		.identifier = "Prepend",
+		.tablename  = "prepend_table",
+	},
+	{
+		.identifier = "SpacingMark",
+		.tablename  = "spacingmark_table",
+	},
+	{
+		.identifier = "L",
+		.tablename  = "l_table",
+	},
+	{
+		.identifier = "V",
+		.tablename  = "v_table",
+	},
+	{
+		.identifier = "T",
+		.tablename  = "t_table",
+	},
+	{
+		.identifier = "LV",
+		.tablename  = "lv_table",
+	},
+	{
+		.identifier = "LVT",
+		.tablename  = "lvt_table",
+	},
+};
+
+/*
+ * Line callback for parse_input(): if the second field names one of the
+ * known properties, parse the first field as a code point range and
+ * append it to that property's table. The comment is not used.
+ *
+ * Returns 0 on success (including lines with an unknown property) and
+ * 1 if the line has fewer than two fields or the range cannot be parsed.
+ */
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+	struct range rng;
+	size_t idx;
+
+	(void)comment;
+
+	if (nfields < 2) {
+		return 1;
+	}
+
+	/* look up the first property matching the second field */
+	for (idx = 0; idx < LEN(properties); idx++) {
+		if (strcmp(field[1], properties[idx].identifier) == 0) {
+			break;
+		}
+	}
+	if (idx == LEN(properties)) {
+		/* not a property we are interested in */
+		return 0;
+	}
+
+	if (range_parse(field[0], &rng)) {
+		return 1;
+	}
+	range_list_append(&properties[idx].table,
+	                  &properties[idx].tablelen, &rng);
+
+	return 0;
+}
+
+/*
+ * Read the data file from stdin via parse_input() and write a C header
+ * to stdout containing one two-column uint32_t array (lower and upper
+ * bound per row) for every entry in properties.
+ */
+int
+main(void)
+{
+	const struct range *tbl;
+	size_t p, n;
+
+	printf("/* Automatically generated by data/gbp */\n"
+	       "#include <stdint.h>\n");
+
+	parse_input(process_line);
+
+	for (p = 0; p < LEN(properties); p++) {
+		tbl = properties[p].table;
+
+		printf("\nstatic const uint32_t %s[][2] = {\n",
+		       properties[p].tablename);
+		for (n = 0; n < properties[p].tablelen; n++) {
+			printf("\t{ UINT32_C(0x%06X), UINT32_C(0x%06X) },\n",
+			       tbl[n].lower, tbl[n].upper);
+		}
+		puts("};");
+	}
+
+	return 0;
+}
diff --git a/data/gbt.awk b/data/gbt.awk
@@ -1,68 +0,0 @@
-# See LICENSE file for copyright and license details.
-
-# https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt
-BEGIN {
-	FS = " "
-
-	printf("struct test {\n\tuint32_t *cp;\n\tsize_t cplen;\n");
-	printf("\tsize_t *len;\n\tsize_t lenlen;\n\tchar *descr;\n};\n\n");
-	printf("static const struct test t[] = {\n");
-}
-
-$0 ~ /^#/ || $0 ~ /^\s*$/ { next }
-
-{
-	ncps = 0;
-	nlens = 0;
-
-	curlen = 1;
-	for (i = 2; i <= NF; i++) {
-		if ($(i + 1) == "#") {
-			break;
-		}
-		if (i % 2 == 0) {
-			# code point
-			cp[ncps++] = tolower($i);
-		} else {
-			# break information
-			if ($i == "÷") {
-				# break
-				len[nlens++] = curlen;
-				curlen = 1;
-			} else { # $i == "×"
-				# no break
-				curlen++;
-			}
-		}
-	}
-	len[nlens++] = curlen;
-
-	# print code points
-	printf("\t{\n\t\t.cp     = (uint32_t[]){ ");
-	for (i = 0; i < ncps; i++) {
-		printf("UINT32_C(0x%s)", cp[i]);
-		if (i + 1 < ncps) {
-			printf(", ");
-		}
-	}
-	printf(" },\n\t\t.cplen  = %d,\n", ncps);
-
-	# print grapheme cluster lengths
-	printf("\t\t.len    = (size_t[]){ ");
-	for (i = 0; i < nlens; i++) {
-		printf("%s", len[i]);
-		if (i + 1 < nlens) {
-			printf(", ");
-		}
-	}
-	printf(" },\n\t\t.lenlen = %d,\n", nlens);
-
-	# print testcase description
-	printf("\t\t.descr  = \"%s\",\n", substr($0, index($0, "#") + 3));
-
-	printf("\t},\n");
-}
-
-END {
-	printf("};\n");
-}
diff --git a/data/gbt.c b/data/gbt.c
@@ -0,0 +1,139 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+/*
+ * One parsed test case, filled in by process_line() and printed by main().
+ */
+struct break_test {
+	uint32_t *cp;    /* code points of the test case, in input order */
+	size_t cplen;    /* number of entries in cp */
+	size_t *len;     /* code point count of each segment opened by a '÷' */
+	size_t lenlen;   /* number of entries in len */
+	char *descr;     /* strdup()'d copy of the line's comment */
+};
+
+/* dynamically grown list of all test cases read so far */
+static struct break_test *test = NULL;
+static size_t ntests = 0;
+
+/*
+ * Line callback for parse_input(): turn one input line into a new entry
+ * at the end of the global test array.
+ *
+ * field[0] is tokenized in place with strtok() on spaces and must have
+ * the form "<÷|×> <cp> <÷|×> ... <cp> <÷|×>". Every '÷' opens a new
+ * segment length; every code point is appended to the cp-array and
+ * counted towards the most recently opened segment. The comment, if
+ * any, is copied as the description of the test case.
+ *
+ * Returns 0 on success and 1 on malformed input or allocation failure.
+ */
+int
+process_line(char **field, size_t nfields, char *comment)
+{
+	struct break_test *t;
+	size_t i;
+	char *token;
+
+	if (nfields < 1) {
+		return 1;
+	}
+
+	/* append new testcase and initialize with zeroes */
+	if ((test = realloc(test, ++ntests * sizeof(*test))) == NULL) {
+		fprintf(stderr, "realloc: %s\n", strerror(errno));
+		return 1;
+	}
+	t = &test[ntests - 1];
+	memset(t, 0, sizeof(*t));
+
+	/* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
+	for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
+	     token = strtok(NULL, " ")) {
+		if (i % 2 == 0) {
+			/*
+			 * delimiter; only the first two bytes are compared,
+			 * so any bytes following them in the token are
+			 * ignored
+			 */
+			if (!strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
+				/*
+				 * '÷' indicates a breakpoint,
+				 * the current length is done; allocate
+				 * a new length field and set it to 0
+				 */
+				if ((t->len = realloc(t->len,
+				     ++t->lenlen * sizeof(*t->len))) == NULL) {
+					fprintf(stderr, "realloc: %s\n",
+					        strerror(errno));
+					return 1;
+				}
+				t->len[t->lenlen - 1] = 0;
+			} else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
+				/*
+				 * '×' indicates a non-breakpoint, do nothing
+				 */
+			} else {
+				fprintf(stderr, "malformed delimiter '%s'\n",
+				        token);
+				return 1;
+			}
+		} else {
+			/* add code point to cp-array */
+			if ((t->cp = realloc(t->cp, ++t->cplen *
+			                     sizeof(*t->cp))) == NULL) {
+				fprintf(stderr, "realloc: %s\n", strerror(errno));
+				return 1;
+			}
+			if (cp_parse(token, &t->cp[t->cplen - 1])) {
+				return 1;
+			}
+			if (t->lenlen > 0) {
+				t->len[t->lenlen - 1]++;
+			}
+		}
+	}
+
+	/*
+	 * the trailing '÷' opened one more length than we needed; the
+	 * lenlen check keeps us from touching t->len when no '÷' was
+	 * seen at all (t->len is still NULL then)
+	 */
+	if (t->lenlen > 0 && t->len[t->lenlen - 1] == 0) {
+		t->lenlen--;
+	}
+
+	/* store comment; lines without one get an empty description */
+	if ((t->descr = strdup(comment != NULL ? comment : "")) == NULL) {
+		fprintf(stderr, "strdup: %s\n", strerror(errno));
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Print str so that it can be placed between the double quotes of a C
+ * string literal: '"' and '\' are prefixed with a backslash.
+ */
+static void
+print_escaped(const char *str)
+{
+	for (; *str != '\0'; str++) {
+		if (*str == '"' || *str == '\\') {
+			putchar('\\');
+		}
+		putchar(*str);
+	}
+}
+
+/*
+ * Read the test file from stdin via parse_input() and write a C header
+ * to stdout that defines the array t of struct break_test, with one
+ * initializer per parsed test case.
+ */
+int
+main(void)
+{
+	size_t i, j;
+
+	printf("/* Automatically generated by data/gbt */\n"
+	       "#include <stdint.h>\n#include <stddef.h>\n\n");
+
+	parse_input(process_line);
+
+	printf("static const struct break_test {\n\tuint32_t *cp;\n"
+	       "\tsize_t cplen;\n\tsize_t *len;\n\tsize_t lenlen;\n"
+	       "\tchar *descr;\n} t[] = {\n");
+	for (i = 0; i < ntests; i++) {
+		printf("\t{\n");
+
+		printf("\t\t.cp     = (uint32_t[]){");
+		for (j = 0; j < test[i].cplen; j++) {
+			printf(" UINT32_C(0x%06X)", test[i].cp[j]);
+			if (j + 1 < test[i].cplen) {
+				putchar(',');
+			}
+		}
+		printf(" },\n");
+		printf("\t\t.cplen  = %zu,\n", test[i].cplen);
+
+		printf("\t\t.len    = (size_t[]){");
+		for (j = 0; j < test[i].lenlen; j++) {
+			printf(" %zu", test[i].len[j]);
+			if (j + 1 < test[i].lenlen) {
+				putchar(',');
+			}
+		}
+		printf(" },\n");
+		printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
+
+		/*
+		 * the description is free-form text from the input, so
+		 * escape it to keep the generated literal well-formed
+		 */
+		printf("\t\t.descr  = \"");
+		print_escaped(test[i].descr);
+		printf("\",\n");
+
+		printf("\t},\n");
+	}
+	printf("};\n");
+
+	return 0;
+}
diff --git a/data/util.c b/data/util.c
@@ -0,0 +1,159 @@
+/* See LICENSE file for copyright and license details. */
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+
+#include "util.h"
+
+/*
+ * Read stdin line by line, split every relevant line into fields and
+ * hand them to process_line(field, nfields, comment).
+ *
+ * Lines that are empty, consist only of spaces, or whose first non-space
+ * character is '#' are skipped. Fields are separated by ';'; leading and
+ * trailing spaces of each field are stripped. A '#' ends the field list,
+ * and the text after it (without leading spaces) is passed as comment;
+ * comment is NULL if the line contains no '#'. All field and comment
+ * pointers refer into the line buffer, which is modified in place and
+ * reused for the next line.
+ *
+ * The program exits with status 1 if process_line() returns non-zero,
+ * if memory cannot be allocated or if reading from stdin fails.
+ */
+void
+parse_input(int (*process_line)(char **, size_t, char *))
+{
+	char *line = NULL, **field = NULL, **tmp, *comment;
+	size_t linebufsize = 0, fieldbufsize = 0, nfields, start, end, i;
+	ssize_t len;
+
+	while ((len = getline(&line, &linebufsize, stdin)) >= 0) {
+		/* remove trailing newline */
+		if (len > 0 && line[len - 1] == '\n') {
+			line[len - 1] = '\0';
+			len--;
+		}
+
+		/* skip blank lines and comment lines */
+		for (i = 0; line[i] == ' '; i++)
+			;
+		if (line[i] == '\0' || line[i] == '#') {
+			continue;
+		}
+
+		/* tokenize line into fields */
+		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
+			/* extend field buffer, if necessary */
+			if (++nfields > fieldbufsize) {
+				if ((tmp = realloc(field, nfields *
+				                   sizeof(*field))) == NULL) {
+					fprintf(stderr, "realloc: %s\n", strerror(errno));
+					exit(1);
+				}
+				field = tmp;
+				fieldbufsize = nfields;
+			}
+
+			/* skip leading whitespace */
+			while (line[i] == ' ') {
+				i++;
+			}
+
+			/* set current position as field start */
+			start = i;
+			field[nfields - 1] = &line[start];
+
+			/* continue until we reach ';' or '#' or end */
+			while (line[i] != ';' && line[i] != '#' &&
+			       line[i] != '\0') {
+				i++;
+			}
+			if (line[i] == '#') {
+				/* set comment-variable for later */
+				comment = &line[i + 1];
+			}
+
+			/*
+			 * strip trailing whitespace, but never move before
+			 * the start of the field, and terminate the field;
+			 * for an empty field this overwrites the delimiter
+			 * itself, which has been evaluated above
+			 */
+			for (end = i; end > start && line[end - 1] == ' ';
+			     end--)
+				;
+			line[end] = '\0';
+
+			/* if comment is set, we are done */
+			if (comment != NULL) {
+				break;
+			}
+		}
+
+		/* skip leading whitespace in comment */
+		while (comment != NULL && comment[0] == ' ') {
+			comment++;
+		}
+
+		/* call line processing function */
+		if (process_line(field, nfields, comment)) {
+			exit(1);
+		}
+	}
+
+	/* getline() returns a negative value on EOF and on error */
+	if (ferror(stdin)) {
+		fprintf(stderr, "getline: %s\n", strerror(errno));
+		exit(1);
+	}
+
+	free(line);
+	free(field);
+}
+
+/*
+ * Check that str is a non-empty string made up solely of the
+ * hexadecimal digits 0-9, a-f and A-F.
+ *
+ * Returns 1 if it is, otherwise prints a message to stderr and
+ * returns 0.
+ */
+static int
+valid_hexstring(const char *str)
+{
+	const char *p = str;
+
+	while ((*p >= '0' && *p <= '9') ||
+	       (*p >= 'a' && *p <= 'f') ||
+	       (*p >= 'A' && *p <= 'F')) {
+		p++;
+	}
+
+	/*
+	 * p == str means that no digit was consumed; an empty string
+	 * would otherwise be converted to 0 by the callers
+	 */
+	if (p == str || *p != '\0') {
+		fprintf(stderr, "invalid code point range '%s'\n", str);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Parse the hexadecimal string str into the code point *cp.
+ *
+ * Returns 0 on success. Returns 1 (after a message on stderr) if str is
+ * not a valid hexadecimal string or the value exceeds 0x10FFFF; *cp is
+ * left untouched in that case.
+ */
+int
+cp_parse(const char *str, uint32_t *cp)
+{
+	unsigned long val;
+
+	if (!valid_hexstring(str)) {
+		return 1;
+	}
+
+	/*
+	 * strtoul() yields ULONG_MAX on overflow, which is caught by the
+	 * upper-bound check as well
+	 */
+	val = strtoul(str, NULL, 16);
+	if (val > 0x10FFFFUL) {
+		fprintf(stderr, "code point '%s' out of range\n", str);
+		return 1;
+	}
+	*cp = (uint32_t)val;
+
+	return 0;
+}
+
+/*
+ * Parse str, which has the form "XXXXXX" or "XXXXXX..XXXXXX" (hexadecimal
+ * digits), into *range. A single code point yields lower == upper.
+ *
+ * NOTE: despite the const qualifier, the ".." of a range is overwritten
+ * with a NUL byte in the caller's buffer, so str must be writable.
+ *
+ * Returns 0 on success. Returns 1 (after a message on stderr) if a bound
+ * is not a valid hexadecimal string, the lower bound is larger than the
+ * upper bound or the upper bound exceeds 0x10FFFF; *range is left
+ * untouched in that case.
+ */
+int
+range_parse(const char *str, struct range *range)
+{
+	unsigned long lower, upper;
+	char *p;
+
+	if ((p = strstr(str, "..")) == NULL) {
+		/* input has the form "XXXXXX" */
+		if (!valid_hexstring(str)) {
+			return 1;
+		}
+		lower = upper = strtoul(str, NULL, 16);
+	} else {
+		/* input has the form "XXXXXX..XXXXXX" */
+		*p = '\0';
+		p += 2;
+		if (!valid_hexstring(str) || !valid_hexstring(p)) {
+			return 1;
+		}
+		lower = strtoul(str, NULL, 16);
+		upper = strtoul(p, NULL, 16);
+	}
+
+	/*
+	 * strtoul() yields ULONG_MAX on overflow, which is caught by the
+	 * checks below as well
+	 */
+	if (lower > upper || upper > 0x10FFFFUL) {
+		fprintf(stderr, "code point range %lX..%lX out of bounds\n",
+		        lower, upper);
+		return 1;
+	}
+	range->lower = (uint32_t)lower;
+	range->upper = (uint32_t)upper;
+
+	return 0;
+}
+
+/*
+ * Append the range *new to the list *range holding *nranges entries.
+ *
+ * Bounds are inclusive, so if new begins at the upper bound of the last
+ * entry or directly after it (upper + 1), the last entry is extended
+ * instead of growing the list. Otherwise the list is reallocated to
+ * hold one more entry and *nranges is incremented.
+ *
+ * Exits with status 1 if memory cannot be allocated.
+ */
+void
+range_list_append(struct range **range, size_t *nranges, const struct range *new)
+{
+	struct range *last, *tmp;
+
+	if (*nranges > 0) {
+		last = &(*range)[*nranges - 1];
+
+		/*
+		 * written as a difference so that an upper bound of
+		 * UINT32_MAX cannot overflow
+		 */
+		if (new->lower >= last->upper &&
+		    new->lower - last->upper <= 1) {
+			/* we can merge with previous entry */
+			last->upper = new->upper;
+			return;
+		}
+	}
+
+	/* need to append new entry */
+	if ((tmp = realloc(*range, (*nranges + 1) * sizeof(**range))) == NULL) {
+		fprintf(stderr, "realloc: %s\n", strerror(errno));
+		exit(1);
+	}
+	*range = tmp;
+	(*range)[*nranges].lower = new->lower;
+	(*range)[*nranges].upper = new->upper;
+	(*nranges)++;
+}
diff --git a/data/util.h b/data/util.h
@@ -0,0 +1,20 @@
+/* See LICENSE file for copyright and license details. */
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* number of elements of the array x (x must be an array, not a pointer) */
+#define LEN(x) (sizeof (x) / sizeof *(x))
+
+/* code point range; a single code point is stored as lower == upper */
+struct range {
+	uint32_t lower;
+	uint32_t upper;
+};
+
+/*
+ * Read stdin line by line and call process_line(field, nfields, comment)
+ * for every line that is not skipped; exits if the callback returns
+ * non-zero.
+ */
+void parse_input(int (*process_line)(char **, size_t, char *));
+/* parse a hexadecimal code point; returns 0 on success, 1 on error */
+int cp_parse(const char *, uint32_t *);
+/* parse "XXXXXX" or "XXXXXX..XXXXXX"; returns 0 on success, 1 on error */
+int range_parse(const char *, struct range *);
+/* append a range to a list, merging it into the last entry if possible */
+void range_list_append(struct range **, size_t *, const struct range *);
+
+#endif /* UTIL_H */

M	Makefile	\|	61	+++++++++++++++++++++++++++++--------------------------------
D	data/emo.awk	\|	77	-----------------------------------------------------------------------------
A	data/emo.c	\|	68	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	data/gbp.awk	\|	101	-------------------------------------------------------------------------------
A	data/gbp.c	\|	116	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D	data/gbt.awk	\|	68	--------------------------------------------------------------------
A	data/gbt.c	\|	139	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	data/util.c	\|	159	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	data/util.h	\|	20	++++++++++++++++++++