summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore4
-rw-r--r--Makefile31
-rw-r--r--README4
-rw-r--r--bootstr.c325
-rw-r--r--bootstr.h19
-rw-r--r--puny.c120
-rw-r--r--test/basic.in1
-rw-r--r--test/basic.out1
8 files changed, 505 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3158d4f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+puny
+bootstr
+*.o
+*.so
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a75f410
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,31 @@
+PREFIX ?= /usr/local
+BINDIR ?= /bin
+LIBDIR ?= /lib
+
+all: libbootstr.so puny
+
+# Remove every build product, not only the test program.
+clean:
+	rm -f puny libbootstr.so bootstr.o
+
+puny: puny.c libbootstr.so
+	$(CC) -o $@ $(filter %.c,$^) -g -lunistring -L . -lbootstr
+
+# Round-trip test: encoding test/X.in must yield test/X.out and vice versa.
+# -L . only affects linking, so point the dynamic loader at the freshly
+# built libbootstr.so explicitly when running ./puny.
+test/%.phony: test/%.in test/%.out puny
+	@echo "test $*"
+	test "$$(LD_LIBRARY_PATH=. ./puny -e < test/$*.in)" = "$$(cat test/$*.out)"
+	test "$$(LD_LIBRARY_PATH=. ./puny -d < test/$*.out)" = "$$(cat test/$*.in)"
+
+test: puny test/basic.phony
+
+# Objects that end up in a shared library must be position independent;
+# the implicit .c -> .o rule would not pass -fPIC.
+bootstr.o: bootstr.c bootstr.h
+	$(CC) $(CFLAGS) -fPIC -c -o $@ bootstr.c
+
+libbootstr.so: bootstr.o
+	$(CC) -o $@ $^ -fPIC -shared -lunistring
+
+install:
+	install -d "$(DESTDIR)$(PREFIX)$(LIBDIR)" "$(DESTDIR)$(PREFIX)$(BINDIR)"
+	install -m755 libbootstr.so -t "$(DESTDIR)$(PREFIX)$(LIBDIR)"
+	install -m755 puny -t "$(DESTDIR)$(PREFIX)$(BINDIR)"
+
+uninstall:
+	rm -f "$(DESTDIR)$(PREFIX)$(LIBDIR)/libbootstr.so"
+	rm -f "$(DESTDIR)$(PREFIX)$(BINDIR)/puny"
+
+.PHONY: all clean test install uninstall
diff --git a/README b/README
new file mode 100644
index 0000000..4d1c327
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+bootstr
+=======
+
+Bootstring encoding library implementing RFC 3492, with a Punycode test program.
diff --git a/bootstr.c b/bootstr.c
new file mode 100644
index 0000000..97cf693
--- /dev/null
+++ b/bootstr.c
@@ -0,0 +1,325 @@
+#include "bootstr.h"
+
+#include <limits.h>
+#include <stdint.h>
+#include <unistr.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+static int check_realloc(uint32_t **alloc, size_t reserve, size_t *cap);
+static int append_codes(uint32_t **alloc, size_t *len, size_t *cap,
+ const uint32_t *src, size_t srclen);
+static int check_config(const struct bootstr_cfg *cfg);
+
+/*
+ * Compute the bias to use for the next delta.
+ *
+ * delta is first scaled down (by cfg->damp for the very first delta, by 2
+ * afterwards) and increased by delta / len, where len is the number of
+ * code points handled so far including the current one. It is then
+ * repeatedly divided by (baselen - tmin) until it falls under the
+ * (baselen - tmin) * tmax / 2 limit; each division adds baselen to the
+ * result.
+ *
+ * len must be non-zero and delta + cfg->skew must not be zero, since both
+ * are used as divisors.
+ */
+static inline size_t
+bootstr_adapt(const struct bootstr_cfg *cfg, ssize_t delta,
+	ssize_t len, bool first)
+{
+	const ssize_t span = cfg->baselen - cfg->tmin;
+	size_t bias = 0;
+
+	if (first)
+		delta /= cfg->damp;
+	else
+		delta /= 2;
+	delta += delta / len;
+
+	for (; delta > span * cfg->tmax / 2; bias += cfg->baselen)
+		delta /= span;
+
+	return bias + (span + 1) * delta / (delta + cfg->skew);
+}
+
+/*
+ * Make sure the buffer *alloc (capacity *cap, counted in uint32_t
+ * elements) has room for more than `reserve` elements, growing it when
+ * reserve >= *cap. Growth at least doubles the capacity.
+ *
+ * Returns 0 on success or ENOMEM when the buffer cannot be grown. On
+ * failure *alloc and *cap are left untouched, so the caller still owns
+ * (and can free) the old buffer.
+ */
+int
+check_realloc(uint32_t **alloc, size_t reserve, size_t *cap)
+{
+	uint32_t *grown;
+	size_t newcap;
+
+	if (reserve < *cap)
+		return 0;
+
+	newcap = reserve;
+	if (*cap && *cap <= SIZE_MAX / 2 && *cap * 2 > reserve)
+		newcap = *cap * 2;
+
+	/* realloc(ptr, 0) is implementation-defined, never request it */
+	if (!newcap)
+		newcap = 1;
+
+	/* the byte count below must not wrap around */
+	if (newcap > SIZE_MAX / sizeof(uint32_t))
+		return ENOMEM;
+
+	/* keep the old pointer until the new one is known to be valid */
+	grown = realloc(*alloc, newcap * sizeof(uint32_t));
+	if (!grown)
+		return ENOMEM;
+
+	*alloc = grown;
+	*cap = newcap;
+
+	return 0;
+}
+
+/*
+ * Append srclen code points from src to the growable buffer *alloc,
+ * whose current length is *len and capacity is *cap (both in elements).
+ *
+ * Returns 0 on success, otherwise the error reported by check_realloc(),
+ * in which case *len is unchanged.
+ */
+int
+append_codes(uint32_t **alloc, size_t *len, size_t *cap,
+	const uint32_t *src, size_t srclen)
+{
+	const size_t used = *len;
+	int err = check_realloc(alloc, used + srclen, cap);
+
+	if (err)
+		return err;
+
+	memcpy(&(*alloc)[used], src, srclen * sizeof(**alloc));
+	*len = used + srclen;
+
+	return 0;
+}
+
+/*
+ * Validate an encoder/decoder configuration before it is used.
+ *
+ * Rejects (EINVAL) configurations that would later lead to NULL
+ * dereferences, out-of-range digits or divisions by zero:
+ *  - missing base alphabet, delimiter or is_basic callback
+ *  - baselen <= 0
+ *  - tmin outside 1 .. baselen - 1
+ *  - tmax outside tmin .. baselen - 1
+ *  - damp < 1 (used as divisor)
+ *  - skew < 1 (delta + skew is used as divisor and delta may be 0)
+ *
+ * Returns 0 when the configuration is usable.
+ */
+int
+check_config(const struct bootstr_cfg *cfg)
+{
+	if (!cfg)
+		return EINVAL;
+
+	if (!cfg->base || cfg->baselen <= 0)
+		return EINVAL;
+
+	if (!cfg->delim)
+		return EINVAL;
+
+	/* called unconditionally by both encoder and decoder */
+	if (!cfg->is_basic)
+		return EINVAL;
+
+	if (cfg->tmin >= cfg->baselen || cfg->tmin <= 0)
+		return EINVAL;
+
+	/* thresholds are digit values and must stay below baselen */
+	if (cfg->tmax < cfg->tmin || cfg->tmax >= cfg->baselen)
+		return EINVAL;
+
+	if (cfg->damp < 1)
+		return EINVAL;
+
+	if (cfg->skew < 1)
+		return EINVAL;
+
+	return 0;
+}
+
+/*
+ * Append delta to the output as a variable-length integer.
+ *
+ * Digits are emitted least significant first, each one written as the
+ * code point cfg->base[digit]. The threshold for every position is
+ * (off - bias) clamped to tmin .. tmax, with off growing by baselen per
+ * digit; a digit below the threshold terminates the number.
+ *
+ * cfg     configuration (expected to have passed check_config())
+ * in      unused, kept for interface compatibility
+ * out     growable output buffer, see append_codes()
+ * outlen  current length of *out, updated
+ * outcap  current capacity of *out, updated
+ * bias    current bias
+ * delta   value to encode, must not be negative
+ *
+ * Returns 0 on success, EINVAL when a digit does not fit into the base
+ * alphabet, or the error of append_codes().
+ */
+int
+bootstr_encode_delta(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out,
+	size_t *outlen, size_t *outcap, ssize_t bias, ssize_t delta)
+{
+	ssize_t thresh;
+	ssize_t val;
+	ssize_t off;
+	ssize_t ci;
+	int ret;
+
+	(void)in;
+
+	val = delta;
+
+	for (off = cfg->baselen; ; off += cfg->baselen) {
+		/* final digit must be under threshold */
+		thresh = MIN(cfg->tmax, MAX(cfg->tmin, off - bias));
+		if (val < thresh) break;
+
+		/* no room for encoding, invalid params */
+		if (thresh >= cfg->baselen)
+			return EINVAL;
+
+		/* encode char according to current base */
+		ci = thresh + (val - thresh) % (cfg->baselen - thresh);
+		val = (val - thresh) / (cfg->baselen - thresh);
+		if (ci >= cfg->baselen)
+			return EINVAL;
+
+		ret = append_codes(out, outlen, outcap, &cfg->base[ci], 1);
+		if (ret) return ret;
+	}
+
+	/* val < thresh holds here, but thresh itself may exceed baselen
+	 * for a bad tmax (and val may be negative for a bad delta), so
+	 * bounds-check before indexing the alphabet */
+	if (val < 0 || val >= cfg->baselen)
+		return EINVAL;
+
+	return append_codes(out, outlen, outcap, &cfg->base[val], 1);
+}
+
+/*
+ * Encode the NUL-terminated code point string in into *out.
+ *
+ * All basic code points (cfg->is_basic) are copied first, followed by
+ * cfg->delim if there was at least one. The remaining code points are
+ * then encoded in increasing code point order as deltas, see
+ * bootstr_encode_delta().
+ *
+ * *out is grown with realloc() starting from capacity 0, so it must be
+ * NULL or a pointer obtained from malloc() on entry. On success it holds
+ * the NUL-terminated result. On failure it may hold a partially written
+ * buffer that the caller still has to free.
+ *
+ * Returns 0 on success, EINVAL for a bad configuration or an input that
+ * contains a non-basic code point below cfg->initial_n, EOVERFLOW when a
+ * delta does not fit into ssize_t, or an allocation error.
+ */
+int
+bootstr_encode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out)
+{
+	size_t outlen, outcap;
+	size_t inlen;
+	ssize_t processed, basiclen;
+	ssize_t next_code, n;
+	ssize_t delta, bias;
+	ssize_t i;
+	bool found;
+	int ret;
+
+	ret = check_config(cfg);
+	if (ret) return ret;
+
+	outlen = 0;
+	outcap = 0;
+
+	/* copy all basic code points to the output, in input order */
+	inlen = u32_strlen(in);
+	for (i = 0; i < inlen; i++) {
+		if (!cfg->is_basic(in[i]))
+			continue;
+		ret = append_codes(out, &outlen, &outcap, &in[i], 1);
+		if (ret) return ret;
+	}
+	processed = outlen;
+	basiclen = outlen;
+
+	/* if basic prefix avail, add delim */
+	if (outlen) {
+		ret = append_codes(out, &outlen, &outcap,
+			cfg->delim, u32_strlen(cfg->delim));
+		if (ret) return ret;
+	}
+
+	bias = cfg->initial_bias;
+	n = cfg->initial_n;
+	delta = 0;
+
+	/* encode rest of non-basic chars */
+	while (processed < inlen) {
+		/* smallest code point >= n that is still to be encoded */
+		found = false;
+		next_code = SSIZE_MAX;
+		for (i = 0; i < inlen; i++) {
+			if (in[i] >= n && in[i] < next_code) {
+				next_code = in[i];
+				found = true;
+			}
+		}
+
+		/* everything left is a non-basic code point below n, which
+		 * cannot be represented; bail out instead of looping on */
+		if (!found)
+			return EINVAL;
+
+		/* calc insertions to skip until start of last round:
+		 * (processed + 1) insertions possible per round
+		 * (next_code - n) rounds todo */
+		if ((next_code - n) > (SSIZE_MAX - delta) / (processed + 1))
+			return EOVERFLOW;
+		delta += (next_code - n) * (processed + 1);
+
+		/* calculate number of skip to reach code in output at n */
+		n = next_code;
+		for (i = 0; i < inlen; i++) {
+			/* only consider characters already in output */
+			if (in[i] < n || cfg->is_basic(in[i])) {
+				/* check first: signed overflow is undefined */
+				if (delta == SSIZE_MAX)
+					return EOVERFLOW;
+				delta += 1;
+			}
+
+			/* reached the position of ONE of next_code */
+			if (in[i] == n) {
+				ret = bootstr_encode_delta(cfg, in, out,
+					&outlen, &outcap, bias, delta);
+				if (ret) return ret;
+				bias = bootstr_adapt(cfg, delta,
+					processed + 1, processed == basiclen);
+				delta = 0;
+				processed += 1;
+			}
+		}
+
+		if (delta == SSIZE_MAX || n == SSIZE_MAX)
+			return EOVERFLOW;
+		delta += 1;
+		n += 1;
+	}
+
+	ret = append_codes(out, &outlen, &outcap, U"\x00", 1);
+	if (ret) return ret;
+
+	return 0;
+}
+
+/*
+ * Read one variable-length integer from in, starting at in[*processed],
+ * and add it to state.
+ *
+ * Every digit is looked up in cfg->base (its value is its index) and
+ * weighted; the weight of the following digit is multiplied by
+ * (baselen - threshold), where the threshold is (k - bias) clamped to
+ * tmin .. tmax and k grows by baselen per digit. A digit below the
+ * threshold ends the number.
+ *
+ * *processed is advanced past every consumed digit. On success the sum
+ * is stored in *state_new and 0 is returned. EINVAL is returned when the
+ * input ends early or contains a code point that is not in cfg->base,
+ * EOVERFLOW when the value does not fit into ssize_t.
+ */
+int
+bootstr_decode_delta(const struct bootstr_cfg *cfg, uint32_t *in,
+	ssize_t *processed, ssize_t bias, ssize_t state, ssize_t *state_new)
+{
+	ssize_t weight = 1;
+	ssize_t k, t, d;
+	uint32_t code;
+	uint32_t *hit;
+
+	for (k = cfg->baselen; ; k += cfg->baselen) {
+		/* running into the terminator means a truncated number */
+		code = in[*processed];
+		if (code == 0)
+			return EINVAL;
+
+		hit = u32_strchr(cfg->base, code);
+		if (hit == NULL)
+			return EINVAL;
+		++*processed;
+
+		d = hit - cfg->base;
+		if (d > (SSIZE_MAX - state) / weight)
+			return EOVERFLOW;
+		state += d * weight;
+
+		/* clamp (k - bias) into tmin .. tmax */
+		t = MAX(cfg->tmin, k - bias);
+		if (t > cfg->tmax)
+			t = cfg->tmax;
+
+		if (d < t) {
+			*state_new = state;
+			return 0;
+		}
+
+		if (weight > SSIZE_MAX / (cfg->baselen - t))
+			return EOVERFLOW;
+		weight *= cfg->baselen - t;
+	}
+}
+
+/*
+ * Decode the NUL-terminated code point string in into *out.
+ *
+ * Everything before the LAST occurrence of cfg->delim is the literal
+ * prefix of basic code points (the encoder copies basic code points,
+ * which may themselves contain the delimiter, before appending it). The
+ * digits after the delimiter are decoded into deltas, each of which
+ * yields one code point and its insert position in the output. A
+ * delimiter at position 0 does not introduce a prefix, matching the
+ * encoder, which only emits the delimiter after at least one basic code
+ * point.
+ *
+ * *out is grown with realloc() starting from capacity 0, so it must be
+ * NULL or a pointer obtained from malloc() on entry. On success it holds
+ * the NUL-terminated result. On failure it may hold a partially written
+ * buffer that the caller still has to free.
+ *
+ * Returns 0 on success, EINVAL for a bad configuration or malformed
+ * input, EOVERFLOW when a decoded value is out of range, or an
+ * allocation error.
+ */
+int
+bootstr_decode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out)
+{
+	size_t outlen, outcap;
+	size_t inlen, delimlen;
+	ssize_t basiclen;
+	ssize_t processed, n;
+	ssize_t state, state_new, bias;
+	ssize_t i;
+	int ret;
+
+	ret = check_config(cfg);
+	if (ret) return ret;
+
+	outlen = 0;
+	outcap = 0;
+
+	inlen = u32_strlen(in);
+	delimlen = u32_strlen(cfg->delim);
+
+	/* find the last delimiter; compare only delimlen units so that a
+	 * delimiter followed by digits is recognized as well */
+	basiclen = 0;
+	if (delimlen) {
+		for (i = 0; i + delimlen <= inlen; i++) {
+			if (!u32_strncmp(in + i, cfg->delim, delimlen))
+				basiclen = i;
+		}
+	}
+
+	/* copy basic prefix to output and step over the delimiter */
+	processed = 0;
+	if (basiclen) {
+		for (i = 0; i < basiclen; i++) {
+			if (!cfg->is_basic(in[i]))
+				return EINVAL;
+		}
+		ret = append_codes(out, &outlen, &outcap, in, basiclen);
+		if (ret) return ret;
+		processed = basiclen + delimlen;
+	}
+
+	n = cfg->initial_n;
+	bias = cfg->initial_bias;
+	state = 0;
+
+	/* decode rest of non-basic chars */
+	while (processed < inlen) {
+		/* decode delta and add to state */
+		ret = bootstr_decode_delta(cfg, in, &processed,
+			bias, state, &state_new);
+		if (ret) return ret;
+
+		/* use delta to calculate new bias */
+		bias = bootstr_adapt(cfg, state_new - state,
+			outlen + 1, state == 0);
+		state = state_new;
+
+		/* split up state into rounds and index */
+		if (state / (outlen + 1) > (SSIZE_MAX - n))
+			return EOVERFLOW;
+		n += state / (outlen + 1);
+		state %= outlen + 1;
+
+		/* the code point is stored as uint32_t, refuse truncation */
+		if (n < 0 || (uintmax_t)n > UINT32_MAX)
+			return EOVERFLOW;
+
+		/* insert current code */
+		ret = check_realloc(out, outlen + 1, &outcap);
+		if (ret) return ret;
+		memmove(*out + state + 1, *out + state,
+			(outlen - state) * sizeof(uint32_t));
+		(*out)[state] = n;
+		state += 1;
+		outlen += 1;
+	}
+
+	ret = append_codes(out, &outlen, &outcap, U"\x00", 1);
+	if (ret) return ret;
+
+	return 0;
+}
diff --git a/bootstr.h b/bootstr.h
new file mode 100644
index 0000000..8f26a6d
--- /dev/null
+++ b/bootstr.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/*
+ * Parameter set for bootstr_encode() / bootstr_decode().
+ * All strings are NUL-terminated arrays of 32-bit code points.
+ */
+struct bootstr_cfg {
+ /* digit alphabet; the value of a digit is its index in this string */
+ const uint32_t *base;
+ /* number of digits, i.e. the numeric base of the encoding */
+ ssize_t baselen;
+ /* written after the basic code points, searched for when decoding */
+ const uint32_t *delim;
+ /* returns true for code points that are copied to the output as is */
+ bool (*is_basic)(uint32_t c);
+ /* lower and upper clamp of the per-digit threshold */
+ ssize_t tmin, tmax;
+ /* bias adaptation parameters; both are used as divisors */
+ ssize_t skew, damp;
+ /* bias used for the first delta */
+ ssize_t initial_bias;
+ /* code point value the encoder and decoder start counting from */
+ ssize_t initial_n;
+};
+
+/*
+ * Convert the NUL-terminated code point string in according to cfg and
+ * store the NUL-terminated result in *out.
+ *
+ * *out is grown with realloc(), so it must be NULL or a pointer obtained
+ * from malloc() on entry; the caller frees it. Both functions return 0 on
+ * success and an errno-style code (e.g. EINVAL, EOVERFLOW) on failure.
+ */
+int bootstr_encode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out);
+int bootstr_decode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out);
diff --git a/puny.c b/puny.c
new file mode 100644
index 0000000..0ea9c29
--- /dev/null
+++ b/puny.c
@@ -0,0 +1,120 @@
+#include "bootstr.h"
+
+#include <unistr.h>
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+#define CHUNKSIZE 4096
+
+bool is_ascii(uint32_t c);
+
+/*
+ * Configuration used by this program for both encoding and decoding:
+ * 36 digits (a-z, 0-9), "-" as delimiter, code points below 128 are
+ * passed through unchanged.
+ * NOTE(review): the numeric values are presumably the Punycode
+ * parameters from RFC 3492 - verify against the RFC.
+ */
+const struct bootstr_cfg puny_cfg = {
+ .base = U"abcdefghijklmnopqrstuvwxyz0123456789",
+ .baselen = 36,
+ .delim = U"-",
+ .is_basic = is_ascii,
+ .tmin = 1,
+ .tmax = 26,
+ .skew = 38,
+ .damp = 700,
+ .initial_bias = 72,
+ .initial_n = 128
+};
+
+/* Tell whether c is a 7-bit code point, i.e. lies in the range 0 .. 127. */
+bool
+is_ascii(uint32_t c)
+{
+	return (c >> 7) == 0;
+}
+
+/*
+ * Read file until EOF into a freshly allocated buffer.
+ *
+ * The number of bytes read is stored in *len; the buffer is always
+ * terminated with an additional '\0' byte that is not counted in *len.
+ * The caller frees the returned buffer.
+ *
+ * Allocation failures and read errors terminate the program via err().
+ */
+uint8_t *
+readall(FILE *file, size_t *len)
+{
+	size_t nread;
+	size_t cap;
+	uint8_t *data;
+
+	*len = 0;
+	cap = CHUNKSIZE + 1;
+	data = malloc(cap);
+	if (!data) err(1, "malloc");
+
+	while (1) {
+		/* keep room for one more chunk plus the terminator */
+		if (*len + CHUNKSIZE + 1 > cap) {
+			cap *= 2;
+			data = realloc(data, cap);
+			if (!data) err(1, "realloc");
+		}
+
+		nread = fread(data + *len, 1, CHUNKSIZE, file);
+		if (!nread) break;
+
+		*len += nread;
+	}
+
+	/* fread() returns 0 for EOF and for errors alike */
+	if (ferror(file)) err(1, "fread");
+
+	data[*len] = '\0';
+
+	return data;
+}
+
+/*
+ * puny [-e|-d] [file]
+ *
+ * Reads the first line of file (or of stdin when no file is given) as
+ * UTF-8, encodes (-e, default) or decodes (-d) it using puny_cfg and
+ * prints the result followed by a newline. Exits with status 1 and a
+ * diagnostic on any error.
+ */
+int
+main(int argc, const char **argv)
+{
+	uint8_t *in, *out;
+	uint32_t *u_in, *u_out;
+	size_t inlen, outlen;
+	size_t u_inlen;
+	const char *filepath;
+	bool encode;
+	char *tok;
+	FILE *file;
+	int ret;
+	int i;
+
+	encode = true;
+	filepath = NULL;
+	for (i = 1; i < argc; i++) {
+		if (!strcmp(argv[i], "-e")) {
+			encode = true;
+		} else if (!strcmp(argv[i], "-d")) {
+			encode = false;
+		} else if (!filepath) {
+			filepath = argv[i];
+		} else {
+			errx(1, "unknown arg %s", argv[i]);
+		}
+	}
+
+	if (filepath) {
+		file = fopen(filepath, "r");
+		if (!file) err(1, "fopen %s", filepath);
+		in = readall(file, &inlen);
+		fclose(file);
+	} else {
+		in = readall(stdin, &inlen);
+	}
+
+	/* only the first line is converted; recompute the length so that
+	 * bytes after it are not handed to the UTF-8 decoder */
+	tok = strchr((char *)in, '\n');
+	if (tok) *tok = '\0';
+	inlen = strlen((char *)in);
+
+	/* returns NULL (with errno set) e.g. for invalid UTF-8 */
+	u_in = u8_to_u32(in, inlen + 1, NULL, &u_inlen);
+	if (!u_in) err(1, "u8_to_u32");
+	u_out = NULL;
+
+	if (encode) {
+		ret = bootstr_encode(&puny_cfg, u_in, &u_out);
+		if (ret) errx(1, "encode: %s", strerror(ret));
+	} else {
+		ret = bootstr_decode(&puny_cfg, u_in, &u_out);
+		if (ret) errx(1, "decode: %s", strerror(ret));
+	}
+
+	/* decoding may produce values that are not valid code points */
+	out = u32_to_u8(u_out, u32_strlen(u_out) + 1, NULL, &outlen);
+	if (!out) err(1, "u32_to_u8");
+	printf("%s\n", (char *)out);
+
+	free(u_out);
+	free(u_in);
+	free(out);
+	free(in);
+
+	return 0;
+}
diff --git a/test/basic.in b/test/basic.in
new file mode 100644
index 0000000..71f4119
--- /dev/null
+++ b/test/basic.in
@@ -0,0 +1 @@
+他們爲什麽不說中文
\ No newline at end of file
diff --git a/test/basic.out b/test/basic.out
new file mode 100644
index 0000000..7e12b07
--- /dev/null
+++ b/test/basic.out
@@ -0,0 +1 @@
+ihqwctvzc91f659drss3x8bo0yb
\ No newline at end of file