libbootstr-c

C bootstring encoding library
git clone https://git.sinitax.com/sinitax/libbootstr-c
Log | Files | Refs | LICENSE | sfeed.txt

commit 42636a96271693fc5545da691d624b51b6192b5f
Author: Louis Burda <quent.burda@gmail.com>
Date:   Fri, 17 Feb 2023 03:55:24 +0100

Initial version

Diffstat:
A .gitignore | 4 ++++
A Makefile | 31 +++++++++++++++++++++++++++++++
A README | 4 ++++
A bootstr.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A bootstr.h | 19 +++++++++++++++++++
A puny.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/basic.in | 2 ++
A test/basic.out | 2 ++
8 files changed, 507 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,4 @@
+puny
+bootstr
+*.o
+*.so
diff --git a/Makefile b/Makefile
@@ -0,0 +1,31 @@
+PREFIX ?= /usr/local
+BINDIR ?= /bin
+LIBDIR ?= /lib
+
+all: libbootstr.so puny
+
+clean:
+	rm -f puny
+
+puny: puny.c libbootstr.so
+	$(CC) -o $@ $(filter %.c,$^) -g -lunistring -L . -lbootstr
+
+test/%.phony: test/%.in test/%.out
+	@echo "test $*"
+	test "$(shell cat test/$*.in | ./puny -e)" = "$(shell cat test/$*.out)"
+	test "$(shell cat test/$*.out | ./puny -d)" = "$(shell cat test/$*.in)"
+
+test: puny test/basic.phony
+
+libbootstr.so: bootstr.o
+	$(CC) -o $@ $^ -fPIC -shared -lunistring
+
+install:
+	install -m755 libbootstr.so -t "$(DESTDIR)$(PREFIX)$(LIBDIR)"
+	install -m755 puny -t "$(DESTDIR)$(PREFIX)$(BINDIR)"
+
+uninstall:
+	rm -f "$(DESTDIR)$(PREFIX)$(LIBDIR)/libbootstr.so"
+	rm -f "$(DESTDIR)$(PREFIX)$(BINDIR)/puny"
+
+.PHONY: all clean test install uninstall
diff --git a/README b/README
@@ -0,0 +1,4 @@
+bootstr
+=======
+
+Bootstring-encoding library according to RFC#3492 with punycode test program.
diff --git a/bootstr.c b/bootstr.c
@@ -0,0 +1,325 @@
+#include "bootstr.h"
+
+#include <limits.h>
+#include <stdint.h>
+#include <unistr.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+/* two-argument min/max; each argument may be evaluated more than once */
+#define MIN(a, b) ((a) > (b) ? (b) : (a))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+/* file-local helpers: declared static here, defined further down */
+static int check_realloc(uint32_t **alloc, size_t reserve, size_t *cap);
+static int append_codes(uint32_t **alloc, size_t *len, size_t *cap,
+	const uint32_t *src, size_t srclen);
+static int check_config(const struct bootstr_cfg *cfg);
+/* Bias adaptation: scales delta, then repeatedly divides it by (baselen - tmin) while adding baselen to k; returns the new bias k. */
+static inline size_t
+bootstr_adapt(const struct bootstr_cfg *cfg, ssize_t delta,
+	ssize_t len, bool first)
+{
+	size_t k;
+	/* the first adaptation divides by cfg->damp, later ones by 2 */
+	delta = first ? delta / cfg->damp : delta / 2;
+	delta += delta / len; /* len must be non-zero; both callers pass a count + 1 */
+
+	k = 0;
+	while (delta > (cfg->baselen - cfg->tmin) * cfg->tmax / 2) {
+		delta /= cfg->baselen - cfg->tmin;
+		k += cfg->baselen;
+	}
+	k += (cfg->baselen - cfg->tmin + 1) * delta / (delta + cfg->skew); /* NOTE(review): divides by zero if delta + cfg->skew == 0; check_config() does not validate skew */
+
+	return k;
+}
+/* Grow *alloc (counted in uint32_t elements) when reserve >= *cap: the capacity becomes reserve if it was 0, else max(2 * cap, reserve). Returns 0, or errno when realloc() fails. */
+int
+check_realloc(uint32_t **alloc, size_t reserve, size_t *cap)
+{
+	if (reserve >= *cap) {
+		if (!*cap) {
+			*cap = reserve;
+		} else {
+			*cap = MAX(*cap * 2, reserve);
+		}
+		*alloc = realloc(*alloc, *cap * sizeof(uint32_t)); /* NOTE(review): on failure the previous buffer pointer is overwritten with NULL */
+		if (!*alloc) return errno;
+	}
+
+	return 0;
+}
+/* Append srclen code points from src to the array *alloc, growing it through check_realloc() and advancing *len. Returns 0 or the check_realloc() error. */
+int
+append_codes(uint32_t **alloc, size_t *len, size_t *cap,
+	const uint32_t *src, size_t srclen)
+{
+	int ret;
+
+	ret = check_realloc(alloc, *len + srclen, cap);
+	if (ret) return ret;
+
+	memcpy(*alloc + *len, src, srclen * sizeof(uint32_t));
+	*len += srclen;
+
+	return 0;
+}
+/* Validate cfg: requires 0 < tmin < baselen, tmax >= tmin, non-NULL delim and base, baselen > 0 and damp != 0. Returns 0 when valid, EINVAL otherwise. NOTE(review): is_basic, skew and tmax < baselen are not checked. */
+int
+check_config(const struct bootstr_cfg *cfg)
+{
+	if (cfg->tmin >= cfg->baselen || cfg->tmin <= 0)
+		return EINVAL;
+
+	if (cfg->tmax < cfg->tmin)
+		return EINVAL;
+
+	if (!cfg->delim)
+		return EINVAL;
+
+	if (!cfg->base || cfg->baselen <= 0)
+		return EINVAL;
+
+	if (!cfg->damp)
+		return EINVAL;
+
+	return 0;
+}
+/* Append delta to *out as a variable-length integer whose digits are taken from cfg->base. Returns 0, EINVAL, or an append_codes() error. The `in` parameter is not used. */
+int
+bootstr_encode_delta(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out,
+	size_t *outlen, size_t *outcap, ssize_t bias, ssize_t delta)
+{
+	ssize_t thresh;
+	ssize_t val;
+	ssize_t off;
+	ssize_t ci;
+	int ret;
+
+	val = delta;
+
+	off = cfg->baselen;
+	while (1) {
+		/* final digit must be under threshold */
+		thresh = MIN(cfg->tmax, MAX(cfg->tmin, off - bias));
+		if (val < thresh) break;
+
+		/* no room for encoding, invalid params */
+		if (thresh >= cfg->baselen)
+			return EINVAL;
+
+		/* encode char according to current base */
+		ci = thresh + (val - thresh) % (cfg->baselen - thresh);
+		val = (val - thresh) / (cfg->baselen - thresh);
+		if (ci >= cfg->baselen)
+			return EINVAL;
+
+		ret = append_codes(out, outlen, outcap, &cfg->base[ci], 1);
+		if (ret) return ret;
+
+		off += cfg->baselen;
+	}
+	/* the remaining value is below thresh and becomes the last digit */
+	ret = append_codes(out, outlen, outcap, &cfg->base[val], 1);
+	if (ret) return ret;
+
+	return 0;
+}
+/* Encode the NUL-terminated code point string `in` into a NUL-terminated array stored in *out. Returns 0 or an errno value (EINVAL, EOVERFLOW, or an allocation error). NOTE(review): *out is handed to realloc() with an assumed capacity of 0, so it presumably has to be NULL on entry (puny.c passes NULL) - confirm. */
+int
+bootstr_encode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out)
+{
+	size_t outlen, outcap;
+	size_t inlen;
+	ssize_t processed, basiclen;
+	ssize_t next_code, n;
+	ssize_t delta, bias;
+	ssize_t i;
+	int ret;
+
+	ret = check_config(cfg);
+	if (ret) return ret;
+
+	outlen = 0;
+	outcap = 0;
+
+	/* copy every basic code point of the input to the output, in input order */
+	inlen = u32_strlen(in);
+	for (i = 0; i < inlen; i++) {
+		if (cfg->is_basic(in[i]))
+			append_codes(out, &outlen, &outcap, &in[i], 1); /* NOTE(review): return value ignored here */
+	}
+	processed = outlen;
+	basiclen = outlen;
+
+	/* if any basic code points were copied, add delim */
+	if (outlen) {
+		ret = append_codes(out, &outlen, &outcap,
+			cfg->delim, u32_strlen(cfg->delim));
+		if (ret) return ret;
+	}
+
+	bias = cfg->initial_bias;
+	n = cfg->initial_n;
+	delta = 0;
+
+	/* encode rest of non-basic chars */
+	while (processed < inlen) {
+		next_code = SSIZE_MAX; /* becomes the smallest code point >= n found in this pass */
+		for (i = 0; i < inlen; i++) {
+			if (in[i] >= n && in[i] < next_code)
+				next_code = in[i];
+		}
+
+		/* calc insertions to skip until start of last round:
+		 * (processed + 1) insertions possible per round
+		 * (next_code - n) rounds todo */
+		if ((next_code - n) > (SSIZE_MAX - delta) / (processed + 1))
+			return EOVERFLOW;
+		delta += (next_code - n) * (processed + 1);
+
+		/* calculate number of skip to reach code in output at n */
+		n = next_code;
+		for (i = 0; i < inlen; i++) {
+			/* only consider characters already in output */
+			if (in[i] < n || cfg->is_basic(in[i])) {
+				delta += 1;
+				if (delta <= 0)
+					return EOVERFLOW;
+			}
+
+			/* reached the position of ONE of next_code */
+			if (in[i] == n) {
+				ret = bootstr_encode_delta(cfg, in, out,
+					&outlen, &outcap, bias, delta);
+				if (ret) return ret;
+				bias = bootstr_adapt(cfg, delta,
+					processed + 1, processed == basiclen);
+				delta = 0;
+				processed += 1;
+			}
+		}
+
+		delta += 1;
+		n += 1;
+	}
+	/* NUL-terminate the output */
+	ret = append_codes(out, &outlen, &outcap, U"\x00", 1);
+	if (ret) return ret;
+
+	return 0;
+}
+/* Read one variable-length integer from `in` starting at index *processed, add it to `state` and store the sum in *state_new; *processed is advanced past the digits consumed. Returns 0, EINVAL (end of input, or a code point that is not in cfg->base) or EOVERFLOW. */
+int
+bootstr_decode_delta(const struct bootstr_cfg *cfg, uint32_t *in,
+	ssize_t *processed, ssize_t bias, ssize_t state, ssize_t *state_new)
+{
+	ssize_t thresh;
+	ssize_t digit;
+	ssize_t mul;
+	ssize_t off;
+	uint32_t *tok;
+
+	/* construct integer from digits while accounting
+	 * for possibly different bases per digit */
+
+	mul = 1;
+	off = cfg->baselen;
+	while (1) {
+		if (!in[*processed]) return EINVAL;
+
+		tok = u32_strchr(cfg->base, in[*processed]);
+		if (!tok) return EINVAL;
+		*processed += 1;
+
+		digit = tok - cfg->base; /* digit value is the index within cfg->base */
+		if (digit > (SSIZE_MAX - state) / mul)
+			return EOVERFLOW;
+		state += digit * mul;
+
+		thresh = MIN(cfg->tmax, MAX(cfg->tmin, off - bias));
+		if (digit < thresh) break;
+
+		if (mul > SSIZE_MAX / (cfg->baselen - thresh))
+			return EOVERFLOW;
+		mul *= cfg->baselen - thresh;
+
+		off += cfg->baselen;
+	}
+	*state_new = state;
+
+	return 0;
+}
+/* Decode the NUL-terminated string `in` into a NUL-terminated code point array stored in *out. Returns 0 or an errno value (EINVAL, EOVERFLOW, or an allocation error). */
+int
+bootstr_decode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out)
+{
+	size_t outlen, outcap;
+	size_t inlen;
+	ssize_t basiclen;
+	ssize_t processed, n;
+	ssize_t state, state_new, bias;
+	ssize_t i, len; /* len is never used */
+	int ret;
+
+	ret = check_config(cfg);
+	if (ret) return ret;
+
+	outlen = 0;
+	outcap = 0;
+
+	basiclen = 0;
+	inlen = u32_strlen(in);
+	/* NOTE(review): u32_strcmp compares the whole remaining input with cfg->delim, so this presumably only matches a delimiter at the very end of the input - confirm against the libunistring docs */
+	/* find basic prefix delim; any non-basic code point seen before a match is rejected */
+	for (i = 0; i < inlen; i++) {
+		if (!u32_strcmp(in + i, cfg->delim)) {
+			basiclen = i;
+			break;
+		}
+		if (!cfg->is_basic(in[i]))
+			return EINVAL;
+	}
+
+	/* copy basic prefix to output */
+	if (basiclen)
+		append_codes(out, &outlen, &outcap, in, basiclen); /* NOTE(review): return value ignored here */
+
+	n = cfg->initial_n;
+	bias = cfg->initial_bias;
+	state = 0;
+
+	/* decode rest of non-basic chars */
+	for (processed = basiclen; processed < inlen; ) { /* NOTE(review): when a delimiter was found, decoding starts at the delimiter itself rather than after it */
+		/* decode delta and add to state */
+		ret = bootstr_decode_delta(cfg, in, &processed,
+			bias, state, &state_new);
+		if (ret) return ret;
+
+		/* use delta to calculate new bias */
+		bias = bootstr_adapt(cfg, state_new - state,
+			outlen + 1, state == 0);
+		state = state_new;
+
+		/* split up state into rounds and index */
+		if (state / (outlen + 1) > (SSIZE_MAX - n))
+			return EOVERFLOW;
+		n += state / (outlen + 1);
+		state %= outlen + 1;
+
+		/* insert current code */
+		ret = check_realloc(out, outlen + 1, &outcap);
+		if (ret) return ret;
+		memmove(*out + state + 1, *out + state,
+			(outlen - state) * sizeof(uint32_t));
+		(*out)[state] = n; /* n is an ssize_t; it is converted to uint32_t on store */
+		state += 1;
+		outlen += 1;
+	}
+	/* NUL-terminate the output */
+	ret = append_codes(out, &outlen, &outcap, U"\x00", 1);
+	if (ret) return ret;
+
+	return 0;
+}
diff --git a/bootstr.h b/bootstr.h
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+/* Encoding parameters consumed by bootstr_encode() and bootstr_decode(). NOTE(review): ssize_t is used below but <sys/types.h> is not included; this relies on <stdlib.h> declaring it. */
+struct bootstr_cfg {
+	const uint32_t *base; /* digit alphabet: base[i] is the code point for digit value i */
+	ssize_t baselen; /* numeric base; digit values emitted by the encoder are < baselen */
+	const uint32_t *delim; /* NUL-terminated delimiter appended after the basic code points */
+	bool (*is_basic)(uint32_t c); /* predicate: true for code points that are copied through literally */
+	ssize_t tmin, tmax; /* lower and upper clamp of the per-digit threshold */
+	ssize_t skew, damp; /* constants used by the bias adaptation */
+	ssize_t initial_bias; /* bias used for the first delta */
+	ssize_t initial_n; /* starting value of the code point counter n */
+};
+/* Both functions return 0 on success or an errno value; *out receives a NUL-terminated array obtained from realloc(), which puny.c releases with free(). */
+int bootstr_encode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out);
+int bootstr_decode(const struct bootstr_cfg *cfg, uint32_t *in, uint32_t **out);
diff --git a/puny.c b/puny.c
@@ -0,0 +1,120 @@
+#include "bootstr.h"
+
+#include <unistr.h>
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+/* number of bytes requested per fread() call in readall() */
+#define CHUNKSIZE 4096
+
+bool is_ascii(uint32_t c);
+/* Punycode parameter set (the values listed in RFC 3492 section 5) */
+const struct bootstr_cfg puny_cfg = {
+	.base = U"abcdefghijklmnopqrstuvwxyz0123456789",
+	.baselen = 36,
+	.delim = U"-",
+	.is_basic = is_ascii,
+	.tmin = 1,
+	.tmax = 26,
+	.skew = 38,
+	.damp = 700,
+	.initial_bias = 72,
+	.initial_n = 128
+};
+/* basic code points are those below 128 */
+bool
+is_ascii(uint32_t c)
+{
+	return c < 128;
+}
+/* Read the whole stream into a malloc'd, NUL-terminated buffer and store the byte count (excluding the NUL) in *len. Exits through err() when an allocation fails. */
+uint8_t *
+readall(FILE *file, size_t *len)
+{
+	ssize_t nread;
+	size_t cap;
+	uint8_t *data;
+
+	*len = 0;
+	cap = CHUNKSIZE + 1;
+	data = malloc(cap);
+	if (!data) err(1, "malloc");
+
+	while (1) {
+		if (*len + CHUNKSIZE + 1 > cap) { /* keep room for one more chunk plus the terminator */
+			cap *= 2;
+			data = realloc(data, cap);
+			if (!data) err(1, "realloc");
+		}
+
+		nread = fread(data + *len, 1, CHUNKSIZE, file);
+		if (nread <= 0) break; /* end of file or read error; ferror() is not consulted */
+
+		*len += nread;
+	}
+
+	*(data + *len) = '\0';
+
+	return data;
+}
+
+int
+main(int argc, const char **argv)
+{
+	const char **arg;
+	uint8_t *in, *out;
+	uint32_t *u_in, *u_out;
+	size_t inlen, outlen;
+	size_t u_inlen, u_outlen; /* u_outlen is never used */
+	const char *filepath;
+	bool encode;
+	char *tok;
+	FILE *file;
+	int ret;
+	/* arguments: -e selects encoding (the default), -d selects decoding, the first other argument is the input file; stdin is read when no file is given */
+	encode = true;
+	filepath = NULL;
+	for (arg = argv + 1; *arg; arg++) {
+		if (!strcmp(*arg, "-e")) {
+			encode = true;
+		} else if (!strcmp(*arg, "-d")) {
+			encode = false;
+		} else if (!filepath) {
+			filepath = *arg;
+		} else {
+			errx(1, "unknown arg %s", *arg);
+		}
+	}
+	/* read the whole input, then cut it at the first newline */
+	out = NULL;
+	if (filepath) {
+		file = fopen(filepath, "r");
+		if (!file) err(1, "fopen %s", filepath);
+		in = readall(file, &inlen);
+		fclose(file);
+	} else {
+		in = readall(stdin, &inlen);
+	}
+	tok = strchr((char *)in, '\n');
+	if (tok) *tok = '\0';
+
+	u_in = u8_to_u32(in, inlen + 1, NULL, &u_inlen); /* NOTE(review): result is not checked for NULL, and inlen + 1 still covers the bytes after the first line */
+	u_out = NULL;
+
+	if (encode) {
+		ret = bootstr_encode(&puny_cfg, u_in, &u_out);
+		if (ret) errx(1, "encode: %s", strerror(ret));
+	} else {
+		ret = bootstr_decode(&puny_cfg, u_in, &u_out);
+		if (ret) errx(1, "decode: %s", strerror(ret));
+	}
+	/* convert the result (terminator included) back to UTF-8 and print it */
+	out = u32_to_u8(u_out, u32_strlen(u_out) + 1, NULL, &outlen); /* NOTE(review): result is not checked for NULL before printing */
+	printf("%s\n", (char *)out);
+
+	free(u_out);
+	free(u_in);
+	free(out);
+	free(in);
+}
diff --git a/test/basic.in b/test/basic.in
@@ -0,0 +1 @@
+他們爲什麽不說中文
+\ No newline at end of file
diff --git a/test/basic.out b/test/basic.out
@@ -0,0 +1 @@
+ihqwctvzc91f659drss3x8bo0yb
+\ No newline at end of file