commit 21b6f66acc659e8c515d4685a11fa534a289af14
parent 52a25d52f16697e74dfd582217de5d169c3790cb
Author: Laslo Hunhold <dev@frign.de>
Date: Sun, 31 May 2020 22:44:06 +0200
Add UTF-8-encode-function
For a library that merely detects grapheme clusters, the reasoning
behind adding an encoding function is not immediately apparent. The
main reason is that some decoding scenarios actually change the text
representation (by identifying invalid code points and outputting
them as such).
The user should have the chance to output a "processed" stream.
A minor benefit with very little overhead is that this encoding
function is just useful in general.
Signed-off-by: Laslo Hunhold <dev@frign.de>
Diffstat:
2 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/src/codepoint.c b/src/codepoint.c
@@ -9,7 +9,8 @@
static const struct {
uint8_t lower; /* lower bound of sequence first byte */
uint8_t upper; /* upper bound of sequence first byte */
- uint32_t mincp; /* smallest non-overlong encoded codepoint */
+ uint32_t mincp; /* smallest non-overlong encoded code point */
+ uint32_t maxcp; /* largest encodable code point */
/*
* implicit: table-offset represents the number of following
* bytes of the form 10xxxxxx (6 bits capacity each)
@@ -20,24 +21,28 @@ static const struct {
.lower = 0x00, /* 00000000 */
.upper = 0x7F, /* 01111111 */
.mincp = (uint32_t)0,
+ .maxcp = ((uint32_t)1 << 7) - 1, /* 7 bits capacity */
},
[1] = {
/* 110xxxxx */
.lower = 0xC0, /* 11000000 */
.upper = 0xDF, /* 11011111 */
- .mincp = (uint32_t)1 << 7, /* [0] has 7 bits capacity */
+ .mincp = (uint32_t)1 << 7,
+ .maxcp = ((uint32_t)1 << 11) - 1, /* 5+6=11 bits capacity */
},
[2] = {
/* 1110xxxx */
.lower = 0xE0, /* 11100000 */
.upper = 0xEF, /* 11101111 */
- .mincp = (uint32_t)1 << 11, /* [1] has 5+6=11 bits capacity */
+ .mincp = (uint32_t)1 << 11,
+ .maxcp = ((uint32_t)1 << 16) - 1, /* 4+6+6=16 bits capacity */
},
[3] = {
/* 11110xxx */
.lower = 0xF0, /* 11110000 */
.upper = 0xF7, /* 11110111 */
- .mincp = (uint32_t)1 << 16, /* [2] has 4+6+6=16 bits capacity */
+ .mincp = (uint32_t)1 << 16,
+ .maxcp = ((uint32_t)1 << 21) - 1, /* 3+6+6+6=21 bits capacity */
},
};
@@ -117,3 +122,55 @@ grapheme_cp_decode(uint32_t *cp, const uint8_t *s, size_t n)
return 1 + off;
}
+
+size_t
+grapheme_cp_encode(uint32_t cp, uint8_t *s, size_t n)
+{
+ size_t off, i;
+
+ if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
+ cp > UINT32_C(0x10FFFF)) {
+ /*
+ * code point is a high or low UTF-16 surrogate half
+ * (0xD800..0xDFFF) or not representable in UTF-16
+ * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
+ */
+ cp = CP_INVALID;
+ }
+
+ /* determine necessary sequence type */
+ for (off = 0; off < LEN(lut); off++) {
+ if (cp <= lut[off].maxcp) {
+ break;
+ }
+ }
+ if (1 + off > n) {
+ /* specified buffer is too small to store sequence */
+ return 1 + off;
+ }
+
+ /* build sequence by filling cp-bits into each byte */
+
+ /*
+ * lut[off].lower is the bit-format for the first byte and
+ * the bits to fill into it are determined by shifting the
+ * cp 6 times the number of following bytes, as each
+ * following byte stores 6 bits, yielding the wanted bits.
+ *
+ * We do not overwrite the mask because we guaranteed earlier
+ * that there are no bits higher than the mask allows.
+ */
+ s[0] = lut[off].lower | (cp >> (6 * off));
+
+ for (i = 1; i <= off; i++) {
+ /*
+ * the bit-format for following bytes is 10000000 (0x80)
+ * and each one stores 6 bits in its 6 low bits, which we
+ * extract from the properly-shifted value using the
+ * mask 00111111 (0x3F)
+ */
+ s[i] = 0x80 | ((cp >> (6 * (off - i))) & 0x3F);
+ }
+
+ return 1 + off;
+}
diff --git a/src/codepoint.h b/src/codepoint.h
@@ -10,5 +10,6 @@ typedef uint32_t Codepoint;
#define CP_INVALID 0xFFFD
size_t grapheme_cp_decode(uint32_t *, const uint8_t *, size_t);
+size_t grapheme_cp_encode(uint32_t, uint8_t *, size_t);
#endif /* CODEPOINT_H */