libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

utf8-decode.c (7826B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stddef.h>
      3#include <stdint.h>
      4#include <stdio.h>
      5#include <string.h>
      6
      7#include "../grapheme.h"
      8#include "util.h"
      9
     10static const struct {
     11	char *arr;             /* UTF-8 byte sequence */
     12	size_t len;            /* length of UTF-8 byte sequence */
     13	size_t exp_len;        /* expected length returned */
     14	uint_least32_t exp_cp; /* expected codepoint returned */
     15} dec_test[] = {
     16	{
     17		/* empty sequence
     18	         * [ ] ->
     19	         * INVALID
     20	         */
     21		.arr = NULL,
     22		.len = 0,
     23		.exp_len = 0,
     24		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     25	},
     26	{
     27		/* invalid lead byte
     28	         * [ 11111101 ] ->
     29	         * INVALID
     30	         */
     31		.arr = (char *)(unsigned char[]) { 0xFD },
     32		.len = 1,
     33		.exp_len = 1,
     34		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     35	},
     36	{
     37		/* valid 1-byte sequence
     38	         * [ 00000001 ] ->
     39	         * 0000001
     40	         */
     41		.arr = (char *)(unsigned char[]) { 0x01 },
     42		.len = 1,
     43		.exp_len = 1,
     44		.exp_cp = 0x1,
     45	},
     46	{
     47		/* valid 2-byte sequence
     48	         * [ 11000011 10111111 ] ->
     49	         * 00011111111
     50	         */
     51		.arr = (char *)(unsigned char[]) { 0xC3, 0xBF },
     52		.len = 2,
     53		.exp_len = 2,
     54		.exp_cp = 0xFF,
     55	},
     56	{
     57		/* invalid 2-byte sequence (second byte missing)
     58	         * [ 11000011 ] ->
     59	         * INVALID
     60	         */
     61		.arr = (char *)(unsigned char[]) { 0xC3 },
     62		.len = 1,
     63		.exp_len = 2,
     64		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     65	},
     66	{
     67		/* invalid 2-byte sequence (second byte malformed)
     68	         * [ 11000011 11111111 ] ->
     69	         * INVALID
     70	         */
     71		.arr = (char *)(unsigned char[]) { 0xC3, 0xFF },
     72		.len = 2,
     73		.exp_len = 1,
     74		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     75	},
     76	{
     77		/* invalid 2-byte sequence (overlong encoded)
     78	         * [ 11000001 10111111 ] ->
     79	         * INVALID
     80	         */
     81		.arr = (char *)(unsigned char[]) { 0xC1, 0xBF },
     82		.len = 2,
     83		.exp_len = 2,
     84		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
     85	},
     86	{
     87		/* valid 3-byte sequence
     88	         * [ 11100000 10111111 10111111 ] ->
     89	         * 0000111111111111
     90	         */
     91		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0xBF },
     92		.len = 3,
     93		.exp_len = 3,
     94		.exp_cp = 0xFFF,
     95	},
     96	{
     97		/* invalid 3-byte sequence (second byte missing)
     98	         * [ 11100000 ] ->
     99	         * INVALID
    100	         */
    101		.arr = (char *)(unsigned char[]) { 0xE0 },
    102		.len = 1,
    103		.exp_len = 3,
    104		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    105	},
    106	{
    107		/* invalid 3-byte sequence (second byte malformed)
    108	         * [ 11100000 01111111 10111111 ] ->
    109	         * INVALID
    110	         */
    111		.arr = (char *)(unsigned char[]) { 0xE0, 0x7F, 0xBF },
    112		.len = 3,
    113		.exp_len = 1,
    114		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    115	},
    116	{
    117		/* invalid 3-byte sequence (short string, second byte malformed)
    118	         * [ 11100000 01111111 ] ->
    119	         * INVALID
    120	         */
    121		.arr = (char *)(unsigned char[]) { 0xE0, 0x7F },
    122		.len = 2,
    123		.exp_len = 1,
    124		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    125	},
    126	{
    127		/* invalid 3-byte sequence (third byte missing)
    128	         * [ 11100000 10111111 ] ->
    129	         * INVALID
    130	         */
    131		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF },
    132		.len = 2,
    133		.exp_len = 3,
    134		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    135	},
    136	{
    137		/* invalid 3-byte sequence (third byte malformed)
    138	         * [ 11100000 10111111 01111111 ] ->
    139	         * INVALID
    140	         */
    141		.arr = (char *)(unsigned char[]) { 0xE0, 0xBF, 0x7F },
    142		.len = 3,
    143		.exp_len = 2,
    144		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    145	},
    146	{
    147		/* invalid 3-byte sequence (overlong encoded)
    148	         * [ 11100000 10011111 10111111 ] ->
    149	         * INVALID
    150	         */
    151		.arr = (char *)(unsigned char[]) { 0xE0, 0x9F, 0xBF },
    152		.len = 3,
    153		.exp_len = 3,
    154		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    155	},
    156	{
    157		/* invalid 3-byte sequence (UTF-16 surrogate half)
    158	         * [ 11101101 10100000 10000000 ] ->
    159	         * INVALID
    160	         */
    161		.arr = (char *)(unsigned char[]) { 0xED, 0xA0, 0x80 },
    162		.len = 3,
    163		.exp_len = 3,
    164		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    165	},
    166	{
    167		/* valid 4-byte sequence
    168	         * [ 11110011 10111111 10111111 10111111 ] ->
    169	         * 011111111111111111111
    170	         */
    171		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0xBF },
    172		.len = 4,
    173		.exp_len = 4,
    174		.exp_cp = UINT32_C(0xFFFFF),
    175	},
    176	{
    177		/* invalid 4-byte sequence (second byte missing)
    178	         * [ 11110011 ] ->
    179	         * INVALID
    180	         */
    181		.arr = (char *)(unsigned char[]) { 0xF3 },
    182		.len = 1,
    183		.exp_len = 4,
    184		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    185	},
    186	{
    187		/* invalid 4-byte sequence (second byte malformed)
    188	         * [ 11110011 01111111 10111111 10111111 ] ->
    189	         * INVALID
    190	         */
    191		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF, 0xBF },
    192		.len = 4,
    193		.exp_len = 1,
    194		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    195	},
    196	{
    197		/* invalid 4-byte sequence (short string 1, second byte
    198	         * malformed) [ 11110011 011111111 ] -> INVALID
    199	         */
    200		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F },
    201		.len = 2,
    202		.exp_len = 1,
    203		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    204	},
    205	{
    206		/* invalid 4-byte sequence (short string 2, second byte
    207	         * malformed) [ 11110011 011111111 10111111 ] -> INVALID
    208	         */
    209		.arr = (char *)(unsigned char[]) { 0xF3, 0x7F, 0xBF },
    210		.len = 3,
    211		.exp_len = 1,
    212		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    213	},
    214
    215	{
    216		/* invalid 4-byte sequence (third byte missing)
    217	         * [ 11110011 10111111 ] ->
    218	         * INVALID
    219	         */
    220		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF },
    221		.len = 2,
    222		.exp_len = 4,
    223		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    224	},
    225	{
    226		/* invalid 4-byte sequence (third byte malformed)
    227	         * [ 11110011 10111111 01111111 10111111 ] ->
    228	         * INVALID
    229	         */
    230		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F, 0xBF },
    231		.len = 4,
    232		.exp_len = 2,
    233		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    234	},
    235	{
    236		/* invalid 4-byte sequence (short string, third byte malformed)
    237	         * [ 11110011 10111111 01111111 ] ->
    238	         * INVALID
    239	         */
    240		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0x7F },
    241		.len = 3,
    242		.exp_len = 2,
    243		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    244	},
    245	{
    246		/* invalid 4-byte sequence (fourth byte missing)
    247	         * [ 11110011 10111111 10111111 ] ->
    248	         * INVALID
    249	         */
    250		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF },
    251		.len = 3,
    252		.exp_len = 4,
    253		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    254	},
    255	{
    256		/* invalid 4-byte sequence (fourth byte malformed)
    257	         * [ 11110011 10111111 10111111 01111111 ] ->
    258	         * INVALID
    259	         */
    260		.arr = (char *)(unsigned char[]) { 0xF3, 0xBF, 0xBF, 0x7F },
    261		.len = 4,
    262		.exp_len = 3,
    263		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    264	},
    265	{
    266		/* invalid 4-byte sequence (overlong encoded)
    267	         * [ 11110000 10000000 10000001 10111111 ] ->
    268	         * INVALID
    269	         */
    270		.arr = (char *)(unsigned char[]) { 0xF0, 0x80, 0x81, 0xBF },
    271		.len = 4,
    272		.exp_len = 4,
    273		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    274	},
    275	{
    276		/* invalid 4-byte sequence (UTF-16-unrepresentable)
    277	         * [ 11110100 10010000 10000000 10000000 ] ->
    278	         * INVALID
    279	         */
    280		.arr = (char *)(unsigned char[]) { 0xF4, 0x90, 0x80, 0x80 },
    281		.len = 4,
    282		.exp_len = 4,
    283		.exp_cp = GRAPHEME_INVALID_CODEPOINT,
    284	},
    285};
    286
    287int
    288main(int argc, char *argv[])
    289{
    290	size_t i, failed;
    291
    292	(void)argc;
    293
    294	/* UTF-8 decoder test */
    295	for (i = 0, failed = 0; i < LEN(dec_test); i++) {
    296		size_t len;
    297		uint_least32_t cp;
    298
    299		len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len,
    300		                           &cp);
    301
    302		if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) {
    303			fprintf(stderr,
    304			        "%s: Failed test %zu: "
    305			        "Expected (%zx,%u), but got (%zx,%u).\n",
    306			        argv[0], i, dec_test[i].exp_len,
    307			        dec_test[i].exp_cp, len, cp);
    308			failed++;
    309		}
    310	}
    311	printf("%s: %zu/%zu unit tests passed.\n", argv[0],
    312	       LEN(dec_test) - failed, LEN(dec_test));
    313
    314	return (failed > 0) ? 1 : 0;
    315}