utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

charwidth.c (3117B)


      1#include "tests.h"
      2#include <ctype.h>
      3#include <wchar.h>
      4
      5static int my_unassigned(int c) {
      6    int cat = utf8proc_get_property(c)->category;
      7    return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
      8}
      9
     10static int my_isprint(int c) {
     11    int cat = utf8proc_get_property(c)->category;
     12    return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) ||
     13           (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) ||
     14           (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO);
     15}
     16
     17int main(int argc, char **argv)
     18{
     19    int c, error = 0, updates = 0;
     20
     21    (void) argc; /* unused */
     22    (void) argv; /* unused */
     23
     24    /* some simple sanity tests of the character widths */
     25    for (c = 0; c <= 0x110000; ++c) {
     26        int cat = utf8proc_get_property(c)->category;
     27        int w = utf8proc_charwidth(c);
     28        if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) {
     29            fprintf(stderr, "nonzero width %d for combining char %x\n", w, c);
     30            error += 1;
     31        }
     32        if (w == 0 &&
     33            ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) ||
     34             (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) ||
     35             (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) {
     36            fprintf(stderr, "zero width for symbol-like char %x\n", c);
     37            error += 1;
     38        }
     39        if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) {
     40            fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n",
     41            wcwidth(c), w,
     42            isprint(c) ? "printable" : "non-printable", c);
     43            error += 1;
     44        }
     45        if (!my_isprint(c) && w > 0) {
     46            fprintf(stderr, "non-printing %x had width %d\n", c, w);
     47            error += 1;
     48        }
     49        if (my_unassigned(c) && w != 1) {
     50            fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c);
     51            error += 1;
     52        }
     53    }
     54    check(!error, "utf8proc_charwidth FAILED %d tests.", error);
     55
     56    check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)");
     57    check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)");
     58
     59    /* print some other information by compariing with system wcwidth */
     60    printf("Mismatches with system wcwidth (not necessarily errors):\n");
     61    for (c = 0; c <= 0x110000; ++c) {
     62        int w = utf8proc_charwidth(c);
     63        int wc = wcwidth(c);
     64        if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue;
     65        /* lots of these errors for out-of-date system unicode tables */
     66        if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0)
     67            updates += 1;
     68        if (wc == -1 && !my_isprint(c) && w > 0)
     69            printf("  wcwidth(%x) = -1 for non-printable width-%d char\n", c, w);
     70        if (wc >= 0 && wc != w)
     71            printf("  wcwidth(%x) = %d != charwidth %d\n", c, wc, w);
     72    }
     73    printf("   ... (positive widths for %d chars unknown to wcwidth) ...\n", updates);
     74    printf("Character-width tests SUCCEEDED.\n");
     75
     76    return 0;
     77}