utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

data_generator.jl (20637B)


      1using OffsetArrays: Origin
      2
      3parsehex(str) = parse(UInt32, str, base=16)
      4
      5function parse_hex_range(line)
      6    m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
      7    if isnothing(m)
      8        return nothing
      9    end
     10    i = parsehex(m[1])
     11    j = !isnothing(m[3]) ? parsehex(m[3]) : i
     12    desc = rstrip(m[4])
     13    return (i:j, desc)
     14end
     15
     16function read_hex_ranges(filename)
     17    [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)]
     18end
     19
     20function collect_codepoints(range_desc, description)
     21    list = UInt32[]
     22    for (r,d) in range_desc
     23        if d == description
     24            append!(list, r)
     25        end
     26    end
     27    list
     28end
     29
     30function set_all!(d, keys, value)
     31    for k in keys
     32        d[k] = value
     33    end
     34end
     35
     36#-------------------------------------------------------------------------------
     37
# Ranges of codepoints per derived core property, parsed once and reused
# by the table-building code below.
derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")

# NB: `uppercase`/`lowercase` here are Sets of codepoints that shadow
# Base.uppercase/Base.lowercase in this file; later code calls
# Base.uppercase explicitly where the string function is meant.
ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point"))
uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase"))
lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase"))
     43
     44
     45#-------------------------------------------------------------------------------
     46function derive_indic_conjunct_break(derived_core_properties)
     47    props = Dict{UInt32, String}()
     48    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"),    "LINKER")
     49    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT")
     50    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"),    "EXTEND")
     51    props
     52end
     53
# Capture the lookup table in a `let` so only the accessor is globally visible.
let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties)
    # Return the InCB class name for `code` ("LINKER", "CONSONANT" or
    # "EXTEND"), or "NONE" when the codepoint has no InCB property.
    global function get_indic_conjunct_break(code)
        get(indic_conjunct_break, code, "NONE")
    end
end
     59
     60#-------------------------------------------------------------------------------
     61function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
     62    grapheme_boundclass = Dict{UInt32, String}()
     63    for (r,desc) in read_hex_ranges(grapheme_break_filename)
     64        set_all!(grapheme_boundclass, r, Base.uppercase(desc))
     65    end
     66    for (r,desc) in read_hex_ranges(emoji_data_filename)
     67        if desc == "Extended_Pictographic"
     68            set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC")
     69        elseif desc == "Emoji_Modifier"
     70            set_all!(grapheme_boundclass, r, "EXTEND")
     71        end
     72    end
     73    return grapheme_boundclass
     74end
     75
# Capture the table in a `let`; only the accessor escapes as a global.
let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
    # Return the grapheme-break class name for `code`, defaulting to "OTHER".
    global function get_grapheme_boundclass(code)
        get(grapheme_boundclasses, code, "OTHER")
    end
end
     81
     82#-------------------------------------------------------------------------------
     83function read_composition_exclusions(pattern)
     84    section = match(pattern, read("CompositionExclusions.txt",String)).match
     85    es = UInt32[]
     86    for line in split(section, '\n')
     87        m = match(r"^([0-9A-F]+) +#"i, line)
     88        if !isnothing(m)
     89            push!(es, parsehex(m[1]))
     90        end
     91    end
     92    es
     93end
     94
# Section (1): script-specific precomposed characters excluded from composition.
exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s))
# Section (2): characters composed after their components' Unicode version.
excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s))
     97
     98#-------------------------------------------------------------------------------
     99function read_case_folding(filename)
    100    case_folding = Dict{UInt32,Vector{UInt32}}()
    101    for line in readlines(filename)
    102        m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
    103        !isnothing(m) || continue
    104        case_folding[parsehex(m[1])] = parsehex.(split(m[2]))
    105    end
    106    case_folding
    107end
    108
# Capture the table in a `let`; only the accessor escapes as a global.
let case_folding = read_case_folding("CaseFolding.txt")
    # Return the full case-folding sequence for `code`, or `nothing` if unmapped.
    global function get_case_folding(code)
        get(case_folding, code, nothing)
    end
end
    114
    115#-------------------------------------------------------------------------------
    116# Utilities for reading per-char properties from UnicodeData.txt
# Parse one semicolon-separated line of UnicodeData.txt into a named tuple
# of the character properties this generator uses (UAX #44 field layout;
# matched-but-unused fields are simply dropped).
function split_unicode_data_line(line)
    m = match(r"""
      ([0-9A-F]+);        # code
      ([^;]+);            # name
      ([A-Z]+);           # general category
      ([0-9]+);           # canonical combining class
      ([A-Z]+);           # bidi class
      (<([A-Z]*)>)?       # decomposition type
      ((\ ?[0-9A-F]+)*);  # decompomposition mapping
      ([0-9]*);           # decimal digit
      ([0-9]*);           # digit
      ([^;]*);            # numeric
      ([YN]*);            # bidi mirrored
      ([^;]*);            # unicode 1.0 name
      ([^;]*);            # iso comment
      ([0-9A-F]*);        # simple uppercase mapping
      ([0-9A-F]*);        # simple lowercase mapping
      ([0-9A-F]*)$        # simple titlecase mapping
    """ix, line)
    @assert !isnothing(m)
    code = parse(UInt32, m[1], base=16)
    # NB: the bare names `uppercase`/`lowercase` below are the codepoint Sets
    # built from DerivedCoreProperties.txt above, not the Base functions.
    (code             = code,
     name             = m[2],
     category         = m[3],
     combining_class  = parse(Int, m[4]),
     bidi_class       = m[5],
     decomp_type      = m[7],
     decomp_mapping   = m[8] == "" ? nothing : parsehex.(split(m[8])),
     bidi_mirrored    = m[13] == "Y",
     # issue #130: use nonstandard uppercase ß -> ẞ
     # issue #195: if character is uppercase but has no lowercase mapping,
     #             then make lowercase mapping = itself (vice versa for lowercase)
     uppercase_mapping = m[16] != ""                      ? parsehex(m[16]) :
                         code  == 0x000000df              ? 0x00001e9e      :
                         m[17] == "" && code in lowercase ? code            :
                         nothing,
     lowercase_mapping = m[17] != ""                      ? parsehex(m[17]) :
                         m[16] == "" && code in uppercase ? code            :
                         nothing,
     titlecase_mapping = m[18] != ""         ? parsehex(m[18]) :
                         code  == 0x000000df ? 0x00001e9e      :
                         nothing,
    )
end
    161
    162function read_unicode_data(filename)
    163    raw_char_props = split_unicode_data_line.(readlines(filename))
    164    char_props = Origin(0)(Vector{eltype(raw_char_props)}())
    165    @assert issorted(raw_char_props, by=c->c.code)
    166    raw_char_props = Iterators.Stateful(raw_char_props)
    167    while !isempty(raw_char_props)
    168        c = popfirst!(raw_char_props)
    169        if occursin(", First>", c.name)
    170            nc = popfirst!(raw_char_props)
    171            @assert occursin(", Last>", nc.name)
    172            name = replace(c.name, ", First"=>"")
    173            for i in c.code:nc.code
    174                push!(char_props, (; c..., name=name, code=i))
    175            end
    176        else
    177            push!(char_props, c)
    178        end
    179    end
    180    return char_props
    181end
    182
# Per-codepoint properties from UnicodeData.txt (0-origin vector), plus a
# codepoint-keyed lookup table over the same records.
char_props = read_unicode_data("UnicodeData.txt")
char_hash = Dict(c.code=>c for c in char_props)
    185
    186#-------------------------------------------------------------------------------
    187# Read character widths from UAX #11: East Asian Width
    188function read_east_asian_widths(filename)
    189    ea_widths = Dict{UInt32,Int}()
    190    for (rng,widthcode) in read_hex_ranges(filename)
    191        w = widthcode == "W" || widthcode == "F" ? 2 : # wide or full
    192            widthcode == "Na"|| widthcode == "H" ? 1 : # narrow or half-width
    193            nothing
    194        if !isnothing(w)
    195            set_all!(ea_widths, rng, w)
    196        end
    197    end
    198    return ea_widths
    199end
    200
let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
    # Following work by @jiahao, we compute character widths using a combination of
    #   * character category
    #   * UAX 11: East Asian Width
    #   * a few exceptions as needed
    # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
    global function derive_char_width(code, category)
        # Use a default width of 1 for all character categories that are
        # letter/symbol/number-like, as well as for unassigned/private-use chars.
        # This provides a useful nonzero fallback for new codepoints when a new
        # Unicode version has been released.
        width = 1

        # Various zero-width categories
        #
        # "Sk" not included in zero width - see issue #167
        if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
            width = 0
        end

        # Widths from UAX #11: East Asian Width
        eaw = get(ea_widths, code, nothing)
        if !isnothing(eaw)
            width = eaw
        end

        # A few exceptional cases, found by manual comparison to other wcwidth
        # functions and similar checks.
        if category == "Mn"
            # Nonspacing marks are forced back to zero width even when the
            # East Asian Width lookup above assigned them a width.
            width = 0
        end

        if code == 0x00ad
            # Soft hyphen is typically printed as a hyphen (-) in terminals.
            width = 1
        elseif code == 0x2028 || code == 0x2029
            # By definition, should have zero width (on the same line):
            #   U+2028 category Zl, name LINE SEPARATOR
            #   U+2029 category Zp, name PARAGRAPH SEPARATOR
            # (The original comments embedded the literal separator characters.)
            width = 0
        end

        return width
    end
end
    246
    247#-------------------------------------------------------------------------------
    248# Construct data tables which will drive libutf8proc
    249#
    250# These tables are "compressed" with an ad-hoc compression scheme (largely some
    251# simple deduplication and indexing) which can easily and efficiently be
    252# decompressed on the C side at runtime.
    253
# Inverse decomposition mapping tables for combining two characters into a single one.
comb1st_indices = Dict{UInt32,Int}()                # first char  => row index
comb1st_indices_sorted_keys = Origin(0)(UInt32[])   # rows in first-seen order
comb2nd_indices = Dict{UInt32,Int}()                # second char => column index
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])   # columns in first-seen order
comb2nd_indices_nonbasic = Set{UInt32}()            # second chars composing to > 0xFFFF
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())  # row => (column => composed char)
for char in char_props
    # Only canonical (no decomp_type) two-character decompositions whose first
    # character has combining class 0 and whose composition is not excluded
    # participate in recomposition.
    if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
            length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
            char_hash[char.decomp_mapping[1]].combining_class == 0 &&
            char.code ∉ exclusions
        dm0 = char.decomp_mapping[1]
        dm1 = char.decomp_mapping[2]
        if !haskey(comb1st_indices, dm0)
            comb1st_indices[dm0] = length(comb1st_indices)
            push!(comb1st_indices_sorted_keys, dm0)
            push!(comb_array, Dict{Int,UInt32}())
            @assert length(comb1st_indices) == length(comb_array)
        end
        if !haskey(comb2nd_indices, dm1)
            push!(comb2nd_indices_sorted_keys, dm1)
            comb2nd_indices[dm1] = length(comb2nd_indices)
        end
        @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
        comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
        # Compositions above 0xFFFF need two UInt16 slots in the C table.
        if char.code > 0xFFFF
            push!(comb2nd_indices_nonbasic, dm1)
        end
    end
end
    285
# Flatten the composition table into the layout decoded on the C side: for
# each first-char row, the span of second-char "offsets" that actually hold
# compositions, plus a per-character index entry in comb_indices.
comb_indices = Dict{UInt32,Int}()
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
let
    cumoffset = 0
    for dm0 in comb1st_indices_sorted_keys
        index = comb1st_indices[dm0]
        first = nothing
        last = nothing
        offset = 0
        # Walk all second chars. "Nonbasic" second chars (those with a
        # composition above 0xFFFF) occupy two offset slots, so a running
        # offset is tracked instead of using the column index directly.
        for b in eachindex(comb2nd_indices_sorted_keys)
            dm1 = comb2nd_indices_sorted_keys[b]
            if haskey(comb_array[index], b)
                if isnothing(first)
                    first = offset
                end
                last = offset
                if dm1 in comb2nd_indices_nonbasic
                    last += 1
                end
            end
            offset += 1
            if dm1 in comb2nd_indices_nonbasic
                offset += 1 
            end
        end
        comb1st_indices_firstoffsets[index] = first
        comb1st_indices_lastoffsets[index] = last
        @assert !haskey(comb_indices, dm0)
        comb_indices[dm0] = cumoffset
        # Each row stores first/last offsets (2 entries) plus the span itself.
        cumoffset += last - first + 1 + 2
    end

    # Second chars get an index tagged with bit 0x8000 (and 0x4000 for the
    # two-slot "nonbasic" case); the low bits index into the offset space.
    offset = 0
    for dm1 in comb2nd_indices_sorted_keys
        @assert !haskey(comb_indices, dm1)
        comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
        @assert comb2nd_indices[dm1] + offset <= 0x4000
        if dm1 in comb2nd_indices_nonbasic
            comb_indices[dm1] |= 0x4000
            offset += 1
        end
    end
end
    330
    331utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))
    332
# Utility for packing all UTF-16 encoded sequences into one big array
struct UTF16Sequences
    storage::Vector{UInt16}            # all encoded sequences, concatenated
    indices::Dict{Vector{UInt16},Int}  # sequence => start offset in storage (dedup)
end
# Construct an empty sequence pool.
UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}())
    339
    340"""
    341Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
    342* The 14 low bits are the index into the `sequences.storage` array where the
    343  sequence resides
    344* The two top bits are the length of the sequence, or if equal to 3, the first
    345  entry of the sequence itself contains the length.
    346"""
    347function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
    348    if length(utf32_seq) == 0
    349        return typemax(UInt16)
    350    end
    351    # lencode contains the length of the UTF-32 sequence after decoding
    352    # No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
    353    # We have only 2 bits for the length, though, so longer sequences are
    354    # encoded in the sequence data itself.
    355    seq_lencode = length(utf32_seq) - 1
    356    utf16_seq = utf16_encode(utf32_seq)
    357    idx = get!(sequences.indices, utf16_seq) do
    358        i = length(sequences.storage)
    359        utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
    360                        pushfirst!(copy(utf16_seq), seq_lencode)
    361        append!(sequences.storage, utf16_seq_enc)
    362        i
    363    end
    364    @assert idx <= 0x3FFF
    365    seq_code = idx | (min(seq_lencode, 3) << 14)
    366    return seq_code
    367end
    368
    369function encode_sequence!(sequences::UTF16Sequences, code::Integer)
    370    encode_sequence!(sequences, [code])
    371end
    372
    373function encode_sequence!(sequences::UTF16Sequences, ::Nothing)
    374    return typemax(UInt16)
    375end
    376
# Assemble the per-character property record that will be emitted into the C
# properties table. All sequence-valued fields are interned into `sequences`
# (mutated — hence the `!`); scalar fields are copied through or looked up in
# the tables built earlier in this file.
function char_table_properties!(sequences, char)
    code = char.code

    return (
        category             = char.category,
        combining_class      = char.combining_class,
        bidi_class           = char.bidi_class,
        decomp_type          = char.decomp_type,
        decomp_seqindex      = encode_sequence!(sequences, char.decomp_mapping),
        casefold_seqindex    = encode_sequence!(sequences, get_case_folding(code)),
        uppercase_seqindex   = encode_sequence!(sequences, char.uppercase_mapping),
        lowercase_seqindex   = encode_sequence!(sequences, char.lowercase_mapping),
        titlecase_seqindex   = encode_sequence!(sequences, char.titlecase_mapping),
        comb_index           = get(comb_indices, code, typemax(UInt16)),
        bidi_mirrored        = char.bidi_mirrored,
        comp_exclusion       = code in exclusions || code in excl_version,
        ignorable            = code in ignorable,
        # U+200C/U+200D (ZWNJ/ZWJ) are category Cf but are deliberately
        # excluded from the control-boundary set here.
        control_boundary     = char.category in ("Zl", "Zp", "Cc", "Cf") &&
                               !(char.code in (0x200C, 0x200D)),
        charwidth            = derive_char_width(code, char.category),
        boundclass           = get_grapheme_boundclass(code),
        indic_conjunct_break = get_indic_conjunct_break(code),
    )
end
    401
# Many character properties are duplicates. Deduplicate them, constructing a
# per-character array of indices into the properties array
sequences = UTF16Sequences()
char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]

deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
# One index slot per codepoint of the entire code space (0 .. 0x10FFFF).
char_property_indices = Origin(0)(zeros(Int, 0x00110000))
let index_map = Dict{eltype(char_table_props),Int}()
    for (char, table_props) in zip(char_props, char_table_props)
        # Reuse the index of an identical property record if one was seen.
        entry_idx = get!(index_map, table_props) do
            idx = length(deduplicated_props)
            push!(deduplicated_props, table_props)
            idx
        end
        # Add 1 because unassigned codes occupy slot at index 0
        char_property_indices[char.code] = entry_idx + 1
    end
end
    420
# Now compress char_property_indices by breaking it into pages and
# deduplicating those (this works as compression because there are large
# contiguous ranges of code space with identical properties)
prop_page_indices = Int[]
prop_pages = Int[]
let
    page_size = 0x100
    page_index_map = Dict{Vector{Int}, Int}()
    for page in Iterators.partition(char_property_indices, page_size)
        # Emit each distinct page once; repeated pages share a start offset.
        page_idx = get!(page_index_map, page) do
            idx = length(prop_pages)
            append!(prop_pages, page)
            idx
        end
        push!(prop_page_indices, page_idx)
    end
end
    438
    439#-------------------------------------------------------------------------------
    440function write_c_index_array(io, array, linelen)
    441    print(io, "{\n  ")
    442    i = 0
    443    for x in array
    444        i += 1
    445        if i == linelen
    446            i = 0
    447            print(io, "\n  ")
    448        end
    449        print(io, x, ", ")
    450    end
    451    print(io, "};\n\n")
    452end
    453
    454function c_enum_name(prefix, str)
    455    if isnothing(str)
    456        return "0"
    457    else
    458        return "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
    459    end
    460end
    461
    462function c_uint16(seqindex)
    463    if seqindex == typemax(UInt16)
    464        return "UINT16_MAX"
    465    else
    466        return string(seqindex)
    467    end
    468end
    469
# Emit all generated C data tables to `io`; this output becomes the
# utf8proc_data.c compiled into libutf8proc.
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
                             comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
                             comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
    print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
    write_c_index_array(io, sequences.storage, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
    write_c_index_array(io, prop_page_indices, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
    write_c_index_array(io, prop_pages, 8)

    # The first record is the all-defaults entry for unassigned codepoints
    # (slot 0 — see the "+ 1" applied when building char_property_indices).
    print(io, """
        static const utf8proc_property_t utf8proc_properties[] = {
          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX,  false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
        """)
    for prop in deduplicated_props
        print(io, "  {",
              c_enum_name("CATEGORY", prop.category), ", ",
              prop.combining_class, ", ",
              c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
              c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
              c_uint16(prop.decomp_seqindex), ", ",
              c_uint16(prop.casefold_seqindex), ", ",
              c_uint16(prop.uppercase_seqindex), ", ",
              c_uint16(prop.lowercase_seqindex), ", ",
              c_uint16(prop.titlecase_seqindex), ", ",
              c_uint16(prop.comb_index), ", ",
              prop.bidi_mirrored, ", ",
              prop.comp_exclusion, ", ",
              prop.ignorable, ", ",
              prop.control_boundary, ", ",
              prop.charwidth, ", ",
              "0, ", # bitfield padding
              c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
              c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
              "},\n"
        )
    end
    print(io, "};\n\n")

    # Combination table: per first-char row, first/last offsets followed by
    # the composed codepoints in that span (two UInt16s when the second char
    # is in comb2nd_indices_nonbasic). The offset bookkeeping mirrors the
    # loop that built comb1st_indices_firstoffsets/lastoffsets above.
    print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n  ")
    i = 0
    for a in eachindex(comb1st_indices_firstoffsets)
        offset = 0
        print(io, comb1st_indices_firstoffsets[a], ", ", comb1st_indices_lastoffsets[a], ", ")
        for b in eachindex(comb2nd_indices_sorted_keys)
            dm1 = comb2nd_indices_sorted_keys[b]
            if offset > comb1st_indices_lastoffsets[a]
                break
            end
            if offset >= comb1st_indices_firstoffsets[a]
                i += 1
                if i == 8
                    i = 0
                    print(io, "\n  ")
                end
                # 0 fills slots inside the span with no composition.
                v = get(comb_array[a], b, 0)
                if dm1 in comb2nd_indices_nonbasic
                    print(io, (v & 0xFFFF0000) >> 16, ", ")
                end
                print(io, v & 0xFFFF, ", ")
            end
            offset += 1
            if dm1 in comb2nd_indices_nonbasic
                offset += 1
            end
        end
        print(io, "\n")
    end
    print(io, "};\n\n")
end
    540
    541
# When executed as a script (not at the REPL), write the generated C tables
# to stdout; the utf8proc build redirects this into the data file.
if !isinteractive()
    print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
                        comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
                        comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
end
    547