# data_generator.jl — generates the compressed Unicode property tables consumed by libutf8proc.
using OffsetArrays: Origin

# Parse a hexadecimal codepoint string (as used throughout the UCD data files).
parsehex(str) = parse(UInt32, str, base=16)

"""
Parse one line of a UCD file of the form `XXXX[..YYYY] ; description`.

Return `(range, description)` on success, or `nothing` for lines that do not
match (comments, blank lines, ...).
"""
function parse_hex_range(line)
    m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
    isnothing(m) && return nothing
    lo = parsehex(m[1])
    hi = isnothing(m[3]) ? lo : parsehex(m[3])
    return (lo:hi, rstrip(m[4]))
end

# Read every parseable `range ; description` entry from a UCD file.
function read_hex_ranges(filename)
    return [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)]
end

# Flatten all ranges labeled exactly `description` into one codepoint list.
function collect_codepoints(range_desc, description)
    codes = UInt32[]
    for (rng, desc) in range_desc
        desc == description && append!(codes, rng)
    end
    return codes
end

# Assign `value` to every key in `keys` within dict `d`.
function set_all!(d, keys, value)
    for k in keys
        d[k] = value
    end
end

#-------------------------------------------------------------------------------

derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")

# NB: these deliberately shadow Base.uppercase/Base.lowercase in this script;
# later code calls Base.uppercase explicitly where the function is needed.
ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point"))
uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase"))
lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase"))


#-------------------------------------------------------------------------------
# Map codepoints to their Indic_Conjunct_Break (InCB) class from
# DerivedCoreProperties.txt.
function derive_indic_conjunct_break(derived_core_properties)
    props = Dict{UInt32, String}()
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER")
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT")
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND")
    return props
end

let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties)
    # Lookup with "NONE" as the default for unlisted codepoints.
    global function get_indic_conjunct_break(code)
        get(indic_conjunct_break, code, "NONE")
    end
end

#-------------------------------------------------------------------------------
# Grapheme cluster boundary classes (UAX #29), with emoji-data overrides.
function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
    grapheme_boundclass = Dict{UInt32, String}()
    for (rng, desc) in read_hex_ranges(grapheme_break_filename)
        set_all!(grapheme_boundclass, rng, Base.uppercase(desc))
    end
    for (rng, desc) in read_hex_ranges(emoji_data_filename)
        if desc == "Extended_Pictographic"
            set_all!(grapheme_boundclass, rng, "EXTENDED_PICTOGRAPHIC")
        elseif desc == "Emoji_Modifier"
            set_all!(grapheme_boundclass, rng, "EXTEND")
        end
    end
    return grapheme_boundclass
end

let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
    # Lookup with "OTHER" as the default boundary class.
    global function get_grapheme_boundclass(code)
        get(grapheme_boundclasses, code, "OTHER")
    end
end

#-------------------------------------------------------------------------------
# Codepoints excluded from canonical composition, pulled from the section of
# CompositionExclusions.txt matched by `pattern`.
function read_composition_exclusions(pattern)
    section = match(pattern, read("CompositionExclusions.txt", String)).match
    es = UInt32[]
    for line in split(section, '\n')
        m = match(r"^([0-9A-F]+) +#"i, line)
        isnothing(m) || push!(es, parsehex(m[1]))
    end
    return es
end

exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s))
excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s))

#-------------------------------------------------------------------------------
# Full (C and F status) case foldings from CaseFolding.txt.
function read_case_folding(filename)
    case_folding = Dict{UInt32,Vector{UInt32}}()
    for line in readlines(filename)
        m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
        isnothing(m) && continue
        case_folding[parsehex(m[1])] = parsehex.(split(m[2]))
    end
    return case_folding
end

let case_folding = read_case_folding("CaseFolding.txt")
    # Return the folded codepoint sequence, or `nothing` if the char does not fold.
    global function get_case_folding(code)
        get(case_folding, code, nothing)
    end
end

#-------------------------------------------------------------------------------
# Utilities for reading per-char properties from UnicodeData.txt

# Split one semicolon-delimited UnicodeData.txt record into a named tuple of
# the fields this generator cares about.
function split_unicode_data_line(line)
    m = match(r"""
              ([0-9A-F]+);        # code
              ([^;]+);            # name
              ([A-Z]+);           # general category
              ([0-9]+);           # canonical combining class
              ([A-Z]+);           # bidi class
              (<([A-Z]*)>)?       # decomposition type
              ((\ ?[0-9A-F]+)*);  # decompomposition mapping
              ([0-9]*);           # decimal digit
              ([0-9]*);           # digit
              ([^;]*);            # numeric
              ([YN]*);            # bidi mirrored
              ([^;]*);            # unicode 1.0 name
              ([^;]*);            # iso comment
              ([0-9A-F]*);        # simple uppercase mapping
              ([0-9A-F]*);        # simple lowercase mapping
              ([0-9A-F]*)$        # simple titlecase mapping
              """ix, line)
    @assert !isnothing(m)
    code = parse(UInt32, m[1], base=16)
    (code = code,
     name = m[2],
     category = m[3],
     combining_class = parse(Int, m[4]),
     bidi_class = m[5],
     decomp_type = m[7],
     decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])),
     bidi_mirrored = m[13] == "Y",
     # issue #130: use nonstandard uppercase ß -> ẞ
     # issue #195: if character is uppercase but has no lowercase mapping,
     # then make lowercase mapping = itself (vice versa for lowercase)
     uppercase_mapping = m[16] != "" ? parsehex(m[16]) :
                         code == 0x000000df ? 0x00001e9e :
                         m[17] == "" && code in lowercase ? code :
                         nothing,
     lowercase_mapping = m[17] != "" ? parsehex(m[17]) :
                         m[16] == "" && code in uppercase ? code :
                         nothing,
     titlecase_mapping = m[18] != "" ? parsehex(m[18]) :
                         code == 0x000000df ? 0x00001e9e :
                         nothing,
    )
end

# Read UnicodeData.txt into a 0-based vector of per-character named tuples,
# expanding the `<..., First>` / `<..., Last>` range pairs into one entry per
# codepoint.
function read_unicode_data(filename)
    raw_char_props = split_unicode_data_line.(readlines(filename))
    char_props = Origin(0)(Vector{eltype(raw_char_props)}())
    @assert issorted(raw_char_props, by=c->c.code)
    raw_char_props = Iterators.Stateful(raw_char_props)
    while !isempty(raw_char_props)
        c = popfirst!(raw_char_props)
        if occursin(", First>", c.name)
            nc = popfirst!(raw_char_props)
            @assert occursin(", Last>", nc.name)
            name = replace(c.name, ", First"=>"")
            for i in c.code:nc.code
                push!(char_props, (; c..., name=name, code=i))
            end
        else
            push!(char_props, c)
        end
    end
    return char_props
end

char_props = read_unicode_data("UnicodeData.txt")
char_hash = Dict(c.code=>c for c in char_props)

#-------------------------------------------------------------------------------
# Read character widths from UAX #11: East Asian Width
function read_east_asian_widths(filename)
    ea_widths = Dict{UInt32,Int}()
    for (rng, widthcode) in read_hex_ranges(filename)
        w = widthcode == "W" || widthcode == "F"  ? 2 : # wide or full
            widthcode == "Na" || widthcode == "H" ? 1 : # narrow or half-width
            nothing
        isnothing(w) || set_all!(ea_widths, rng, w)
    end
    return ea_widths
end

let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
    # Following work by @jiahao, we compute character widths using a combination of
    # * character category
    # * UAX 11: East Asian Width
    # * a few exceptions as needed
    # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
    global function derive_char_width(code, category)
        # Use a default width of 1 for all character categories that are
        # letter/symbol/number-like, as well as for unassigned/private-use chars.
        # This provides a useful nonzero fallback for new codepoints when a new
        # Unicode version has been released.
        width = 1

        # Various zero-width categories
        #
        # "Sk" not included in zero width - see issue #167
        if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
            width = 0
        end

        # Widths from UAX #11: East Asian Width
        eaw = get(ea_widths, code, nothing)
        if !isnothing(eaw)
            width = eaw
        end

        # A few exceptional cases, found by manual comparison to other wcwidth
        # functions and similar checks.
        # NB: deliberately re-applied after the EAW override so that Mn wins.
        if category == "Mn"
            width = 0
        end

        if code == 0x00ad
            # Soft hyphen is typically printed as a hyphen (-) in terminals.
            width = 1
        elseif code == 0x2028 || code == 0x2029
            # By definition, should have zero width (on the same line)
            # 0x002028 ' ' category: Zl name: LINE SEPARATOR/
            # 0x002029 ' ' category: Zp name: PARAGRAPH SEPARATOR/
            width = 0
        end

        return width
    end
end

#-------------------------------------------------------------------------------
# Construct data tables which will drive libutf8proc
#
# These tables are "compressed" with an ad-hoc compression scheme (largely some
# simple deduplication and indexing) which can easily and efficiently be
# decompressed on the C side at runtime.

# Inverse decomposition mapping tables for combining two characters into a single one.
comb1st_indices = Dict{UInt32,Int}()
comb1st_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices = Dict{UInt32,Int}()
comb2nd_indices_sorted_keys = Origin(0)(UInt32[])
comb2nd_indices_nonbasic = Set{UInt32}()
comb_array = Origin(0)(Vector{Dict{Int,UInt32}}())
for char in char_props
    # A primary composition pair: a canonical (untagged) two-character
    # decomposition whose starter has combining class 0 and which is not in
    # the composition exclusion list.
    #
    # Fix: the original guard `!isnothing(char_hash[...])` could never be
    # false — Dict indexing throws KeyError on a missing key rather than
    # returning nothing — so it was dead code that would crash on a
    # decomposition target absent from UnicodeData.txt. Use haskey instead.
    if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
            length(char.decomp_mapping) == 2 && haskey(char_hash, char.decomp_mapping[1]) &&
            char_hash[char.decomp_mapping[1]].combining_class == 0 &&
            char.code ∉ exclusions
        dm0 = char.decomp_mapping[1]
        dm1 = char.decomp_mapping[2]
        if !haskey(comb1st_indices, dm0)
            comb1st_indices[dm0] = length(comb1st_indices)
            push!(comb1st_indices_sorted_keys, dm0)
            push!(comb_array, Dict{Int,UInt32}())
            @assert length(comb1st_indices) == length(comb_array)
        end
        if !haskey(comb2nd_indices, dm1)
            push!(comb2nd_indices_sorted_keys, dm1)
            comb2nd_indices[dm1] = length(comb2nd_indices)
        end
        @assert !haskey(comb_array[comb1st_indices[dm0]], comb2nd_indices[dm1])
        comb_array[comb1st_indices[dm0]][comb2nd_indices[dm1]] = char.code
        # Second characters whose compositions can land outside the BMP need
        # two 16-bit slots in the C table.
        if char.code > 0xFFFF
            push!(comb2nd_indices_nonbasic, dm1)
        end
    end
end

# For each first character, record the span of second-character offsets it
# participates in, and its cumulative offset into the C combinations table.
comb_indices = Dict{UInt32,Int}()
comb1st_indices_lastoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
comb1st_indices_firstoffsets = Origin(0)(zeros(Int, length(comb1st_indices)))
let
    cumoffset = 0
    for dm0 in comb1st_indices_sorted_keys
        index = comb1st_indices[dm0]
        # firstoffset/lastoffset bracket the used portion of this row;
        # nonbasic second chars occupy two offsets each.
        firstoffset = nothing
        lastoffset = nothing
        offset = 0
        for b in eachindex(comb2nd_indices_sorted_keys)
            dm1 = comb2nd_indices_sorted_keys[b]
            if haskey(comb_array[index], b)
                if isnothing(firstoffset)
                    firstoffset = offset
                end
                lastoffset = offset
                if dm1 in comb2nd_indices_nonbasic
                    lastoffset += 1
                end
            end
            offset += 1
            if dm1 in comb2nd_indices_nonbasic
                offset += 1
            end
        end
        comb1st_indices_firstoffsets[index] = firstoffset
        comb1st_indices_lastoffsets[index] = lastoffset
        @assert !haskey(comb_indices, dm0)
        comb_indices[dm0] = cumoffset
        # +2 accounts for the first/last header entries emitted per row.
        cumoffset += lastoffset - firstoffset + 1 + 2
    end

    # Second characters get a tagged index: bit 15 marks "second char",
    # bit 14 marks "nonbasic" (two-slot) entries.
    offset = 0
    for dm1 in comb2nd_indices_sorted_keys
        @assert !haskey(comb_indices, dm1)
        comb_indices[dm1] = 0x8000 | (comb2nd_indices[dm1] + offset)
        @assert comb2nd_indices[dm1] + offset <= 0x4000
        if dm1 in comb2nd_indices_nonbasic
            comb_indices[dm1] |= 0x4000
            offset += 1
        end
    end
end

# UTF-32 codepoint vector -> UTF-16 code unit vector.
utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))

# Utility for packing all UTF-16 encoded sequences into one big array
struct UTF16Sequences
    storage::Vector{UInt16}
    indices::Dict{Vector{UInt16},Int}
end
UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}())

"""
Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
* The 14 low bits are the index into the `sequences.storage` array where the
  sequence resides
* The two top bits are the length of the sequence, or if equal to 3, the first
  entry of the sequence itself contains the length.
"""
function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
    if length(utf32_seq) == 0
        return typemax(UInt16)
    end
    # lencode contains the length of the UTF-32 sequence after decoding
    # No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
    # We have only 2 bits for the length, though, so longer sequences are
    # encoded in the sequence data itself.
    seq_lencode = length(utf32_seq) - 1
    utf16_seq = utf16_encode(utf32_seq)
    # Deduplicate: identical UTF-16 sequences share one storage slot.
    idx = get!(sequences.indices, utf16_seq) do
        i = length(sequences.storage)
        utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
                        pushfirst!(copy(utf16_seq), seq_lencode)
        append!(sequences.storage, utf16_seq_enc)
        i
    end
    @assert idx <= 0x3FFF
    seq_code = idx | (min(seq_lencode, 3) << 14)
    return seq_code
end

# Convenience method: a single codepoint is a length-1 sequence.
function encode_sequence!(sequences::UTF16Sequences, code::Integer)
    encode_sequence!(sequences, [code])
end

# No mapping at all encodes as the sentinel UINT16_MAX.
function encode_sequence!(sequences::UTF16Sequences, ::Nothing)
    return typemax(UInt16)
end

# Assemble the full per-character property record destined for the C
# utf8proc_property_t table, interning all sequences into `sequences`.
function char_table_properties!(sequences, char)
    code = char.code

    return (
        category = char.category,
        combining_class = char.combining_class,
        bidi_class = char.bidi_class,
        decomp_type = char.decomp_type,
        decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping),
        casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)),
        uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
        lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
        titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
        comb_index = get(comb_indices, code, typemax(UInt16)),
        bidi_mirrored = char.bidi_mirrored,
        comp_exclusion = code in exclusions || code in excl_version,
        ignorable = code in ignorable,
        # ZWNJ/ZWJ are Cf but must not break at control boundaries.
        control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
                           !(char.code in (0x200C, 0x200D)),
        charwidth = derive_char_width(code, char.category),
        boundclass = get_grapheme_boundclass(code),
        indic_conjunct_break = get_indic_conjunct_break(code),
    )
end

# Many character properties are duplicates.
# Deduplicate the per-character property records, constructing a
# per-character array of indices into the properties array.
sequences = UTF16Sequences()
char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]

deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
char_property_indices = Origin(0)(zeros(Int, 0x00110000))
let index_map = Dict{eltype(char_table_props),Int}()
    for (char, table_props) in zip(char_props, char_table_props)
        entry_idx = get!(index_map, table_props) do
            new_idx = length(deduplicated_props)
            push!(deduplicated_props, table_props)
            new_idx
        end
        # Add 1 because unassigned codes occupy slot at index 0
        char_property_indices[char.code] = entry_idx + 1
    end
end

# Now compress char_property_indices by breaking it into pages and
# deduplicating those (this works as compression because there are large
# contiguous ranges of code space with identical properties)
prop_page_indices = Int[]
prop_pages = Int[]
let
    page_size = 0x100
    page_index_map = Dict{Vector{Int}, Int}()
    for page in Iterators.partition(char_property_indices, page_size)
        page_idx = get!(page_index_map, page) do
            new_idx = length(prop_pages)
            append!(prop_pages, page)
            new_idx
        end
        push!(prop_page_indices, page_idx)
    end
end

#-------------------------------------------------------------------------------

# Emit `array` as the body of a C array initializer, `linelen` entries per row.
function write_c_index_array(io, array, linelen)
    print(io, "{\n  ")
    count = 0
    for item in array
        count += 1
        if count == linelen
            count = 0
            print(io, "\n  ")
        end
        print(io, item, ", ")
    end
    print(io, "};\n\n")
end

# C enum member name for a property value; `nothing` maps to the 0 constant.
function c_enum_name(prefix, str)
    isnothing(str) && return "0"
    return "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
end

# Render a sequence index, using the C sentinel macro for typemax(UInt16).
c_uint16(seqindex) = seqindex == typemax(UInt16) ? "UINT16_MAX" : string(seqindex)

# Write every generated table as C static data to `io`.
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
                             comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
                             comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
    print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
    write_c_index_array(io, sequences.storage, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
    write_c_index_array(io, prop_page_indices, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
    write_c_index_array(io, prop_pages, 8)

    # Slot 0 is the fallback record for unassigned codepoints.
    print(io, """
          static const utf8proc_property_t utf8proc_properties[] = {
            {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, false,false,false,false, 1, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
          """)
    for prop in deduplicated_props
        print(io, "  {",
              c_enum_name("CATEGORY", prop.category), ", ",
              prop.combining_class, ", ",
              c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
              c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
              c_uint16(prop.decomp_seqindex), ", ",
              c_uint16(prop.casefold_seqindex), ", ",
              c_uint16(prop.uppercase_seqindex), ", ",
              c_uint16(prop.lowercase_seqindex), ", ",
              c_uint16(prop.titlecase_seqindex), ", ",
              c_uint16(prop.comb_index), ", ",
              prop.bidi_mirrored, ", ",
              prop.comp_exclusion, ", ",
              prop.ignorable, ", ",
              prop.control_boundary, ", ",
              prop.charwidth, ", ",
              "0, ", # bitfield padding
              c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
              c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
              "},\n"
        )
    end
    print(io, "};\n\n")

    # Each row: first offset, last offset, then the (possibly two-slot)
    # composed codepoints for offsets in [first, last].
    print(io, "static const utf8proc_uint16_t utf8proc_combinations[] = {\n  ")
    col = 0
    for first_idx in eachindex(comb1st_indices_firstoffsets)
        offset = 0
        print(io, comb1st_indices_firstoffsets[first_idx], ", ",
              comb1st_indices_lastoffsets[first_idx], ", ")
        for second_idx in eachindex(comb2nd_indices_sorted_keys)
            dm1 = comb2nd_indices_sorted_keys[second_idx]
            if offset > comb1st_indices_lastoffsets[first_idx]
                break
            end
            if offset >= comb1st_indices_firstoffsets[first_idx]
                col += 1
                if col == 8
                    col = 0
                    print(io, "\n  ")
                end
                combined = get(comb_array[first_idx], second_idx, 0)
                if dm1 in comb2nd_indices_nonbasic
                    print(io, (combined & 0xFFFF0000) >> 16, ", ")
                end
                print(io, combined & 0xFFFF, ", ")
            end
            offset += 1
            if dm1 in comb2nd_indices_nonbasic
                offset += 1
            end
        end
        print(io, "\n")
    end
    print(io, "};\n\n")
end


if !isinteractive()
    print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
                        comb1st_indices_firstoffsets, comb1st_indices_lastoffsets,
                        comb2nd_indices_sorted_keys, comb_array, comb2nd_indices_nonbasic)
end