utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 5404ef8dc7a72a402312381c8c72f80363a7d8c0
parent aa9823f5402299bf0f04634f8307cc07cb2e8d4e
Author: Jiahao Chen <jiahao@mit.edu>
Date:   Thu, 17 Jul 2014 15:55:47 -0700

Mark composition exclusion characters

Diffstat:
Mdata_generator.rb | 89++++++++-----------------------------------------------------------------------
1 file changed, 8 insertions(+), 81 deletions(-)

diff --git a/data_generator.rb b/data_generator.rb @@ -99,91 +99,18 @@ $grapheme_extend_list.each do |entry| end $exclusions = <<END_OF_LIST -0958 # DEVANAGARI LETTER QA -0959 # DEVANAGARI LETTER KHHA -095A # DEVANAGARI LETTER GHHA -095B # DEVANAGARI LETTER ZA -095C # DEVANAGARI LETTER DDDHA -095D # DEVANAGARI LETTER RHA -095E # DEVANAGARI LETTER FA -095F # DEVANAGARI LETTER YYA -09DC # BENGALI LETTER RRA -09DD # BENGALI LETTER RHA -09DF # BENGALI LETTER YYA -0A33 # GURMUKHI LETTER LLA -0A36 # GURMUKHI LETTER SHA -0A59 # GURMUKHI LETTER KHHA -0A5A # GURMUKHI LETTER GHHA -0A5B # GURMUKHI LETTER ZA -0A5E # GURMUKHI LETTER FA -0B5C # ORIYA LETTER RRA -0B5D # ORIYA LETTER RHA -0F43 # TIBETAN LETTER GHA -0F4D # TIBETAN LETTER DDHA -0F52 # TIBETAN LETTER DHA -0F57 # TIBETAN LETTER BHA -0F5C # TIBETAN LETTER DZHA -0F69 # TIBETAN LETTER KSSA -0F76 # TIBETAN VOWEL SIGN VOCALIC R -0F78 # TIBETAN VOWEL SIGN VOCALIC L -0F93 # TIBETAN SUBJOINED LETTER GHA -0F9D # TIBETAN SUBJOINED LETTER DDHA -0FA2 # TIBETAN SUBJOINED LETTER DHA -0FA7 # TIBETAN SUBJOINED LETTER BHA -0FAC # TIBETAN SUBJOINED LETTER DZHA -0FB9 # TIBETAN SUBJOINED LETTER KSSA -FB1D # HEBREW LETTER YOD WITH HIRIQ -FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH -FB2A # HEBREW LETTER SHIN WITH SHIN DOT -FB2B # HEBREW LETTER SHIN WITH SIN DOT -FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT -FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT -FB2E # HEBREW LETTER ALEF WITH PATAH -FB2F # HEBREW LETTER ALEF WITH QAMATS -FB30 # HEBREW LETTER ALEF WITH MAPIQ -FB31 # HEBREW LETTER BET WITH DAGESH -FB32 # HEBREW LETTER GIMEL WITH DAGESH -FB33 # HEBREW LETTER DALET WITH DAGESH -FB34 # HEBREW LETTER HE WITH MAPIQ -FB35 # HEBREW LETTER VAV WITH DAGESH -FB36 # HEBREW LETTER ZAYIN WITH DAGESH -FB38 # HEBREW LETTER TET WITH DAGESH -FB39 # HEBREW LETTER YOD WITH DAGESH -FB3A # HEBREW LETTER FINAL KAF WITH DAGESH -FB3B # HEBREW LETTER KAF WITH DAGESH -FB3C # HEBREW LETTER LAMED WITH DAGESH -FB3E # HEBREW LETTER MEM WITH DAGESH -FB40 # HEBREW LETTER NUN WITH DAGESH -FB41 # HEBREW LETTER SAMEKH WITH DAGESH -FB43 # HEBREW LETTER FINAL PE WITH DAGESH -FB44 # HEBREW LETTER PE WITH DAGESH -FB46 # HEBREW LETTER TSADI WITH DAGESH -FB47 # HEBREW LETTER QOF WITH DAGESH -FB48 # HEBREW LETTER RESH WITH DAGESH -FB49 # HEBREW LETTER SHIN WITH DAGESH -FB4A # HEBREW LETTER TAV WITH DAGESH -FB4B # HEBREW LETTER VAV WITH HOLAM -FB4C # HEBREW LETTER BET WITH RAFE -FB4D # HEBREW LETTER KAF WITH RAFE -FB4E # HEBREW LETTER PE WITH RAFE +#From: +# http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt +#Section: +# (1) Script Specifics END_OF_LIST $exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex } $excl_version = <<END_OF_LIST -2ADC # FORKING -1D15E # MUSICAL SYMBOL HALF NOTE -1D15F # MUSICAL SYMBOL QUARTER NOTE -1D160 # MUSICAL SYMBOL EIGHTH NOTE -1D161 # MUSICAL SYMBOL SIXTEENTH NOTE -1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE -1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE -1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE -1D1BB # MUSICAL SYMBOL MINIMA -1D1BC # MUSICAL SYMBOL MINIMA BLACK -1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE -1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK -1D1BF # MUSICAL SYMBOL FUSA WHITE -1D1C0 # MUSICAL SYMBOL FUSA BLACK +#From: +# http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt +#Section: +# (2) Post Composition Version precomposed characters END_OF_LIST $excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }