commit aa9823f5402299bf0f04634f8307cc07cb2e8d4e parent 7633bd03b6ec1bd776a7a78d2650fa0834399584 Author: Jiahao Chen <jiahao@mit.edu> Date: Thu, 17 Jul 2014 15:47:52 -0700 Mark Grapheme_Extend data Diffstat:
M | data_generator.rb | | | 155 | +++---------------------------------------------------------------------------- |
1 file changed, 4 insertions(+), 151 deletions(-)
diff --git a/data_generator.rb b/data_generator.rb @@ -83,157 +83,10 @@ $ignorable_list.each do |entry| end $grapheme_extend_list = <<END_OF_LIST -0300..036F ; Grapheme_Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X -0483..0486 ; Grapheme_Extend # Mn [4] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PSILI PNEUMATA -0488..0489 ; Grapheme_Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN -0591..05BD ; Grapheme_Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG -05BF ; Grapheme_Extend # Mn HEBREW POINT RAFE -05C1..05C2 ; Grapheme_Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT -05C4..05C5 ; Grapheme_Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT -05C7 ; Grapheme_Extend # Mn HEBREW POINT QAMATS QATAN -0610..0615 ; Grapheme_Extend # Mn [6] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL HIGH TAH -064B..065E ; Grapheme_Extend # Mn [20] ARABIC FATHATAN..ARABIC FATHA WITH TWO DOTS -0670 ; Grapheme_Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF -06D6..06DC ; Grapheme_Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN -06DE ; Grapheme_Extend # Me ARABIC START OF RUB EL HIZB -06DF..06E4 ; Grapheme_Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA -06E7..06E8 ; Grapheme_Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON -06EA..06ED ; Grapheme_Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM -0711 ; Grapheme_Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH -0730..074A ; Grapheme_Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH -07A6..07B0 ; Grapheme_Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN -07EB..07F3 ; Grapheme_Extend # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE -0901..0902 ; Grapheme_Extend # Mn [2] DEVANAGARI SIGN CANDRABINDU..DEVANAGARI SIGN ANUSVARA -093C ; Grapheme_Extend # Mn DEVANAGARI SIGN NUKTA -0941..0948 ; Grapheme_Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI -094D ; Grapheme_Extend # Mn DEVANAGARI SIGN VIRAMA -0951..0954 ; Grapheme_Extend # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT -0962..0963 ; Grapheme_Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL -0981 ; Grapheme_Extend # Mn BENGALI SIGN CANDRABINDU -09BC ; Grapheme_Extend # Mn BENGALI SIGN NUKTA -09BE ; Grapheme_Extend # Mc BENGALI VOWEL SIGN AA -09C1..09C4 ; Grapheme_Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR -09CD ; Grapheme_Extend # Mn BENGALI SIGN VIRAMA -09D7 ; Grapheme_Extend # Mc BENGALI AU LENGTH MARK -09E2..09E3 ; Grapheme_Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL -0A01..0A02 ; Grapheme_Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI -0A3C ; Grapheme_Extend # Mn GURMUKHI SIGN NUKTA -0A41..0A42 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU -0A47..0A48 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI -0A4B..0A4D ; Grapheme_Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA -0A70..0A71 ; Grapheme_Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK -0A81..0A82 ; Grapheme_Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA -0ABC ; Grapheme_Extend # Mn GUJARATI SIGN NUKTA -0AC1..0AC5 ; Grapheme_Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E -0AC7..0AC8 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI -0ACD ; Grapheme_Extend # Mn GUJARATI SIGN VIRAMA -0AE2..0AE3 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL -0B01 ; Grapheme_Extend # Mn ORIYA SIGN CANDRABINDU -0B3C ; Grapheme_Extend # Mn ORIYA SIGN NUKTA -0B3E ; Grapheme_Extend # Mc ORIYA VOWEL SIGN AA -0B3F ; Grapheme_Extend # Mn ORIYA VOWEL SIGN I -0B41..0B43 ; Grapheme_Extend # Mn [3] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC R -0B4D ; Grapheme_Extend # Mn ORIYA SIGN VIRAMA -0B56 ; Grapheme_Extend # Mn ORIYA AI LENGTH MARK -0B57 ; Grapheme_Extend # Mc ORIYA AU LENGTH MARK -0B82 ; Grapheme_Extend # Mn TAMIL SIGN ANUSVARA -0BBE ; Grapheme_Extend # Mc TAMIL VOWEL SIGN AA -0BC0 ; Grapheme_Extend # Mn TAMIL VOWEL SIGN II -0BCD ; Grapheme_Extend # Mn TAMIL SIGN VIRAMA -0BD7 ; Grapheme_Extend # Mc TAMIL AU LENGTH MARK -0C3E..0C40 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II -0C46..0C48 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI -0C4A..0C4D ; Grapheme_Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA -0C55..0C56 ; Grapheme_Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK -0CBC ; Grapheme_Extend # Mn KANNADA SIGN NUKTA -0CBF ; Grapheme_Extend # Mn KANNADA VOWEL SIGN I -0CC2 ; Grapheme_Extend # Mc KANNADA VOWEL SIGN UU -0CC6 ; Grapheme_Extend # Mn KANNADA VOWEL SIGN E -0CCC..0CCD ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA -0CD5..0CD6 ; Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK -0CE2..0CE3 ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL -0D3E ; Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA -0D41..0D43 ; Grapheme_Extend # Mn [3] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC R -0D4D ; Grapheme_Extend # Mn MALAYALAM SIGN VIRAMA -0D57 ; Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK -0DCA ; Grapheme_Extend # Mn SINHALA SIGN AL-LAKUNA -0DCF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA -0DD2..0DD4 ; Grapheme_Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA -0DD6 ; Grapheme_Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA -0DDF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA -0E31 ; Grapheme_Extend # Mn THAI CHARACTER MAI HAN-AKAT -0E34..0E3A ; Grapheme_Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU -0E47..0E4E ; Grapheme_Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN -0EB1 ; Grapheme_Extend # Mn LAO VOWEL SIGN MAI KAN -0EB4..0EB9 ; Grapheme_Extend # Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU -0EBB..0EBC ; Grapheme_Extend # Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO -0EC8..0ECD ; Grapheme_Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA -0F18..0F19 ; Grapheme_Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS -0F35 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA -0F37 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS -0F39 ; Grapheme_Extend # Mn TIBETAN MARK TSA -PHRU -0F71..0F7E ; Grapheme_Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO -0F80..0F84 ; Grapheme_Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA -0F86..0F87 ; Grapheme_Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS -0F90..0F97 ; Grapheme_Extend # Mn [8] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER JA -0F99..0FBC ; Grapheme_Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA -0FC6 ; Grapheme_Extend # Mn TIBETAN SYMBOL PADMA GDAN -102D..1030 ; Grapheme_Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU -1032 ; Grapheme_Extend # Mn MYANMAR VOWEL SIGN AI -1036..1037 ; Grapheme_Extend # Mn [2] MYANMAR SIGN ANUSVARA..MYANMAR SIGN DOT BELOW -1039 ; Grapheme_Extend # Mn MYANMAR SIGN VIRAMA -1058..1059 ; Grapheme_Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL -135F ; Grapheme_Extend # Mn ETHIOPIC COMBINING GEMINATION MARK -1712..1714 ; Grapheme_Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA -1732..1734 ; Grapheme_Extend # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD -1752..1753 ; Grapheme_Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U -1772..1773 ; Grapheme_Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U -17B7..17BD ; Grapheme_Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA -17C6 ; Grapheme_Extend # Mn KHMER SIGN NIKAHIT -17C9..17D3 ; Grapheme_Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT -17DD ; Grapheme_Extend # Mn KHMER SIGN ATTHACAN -180B..180D ; Grapheme_Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE -18A9 ; Grapheme_Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA -1920..1922 ; Grapheme_Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U -1927..1928 ; Grapheme_Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O -1932 ; Grapheme_Extend # Mn LIMBU SMALL LETTER ANUSVARA -1939..193B ; Grapheme_Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I -1A17..1A18 ; Grapheme_Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U -1B00..1B03 ; Grapheme_Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG -1B34 ; Grapheme_Extend # Mn BALINESE SIGN REREKAN -1B36..1B3A ; Grapheme_Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA -1B3C ; Grapheme_Extend # Mn BALINESE VOWEL SIGN LA LENGA -1B42 ; Grapheme_Extend # Mn BALINESE VOWEL SIGN PEPET -1B6B..1B73 ; Grapheme_Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG -1DC0..1DCA ; Grapheme_Extend # Mn [11] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER R BELOW -1DFE..1DFF ; Grapheme_Extend # Mn [2] COMBINING LEFT ARROWHEAD ABOVE..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW -200C..200D ; Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER -20D0..20DC ; Grapheme_Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE -20DD..20E0 ; Grapheme_Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH -20E1 ; Grapheme_Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE -20E2..20E4 ; Grapheme_Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE -20E5..20EF ; Grapheme_Extend # Mn [11] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING RIGHT ARROW BELOW -302A..302F ; Grapheme_Extend # Mn [6] IDEOGRAPHIC LEVEL TONE MARK..HANGUL DOUBLE DOT TONE MARK -3099..309A ; Grapheme_Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK -A806 ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN HASANTA -A80B ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN ANUSVARA -A825..A826 ; Grapheme_Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E -FB1E ; Grapheme_Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA -FE00..FE0F ; Grapheme_Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 -FE20..FE23 ; Grapheme_Extend # Mn [4] COMBINING LIGATURE LEFT HALF..COMBINING DOUBLE TILDE RIGHT HALF -10A01..10A03 ; Grapheme_Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R -10A05..10A06 ; Grapheme_Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O -10A0C..10A0F ; Grapheme_Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA -10A38..10A3A ; Grapheme_Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW -10A3F ; Grapheme_Extend # Mn KHAROSHTHI VIRAMA -1D165 ; Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM -1D167..1D169 ; Grapheme_Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3 -1D16E..1D172 ; Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5 -1D17B..1D182 ; Grapheme_Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE -1D185..1D18B ; Grapheme_Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE -1D1AA..1D1AD ; Grapheme_Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO -1D242..1D244 ; Grapheme_Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME -E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 +#From: +# http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt +#Section: +# Derived Property: Grapheme_Extend_List END_OF_LIST $grapheme_extend = []