libgrapheme

Freestanding C library for Unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

case.c (12993B)


      1/* See LICENSE file for copyright and license details. */
      2#include <stddef.h>
      3#include <stdint.h>
      4
      5#include "../gen/case.h"
      6#include "../grapheme.h"
      7#include "util.h"
      8
      9static inline enum case_property
     10get_case_property(uint_least32_t cp)
     11{
     12	if (likely(cp <= UINT32_C(0x10FFFF))) {
     13		return (enum case_property)
     14			case_minor[case_major[cp >> 8] + (cp & 0xFF)];
     15	} else {
     16		return CASE_PROP_OTHER;
     17	}
     18}
     19
     20static inline int_least32_t
     21get_case_offset(uint_least32_t cp, const uint_least16_t *major,
     22                const int_least32_t *minor)
     23{
     24	if (likely(cp <= UINT32_C(0x10FFFF))) {
     25		/*
     26		 * this value might be larger than or equal to 0x110000
     27		 * for the special-case-mapping. This needs to be handled
     28		 * separately
     29		 */
     30		return minor[major[cp >> 8] + (cp & 0xFF)];
     31	} else {
     32		return 0;
     33	}
     34}
     35
     36static inline size_t
     37to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
     38        uint_least8_t final_sigma_level, const uint_least16_t *major,
     39        const int_least32_t *minor, const struct special_case *sc)
     40{
     41	HERODOTUS_READER tmp;
     42	enum case_property prop;
     43	enum herodotus_status s;
     44	size_t off, i;
     45	uint_least32_t cp, tmp_cp;
     46	int_least32_t map;
     47
     48	for (; herodotus_read_codepoint(r, true, &cp) ==
     49	       HERODOTUS_STATUS_SUCCESS;) {
     50		if (sc == lower_special) {
     51			/*
     52			 * For the special Final_Sigma-rule (see
     53			 * SpecialCasing.txt), which is the only non-localized
     54			 * case-dependent rule, we apply a different mapping
     55			 * when a sigma is at the end of a word.
     56			 *
     57			 * Before: cased case-ignorable*
     58			 * After: not(case-ignorable* cased)
     59			 *
     60			 * We check the after-condition on demand, but the
     61			 * before- condition is best checked using the
     62			 * "level"-heuristic also used in the sentence and line
     63			 * breaking-implementations.
     64			 */
     65			if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER
     66			                                 SIGMA */
     67			    (final_sigma_level == 1 ||
     68			     final_sigma_level == 2)) {
     69				/*
     70				 * check succeeding characters by first skipping
     71				 * all case-ignorable characters and then
     72				 * checking if the succeeding character is
     73				 * cased, invalidating the after-condition
     74				 */
     75				herodotus_reader_copy(r, &tmp);
     76				for (prop = NUM_CASE_PROPS;
     77				     (s = herodotus_read_codepoint(&tmp, true,
     78				                                   &tmp_cp)) ==
     79				     HERODOTUS_STATUS_SUCCESS;) {
     80					prop = get_case_property(tmp_cp);
     81
     82					if (prop != CASE_PROP_CASE_IGNORABLE &&
     83					    prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
     84						break;
     85					}
     86				}
     87
     88				/*
     89				 * Now prop is something other than
     90				 * case-ignorable or the source-string ended. If
     91				 * it is something other than cased, we know
     92				 * that the after-condition holds
     93				 */
     94				if (s != HERODOTUS_STATUS_SUCCESS ||
     95				    (prop != CASE_PROP_CASED &&
     96				     prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
     97					/*
     98					 * write GREEK SMALL LETTER FINAL SIGMA
     99					 * to destination
    100					 */
    101					herodotus_write_codepoint(
    102						w, UINT32_C(0x03C2));
    103
    104					/* reset Final_Sigma-state and continue
    105					 */
    106					final_sigma_level = 0;
    107					continue;
    108				}
    109			}
    110
    111			/* update state */
    112			prop = get_case_property(cp);
    113			if ((final_sigma_level == 0 ||
    114			     final_sigma_level == 1) &&
    115			    (prop == CASE_PROP_CASED ||
    116			     prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
    117				/* sequence has begun */
    118				final_sigma_level = 1;
    119			} else if (
    120				(final_sigma_level == 1 ||
    121			         final_sigma_level == 2) &&
    122				(prop == CASE_PROP_CASE_IGNORABLE ||
    123			         prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
    124				/* case-ignorable sequence begins or continued
    125				 */
    126				final_sigma_level = 2;
    127			} else {
    128				/* sequence broke */
    129				final_sigma_level = 0;
    130			}
    131		}
    132
    133		/* get and handle case mapping */
    134		if (unlikely((map = get_case_offset(cp, major, minor)) >=
    135		             INT32_C(0x110000))) {
    136			/* we have a special case and the offset in the sc-array
    137			 * is the difference to 0x110000*/
    138			off = (uint_least32_t)map - UINT32_C(0x110000);
    139
    140			for (i = 0; i < sc[off].cplen; i++) {
    141				herodotus_write_codepoint(w, sc[off].cp[i]);
    142			}
    143		} else {
    144			/* we have a simple mapping */
    145			herodotus_write_codepoint(
    146				w, (uint_least32_t)((int_least32_t)cp + map));
    147		}
    148	}
    149
    150	herodotus_writer_nul_terminate(w);
    151
    152	return herodotus_writer_number_written(w);
    153}
    154
    155static size_t
    156herodotus_next_word_break(const HERODOTUS_READER *r)
    157{
    158	HERODOTUS_READER tmp;
    159
    160	herodotus_reader_copy(r, &tmp);
    161
    162	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
    163		return grapheme_next_word_break(tmp.src, tmp.srclen);
    164	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
    165		return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
    166	}
    167}
    168
    169static inline size_t
    170to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
    171{
    172	enum case_property prop;
    173	enum herodotus_status s;
    174	uint_least32_t cp;
    175	size_t nwb;
    176
    177	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
    178		herodotus_reader_push_advance_limit(r, nwb);
    179		for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
    180		       HERODOTUS_STATUS_SUCCESS;) {
    181			/* check if we have a cased character */
    182			prop = get_case_property(cp);
    183			if (prop == CASE_PROP_CASED ||
    184			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
    185				break;
    186			} else {
    187				/* write the data to the output verbatim, it if
    188				 * permits */
    189				herodotus_write_codepoint(w, cp);
    190
    191				/* increment reader */
    192				herodotus_read_codepoint(r, true, &cp);
    193			}
    194		}
    195
    196		if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
    197			/* we are done */
    198			herodotus_reader_pop_limit(r);
    199			break;
    200		} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
    201			/*
    202			 * we did not encounter any cased character
    203			 * up to the word break
    204			 */
    205			herodotus_reader_pop_limit(r);
    206			continue;
    207		} else {
    208			/*
    209			 * we encountered a cased character before the word
    210			 * break, convert it to titlecase
    211			 */
    212			herodotus_reader_push_advance_limit(
    213				r, herodotus_reader_next_codepoint_break(r));
    214			to_case(r, w, 0, title_major, title_minor,
    215			        title_special);
    216			herodotus_reader_pop_limit(r);
    217		}
    218
    219		/* cast the rest of the codepoints in the word to lowercase */
    220		to_case(r, w, 1, lower_major, lower_minor, lower_special);
    221
    222		/* remove the limit on the word before the next iteration */
    223		herodotus_reader_pop_limit(r);
    224	}
    225
    226	herodotus_writer_nul_terminate(w);
    227
    228	return herodotus_writer_number_written(w);
    229}
    230
    231size_t
    232grapheme_to_uppercase(const uint_least32_t *src, size_t srclen,
    233                      uint_least32_t *dest, size_t destlen)
    234{
    235	HERODOTUS_READER r;
    236	HERODOTUS_WRITER w;
    237
    238	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    239	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    240
    241	return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
    242}
    243
    244size_t
    245grapheme_to_lowercase(const uint_least32_t *src, size_t srclen,
    246                      uint_least32_t *dest, size_t destlen)
    247{
    248	HERODOTUS_READER r;
    249	HERODOTUS_WRITER w;
    250
    251	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    252	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    253
    254	return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
    255}
    256
    257size_t
    258grapheme_to_titlecase(const uint_least32_t *src, size_t srclen,
    259                      uint_least32_t *dest, size_t destlen)
    260{
    261	HERODOTUS_READER r;
    262	HERODOTUS_WRITER w;
    263
    264	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    265	herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
    266
    267	return to_titlecase(&r, &w);
    268}
    269
    270size_t
    271grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest,
    272                           size_t destlen)
    273{
    274	HERODOTUS_READER r;
    275	HERODOTUS_WRITER w;
    276
    277	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    278	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    279
    280	return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
    281}
    282
    283size_t
    284grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest,
    285                           size_t destlen)
    286{
    287	HERODOTUS_READER r;
    288	HERODOTUS_WRITER w;
    289
    290	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    291	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    292
    293	return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
    294}
    295
    296size_t
    297grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest,
    298                           size_t destlen)
    299{
    300	HERODOTUS_READER r;
    301	HERODOTUS_WRITER w;
    302
    303	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    304	herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
    305
    306	return to_titlecase(&r, &w);
    307}
    308
    309static inline bool
    310is_case(HERODOTUS_READER *r, const uint_least16_t *major,
    311        const int_least32_t *minor, const struct special_case *sc,
    312        size_t *output)
    313{
    314	size_t off, i;
    315	bool ret = true;
    316	uint_least32_t cp;
    317	int_least32_t map;
    318
    319	for (; herodotus_read_codepoint(r, false, &cp) ==
    320	       HERODOTUS_STATUS_SUCCESS;) {
    321		/* get and handle case mapping */
    322		if (unlikely((map = get_case_offset(cp, major, minor)) >=
    323		             INT32_C(0x110000))) {
    324			/* we have a special case and the offset in the sc-array
    325			 * is the difference to 0x110000*/
    326			off = (uint_least32_t)map - UINT32_C(0x110000);
    327
    328			for (i = 0; i < sc[off].cplen; i++) {
    329				if (herodotus_read_codepoint(r, false, &cp) ==
    330				    HERODOTUS_STATUS_SUCCESS) {
    331					if (cp != sc[off].cp[i]) {
    332						ret = false;
    333						goto done;
    334					} else {
    335						/* move forward */
    336						herodotus_read_codepoint(
    337							r, true, &cp);
    338					}
    339				} else {
    340					/*
    341					 * input ended and we didn't see
    342					 * any difference so far, so this
    343					 * string is in fact okay
    344					 */
    345					ret = true;
    346					goto done;
    347				}
    348			}
    349		} else {
    350			/* we have a simple mapping */
    351			if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
    352				/* we have a difference */
    353				ret = false;
    354				goto done;
    355			} else {
    356				/* move forward */
    357				herodotus_read_codepoint(r, true, &cp);
    358			}
    359		}
    360	}
    361done:
    362	if (output) {
    363		*output = herodotus_reader_number_read(r);
    364	}
    365	return ret;
    366}
    367
    368static inline bool
    369is_titlecase(HERODOTUS_READER *r, size_t *output)
    370{
    371	enum case_property prop;
    372	enum herodotus_status s;
    373	bool ret = true;
    374	uint_least32_t cp;
    375	size_t nwb;
    376
    377	for (; (nwb = herodotus_next_word_break(r)) > 0;) {
    378		herodotus_reader_push_advance_limit(r, nwb);
    379		for (; (s = herodotus_read_codepoint(r, false, &cp)) ==
    380		       HERODOTUS_STATUS_SUCCESS;) {
    381			/* check if we have a cased character */
    382			prop = get_case_property(cp);
    383			if (prop == CASE_PROP_CASED ||
    384			    prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
    385				break;
    386			} else {
    387				/* increment reader */
    388				herodotus_read_codepoint(r, true, &cp);
    389			}
    390		}
    391
    392		if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
    393			/* we are done */
    394			break;
    395		} else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
    396			/*
    397			 * we did not encounter any cased character
    398			 * up to the word break
    399			 */
    400			herodotus_reader_pop_limit(r);
    401			continue;
    402		} else {
    403			/*
    404			 * we encountered a cased character before the word
    405			 * break, check if it's titlecase
    406			 */
    407			herodotus_reader_push_advance_limit(
    408				r, herodotus_reader_next_codepoint_break(r));
    409			if (!is_case(r, title_major, title_minor, title_special,
    410			             NULL)) {
    411				ret = false;
    412				goto done;
    413			}
    414			herodotus_reader_pop_limit(r);
    415		}
    416
    417		/* check if the rest of the codepoints in the word are lowercase
    418		 */
    419		if (!is_case(r, lower_major, lower_minor, lower_special,
    420		             NULL)) {
    421			ret = false;
    422			goto done;
    423		}
    424
    425		/* remove the limit on the word before the next iteration */
    426		herodotus_reader_pop_limit(r);
    427	}
    428done:
    429	if (output) {
    430		*output = herodotus_reader_number_read(r);
    431	}
    432	return ret;
    433}
    434
    435bool
    436grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    437{
    438	HERODOTUS_READER r;
    439
    440	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    441
    442	return is_case(&r, upper_major, upper_minor, upper_special, caselen);
    443}
    444
    445bool
    446grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    447{
    448	HERODOTUS_READER r;
    449
    450	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    451
    452	return is_case(&r, lower_major, lower_minor, lower_special, caselen);
    453}
    454
    455bool
    456grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
    457{
    458	HERODOTUS_READER r;
    459
    460	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
    461
    462	return is_titlecase(&r, caselen);
    463}
    464
    465bool
    466grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
    467{
    468	HERODOTUS_READER r;
    469
    470	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    471
    472	return is_case(&r, upper_major, upper_minor, upper_special, caselen);
    473}
    474
    475bool
    476grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
    477{
    478	HERODOTUS_READER r;
    479
    480	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    481
    482	return is_case(&r, lower_major, lower_minor, lower_special, caselen);
    483}
    484
    485bool
    486grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
    487{
    488	HERODOTUS_READER r;
    489
    490	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
    491
    492	return is_titlecase(&r, caselen);
    493}