libgrapheme

Freestanding C library for unicode string handling
git clone https://git.sinitax.com/suckless/libgrapheme
Log | Files | Refs | README | LICENSE | sfeed.txt

commit 8a7e2ee85f0a2824e48e85e57534c5b18113cf07
parent 9f15d7eb0c9cf216f069d6972c58520013b80acb
Author: Laslo Hunhold <dev@frign.de>
Date:   Sat, 24 Sep 2022 01:54:52 +0200

Compile the library in freestanding mode

Looking closely, we never explicitly depend on the standard library
within the actual library code. This can be explicitly expressed by
setting -ffreestanding during object-compilation and -nostdlib during
linking. The result is a clean library with zero libc-symbols, allowing
it to be used even without an operating system (kernel code, ELF,
etc.), by making use of the freestanding implementation form defined
in the standard[0].

To be freestanding, the code may only include <float.h>, <iso646.h>,
<limits.h>, <stdalign.h>, <stdarg.h>, <stdbool.h>, <stddef.h>,
<stdint.h> and <stdnoreturn.h>. We satisfy this condition implicitly,
but there are some erroneous supplementary includes that are removed
in this commit. Additionally, the strict compiler-implementation simply
appends the U-suffix to the argument of UINT16_C (et al.), which is why
calls to it have to be changed to really include only constants.

[0]:https://www.iso-9899.info/n1570.html#4.p6

Signed-off-by: Laslo Hunhold <dev@frign.de>

Diffstat:
Mconfig.mk | 4++--
Msrc/character.c | 139+++++++++++++++++++++++++++++++++++++++----------------------------------------
Msrc/line.c | 2--
Msrc/sentence.c | 2--
Msrc/utf8.c | 3++-
Msrc/util.c | 11+++++++++--
Msrc/word.c | 2--
7 files changed, 82 insertions(+), 81 deletions(-)

diff --git a/config.mk b/config.mk @@ -15,8 +15,8 @@ BUILD_CPPFLAGS = $(CPPFLAGS) BUILD_CFLAGS = $(CFLAGS) BUILD_LDFLAGS = $(LDFLAGS) -SHFLAGS = -fPIC -SOFLAGS = -shared -Wl,--soname=libgrapheme.so +SHFLAGS = -fPIC -ffreestanding +SOFLAGS = -shared -nostdlib -Wl,--soname=libgrapheme.so # tools CC = cc diff --git a/src/character.c b/src/character.c @@ -1,8 +1,7 @@ /* See LICENSE file for copyright and license details. */ +#include <limits.h> #include <stdbool.h> #include <stddef.h> -#include <stdlib.h> -#include <string.h> #include "../gen/character.h" #include "../grapheme.h" @@ -10,96 +9,96 @@ static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_OTHER] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_CR] = - UINT16_C(1 << CHAR_BREAK_PROP_LF), /* GB3 */ + UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ [CHAR_BREAK_PROP_EXTEND] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_L] = - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_L) | /* GB6 */ - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB6 */ - UINT16_C(1 << 
CHAR_BREAK_PROP_HANGUL_LV) | /* GB6 */ - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_LVT) | /* GB6 */ - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_V] = - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_T] = - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LV] = - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_V) | /* GB7 */ - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB7 */ - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ + 
UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LVT] = - UINT16_C(1 << CHAR_BREAK_PROP_HANGUL_T) | /* GB8 */ - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_PREPEND] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK) | /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ (UINT16_C(0xFFFF) & - ~(UINT16_C(1 << CHAR_BREAK_PROP_CR) | - UINT16_C(1 << CHAR_BREAK_PROP_LF) | - UINT16_C(1 << CHAR_BREAK_PROP_CONTROL) + ~(UINT16_C(1) << CHAR_BREAK_PROP_CR | + UINT16_C(1) << CHAR_BREAK_PROP_LF | + UINT16_C(1) << CHAR_BREAK_PROP_CONTROL ) ), /* GB9b */ [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_SPACINGMARK] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ 
[CHAR_BREAK_PROP_ZWJ] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | /* GB9 */ - UINT16_C(1 << CHAR_BREAK_PROP_SPACINGMARK), /* GB9a */ + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ + UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ }; static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), + UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND) | - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ), + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ, [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1 << CHAR_BREAK_PROP_ZWJ) | - UINT16_C(1 << CHAR_BREAK_PROP_EXTEND), + UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | + UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, }; static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1 << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC), + UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, }; static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = - UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), + UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = - UINT16_C(1 << CHAR_BREAK_PROP_REGIONAL_INDICATOR), + UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; static inline enum char_break_property @@ -135,23 +134,23 @@ 
grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA state->gb11_flag = flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * state->gb11_flag] & - UINT16_C(1 << cp1_prop); + UINT16_C(1) << cp1_prop; state->gb12_13_flag = flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS * state->gb12_13_flag] & - UINT16_C(1 << cp1_prop); + UINT16_C(1) << cp1_prop; /* * Apply grapheme cluster breaking algorithm (UAX #29), see * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ - notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) || + notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || (dont_break_gb11[cp0_prop + state->gb11_flag * NUM_CHAR_BREAK_PROPS] & - UINT16_C(1 << cp1_prop)) || + (UINT16_C(1) << cp1_prop)) || (dont_break_gb12_13[cp0_prop + state->gb12_13_flag * NUM_CHAR_BREAK_PROPS] & - UINT16_C(1 << cp1_prop)); + (UINT16_C(1) << cp1_prop)); /* update or reset flags (when we have a break) */ if (likely(!notbreak)) { @@ -168,9 +167,9 @@ grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, GRAPHEME_STA * Given we have no state, this behaves as if the state-booleans * were all set to false */ - notbreak = (dont_break[cp0_prop] & UINT16_C(1 << cp1_prop)) || - (dont_break_gb11[cp0_prop] & UINT16_C(1 << cp1_prop)) || - (dont_break_gb12_13[cp0_prop] & UINT16_C(1 << cp1_prop)); + notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || + (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) || + (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop)); } return !notbreak; diff --git a/src/line.c b/src/line.c @@ -1,8 +1,6 @@ /* See LICENSE file for copyright and license details. */ #include <stdbool.h> #include <stddef.h> -#include <stdlib.h> -#include <string.h> #include "../gen/line.h" #include "../grapheme.h" diff --git a/src/sentence.c b/src/sentence.c @@ -1,8 +1,6 @@ /* See LICENSE file for copyright and license details. 
*/ #include <stdbool.h> #include <stddef.h> -#include <stdlib.h> -#include <string.h> #include "../gen/sentence.h" #include "../grapheme.h" diff --git a/src/utf8.c b/src/utf8.c @@ -1,5 +1,6 @@ /* See LICENSE file for copyright and license details. */ -#include <stdio.h> +#include <stddef.h> +#include <stdint.h> #include "../grapheme.h" #include "util.h" diff --git a/src/util.c b/src/util.c @@ -1,7 +1,8 @@ /* See LICENSE file for copyright and license details. */ +#include <limits.h> #include <stdbool.h> +#include <stddef.h> #include <stdint.h> -#include <stdlib.h> #include "../gen/types.h" #include "../grapheme.h" @@ -88,6 +89,12 @@ herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) } } +size_t +herodotus_reader_number_read(const HERODOTUS_READER *r) +{ + return r->off; +} + enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) { @@ -202,7 +209,7 @@ herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) } size_t -herodotus_writer_number_written(HERODOTUS_WRITER *w) +herodotus_writer_number_written(const HERODOTUS_WRITER *w) { return w->off; } diff --git a/src/word.c b/src/word.c @@ -1,8 +1,6 @@ /* See LICENSE file for copyright and license details. */ #include <stdbool.h> #include <stddef.h> -#include <stdlib.h> -#include <string.h> #include "../gen/word.h" #include "../grapheme.h"