utf8proc

A clean C library for processing UTF-8 Unicode data
git clone https://git.sinitax.com/juliastrings/utf8proc
Log | Files | Refs | README | LICENSE | sfeed.txt

commit a8b688c734d6dac677f7c68a2d915f85baf41b2b
parent 50381b951a2b156c1c236c77d34ac0fddbc0ea46
Author: Tony Kelman <tony@kelman.net>
Date:   Sun,  8 Mar 2015 15:33:27 -0700

Minimal cmake build script

move flags for MSVC

rename lump.txt to lump.md, add data/*.txt to .gitignore

Diffstat:
M .gitignore     |  2 +-
A CMakeLists.txt | 22 ++++++++++++++++++++++
M NEWS.md        |  2 +-
A lump.md        | 27 +++++++++++++++++++++++++++
D lump.txt       | 26 --------------------------
M utf8proc.h     |  2 +-
A utils.cmake    | 20 ++++++++++++++++++++
7 files changed, 72 insertions(+), 29 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -8,8 +8,8 @@ *.dll *.dylib *.dSYM -*.txt *.out +data/*.txt bench/bench bench/icu bench/unistring diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required (VERSION 2.8) + +include (utils.cmake) + +disallow_intree_builds() + +project (utf8proc C) + +add_definitions ( + -DUTF8PROC_EXPORTS +) + +if (NOT MSVC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -std=c99 -pedantic -Wall") +endif () + +add_library (utf8proc + utf8proc.c + utf8proc.h +) + +set_property (TARGET utf8proc PROPERTY POSITION_INDEPENDENT_CODE ON) diff --git a/NEWS.md b/NEWS.md @@ -105,7 +105,7 @@ Release of version 1.0.1 2006-09-17: -- added the `LUMP` option, which lumps certain characters together (see `lump.txt`) (also used for the PostgreSQL `unifold` function) +- added the `LUMP` option, which lumps certain characters together (see `lump.md`) (also used for the PostgreSQL `unifold` function) - added the `STRIPMARK` option, which strips marking characters (or marks of composed characters) - deprecated ruby method `String#char_ary` in favour of `String#utf8chars` diff --git a/lump.md b/lump.md @@ -0,0 +1,27 @@ +``` +U+0020 <-- all space characters (general category Zs) +U+0027 ' <-- left/right single quotation mark U+2018..2019, + modifier letter apostrophe U+02BC, + modifier letter vertical line U+02C8 +U+002D - <-- all dash characters (general category Pd), + minus U+2212 +U+002F / <-- fraction slash U+2044, + division slash U+2215 +U+003A : <-- ratio U+2236 +U+003C < <-- single left-pointing angle quotation mark U+2039, + left-pointing angle bracket U+2329, + left angle bracket U+3008 +U+003E > <-- single right-pointing angle quotation mark U+203A, + right-pointing angle bracket U+232A, + right angle bracket U+3009 +U+005C \ <-- set minus U+2216 +U+005E ^ <-- modifier letter up arrowhead U+02C4, + modifier letter circumflex accent U+02C6, + caret U+2038, + up arrowhead U+2303 +U+005F _ <-- all connector characters (general category 
Pc), + modifier letter low macron U+02CD +U+0060 ` <-- modifier letter grave accent U+02CB +U+007C | <-- divides U+2223 +U+007E ~ <-- tilde operator U+223C +``` diff --git a/lump.txt b/lump.txt @@ -1,26 +0,0 @@ -U+0020 <-- all space characters (general category Zs) -U+0027 ' <-- left/right single quotation mark U+2018..2019, - modifier letter apostrophe U+02BC, - modifier letter vertical line U+02C8 -U+002D - <-- all dash characters (general category Pd), - minus U+2212 -U+002F / <-- fraction slash U+2044, - division slash U+2215 -U+003A : <-- ratio U+2236 -U+003C < <-- single left-pointing angle quotation mark U+2039, - left-pointing angle bracket U+2329, - left angle bracket U+3008 -U+003E > <-- single right-pointing angle quotation mark U+203A, - right-pointing angle bracket U+232A, - right angle bracket U+3009 -U+005C \ <-- set minus U+2216 -U+005E ^ <-- modifier letter up arrowhead U+02C4, - modifier letter circumflex accent U+02C6, - caret U+2038, - up arrowhead U+2303 -U+005F _ <-- all connector characters (general category Pc), - modifier letter low macron U+02CD -U+0060 ` <-- modifier letter grave accent U+02CB -U+007C | <-- divides U+2223 -U+007E ~ <-- tilde operator U+223C - diff --git a/utf8proc.h b/utf8proc.h @@ -140,7 +140,7 @@ extern "C" { * is representing a single grapheme cluster (see UAX#29). * LUMP: Lumps certain characters together * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-"). - * (See lump.txt for details.) + * (See lump.md for details.) * If NLF2LF is set, this includes a transformation of * paragraph and line separators to ASCII line-feed (LF). * STRIPMARK: Strips all character markings diff --git a/utils.cmake b/utils.cmake @@ -0,0 +1,20 @@ + +function (disallow_intree_builds) + # Adapted from LLVM's toplevel CMakeLists.txt file + if( CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR AND NOT MSVC_IDE ) + message(FATAL_ERROR " + In-source builds are not allowed. CMake would overwrite the + makefiles distributed with utf8proc. 
Please create a directory + and run cmake from there. Building in a subdirectory is + fine, e.g.: + + mkdir build + cd build + cmake .. + + This process created the file `CMakeCache.txt' and the + directory `CMakeFiles'. Please delete them. + + ") + endif() +endfunction()