Improve UTF-8 support in some natives (bug 6475) (#407)

* Compile as static library, update AMBuildScript and link to core

* Update VS project files to include the library

* Add UTF-8 Rewind library (v1.5.1) to third_party directory

* Update ACKNOWLEDGEMENTS.txt

* Move AMXX buffer in its own function

* Move constants from string.inc to string_const.inc and update project files

* Move stocks from string.inc to string_stocks.inc and update project files

* Improve UTF-8 support in containi() and update documentation

* Improve UTF-8 support in strcmp() and update documentation

* Improve UTF-8 support in strfind() and update documentation

It is worth noting that this native was not working properly with ignorecase set. It was so broken that no one reported the issue.
This also adds a safety check so that the "pos" parameter cannot go below 0.

* Improve UTF-8 support in strncmp() and update documentation

* Improve UTF-8 support in equali() and update documentation

* Add an option to some UTF-8 Rewind functions for avoiding invalid data to be replaced

By default it replaces any invalid byte or sequence of bytes with U+FFFD (3 bytes in UTF-8). This can be problematic when the input buffer is not changed (from a plugin) and some natives need to calculate a position from the converted string: with such a replacement, the position is displaced because the final string is longer than the input.

This compiles the library as C++, because I added some silly param with a default value, which is not supported by C.

* Improve UTF-8 support in replace_string/ex() and update documentation

* Add is_string_category() and update documentation

* Update a little testsuite plugin (and fix linux compilation)

* Add mb_strtolower/upper() and update documentation

* Add mb_ucfirst() and update documentation

* Add mb_strtotitle() and update documentation

* Improve UTF-8 support in get_players() and find_player() with name/case insensitive flags set

* Fix KliPPy's complaint
This commit is contained in:
Vincent Herbet
2017-08-05 10:32:16 +02:00
committed by GitHub
parent 07c3d49cfa
commit ab854ec035
34 changed files with 20166 additions and 532 deletions

26
third_party/utf8rewind/AMBuilder vendored Normal file
View File

@ -0,0 +1,26 @@
# vim: sts=2 ts=8 sw=2 tw=99 et ft=python:
# AMBuild script for the vendored UTF-8 Rewind sources: declares a static
# library target named 'utf8rewind' and registers it with the builder.
# NOTE(review): `builder` is not defined in this script; presumably AMBuild
# injects it before running the file - confirm against the AMBuild docs.
# NOTE(review): `platform` is imported but never used in the lines below.
import os, platform
# Static library target; the string is the name given to the library.
lib = builder.compiler.StaticLibrary('utf8rewind')
# Preprocessor define added for every source of this target.
lib.compiler.defines += [
'UTF8REWINDS_EXPORTS',
]
# Add the library's own directory to the include path so its sources can
# include "utf8rewind.h" and the headers under internal/.
lib.compiler.includes += [
os.path.join(builder.sourcePath, 'third_party', 'utf8rewind'),
]
# Source files of the library, given relative to this script.
lib.sources += [
'utf8rewind.c',
'unicodedatabase.c',
'internal/casemapping.c',
'internal/codepoint.c',
'internal/composition.c',
'internal/database.c',
'internal/decomposition.c',
'internal/seeking.c',
'internal/streaming.c',
]
# Register the target. NOTE(review): `rvalue` is presumably how the result is
# handed back to the script that included this one - verify in the caller.
rvalue = builder.Add(lib)

147
third_party/utf8rewind/internal/base.h vendored Normal file
View File

@ -0,0 +1,147 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_BASE_H_
#define _UTF8REWIND_INTERNAL_BASE_H_
/*!
\file
\brief Base header for internal interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
	UTF8_UNUSED(_parameter)
	Wraps a parameter declaration that is intentionally left unused.
	When __GNUC__ is defined and COMPILER_ICC is not, the parameter is tagged
	with __attribute__((unused)); otherwise the macro expands to the bare
	parameter.
*/
#if defined(__GNUC__) && !defined(COMPILER_ICC)
#define UTF8_UNUSED(_parameter) _parameter __attribute__ ((unused))
#else
#define UTF8_UNUSED(_parameter) _parameter
#endif
/*
	UTF8_SET_ERROR(_error)
	Stores UTF8_ERR_<_error> through the pointer named `errors`, which must be
	in scope at the expansion site, unless that pointer is null.
	NOTE(review): the macro expands to a bare `if` statement, so it should not
	be used as the unbraced body of an `if` that has its own `else`.
*/
#define UTF8_SET_ERROR(_error) \
if (errors != 0) { *errors = UTF8_ERR_ ## _error; }
/*
	UTF8_VALIDATE_PARAMETERS_CHAR(_inputType, _result)
	Validates the parameters of a conversion whose output is a char buffer.
	Expects `input`, `inputSize`, `target`, `targetSize` and `errors` to be in
	scope and returns from the enclosing function on failure:
	- null input: INVALID_DATA, returns _result;
	- input shorter than one _inputType: writes the replacement character
	  string to `target` when it is non-null (NOT_ENOUGH_SPACE if fewer than
	  3 bytes are available), sets INVALID_DATA and returns
	  _result + REPLACEMENT_CHARACTER_STRING_LENGTH;
	- non-null target with zero size: NOT_ENOUGH_SPACE, returns _result;
	- input and target identical or overlapping: OVERLAPPING_PARAMETERS,
	  returns _result.
	Overlap is detected with the separating axis theorem: the two ranges
	overlap when the distance between their centers is smaller than half the
	sum of their sizes.
*/
#define UTF8_VALIDATE_PARAMETERS_CHAR(_inputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
if (target != 0) { \
if (targetSize < 3) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
memcpy(target, REPLACEMENT_CHARACTER_STRING, REPLACEMENT_CHARACTER_STRING_LENGTH); \
} \
UTF8_SET_ERROR(INVALID_DATA); \
return _result + REPLACEMENT_CHARACTER_STRING_LENGTH; \
} \
if (target != 0 && targetSize == 0) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*
	UTF8_VALIDATE_PARAMETERS_CHAR_NOCR(_inputType, _result)
	Same checks as UTF8_VALIDATE_PARAMETERS_CHAR, except that input shorter
	than one _inputType never writes the replacement character: it only sets
	INVALID_DATA and returns _result unchanged.
	Expects `input`, `inputSize`, `target`, `targetSize` and `errors` to be in
	scope and returns from the enclosing function on failure.
*/
#define UTF8_VALIDATE_PARAMETERS_CHAR_NOCR(_inputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
if (target != 0 && targetSize == 0) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*
	UTF8_VALIDATE_PARAMETERS(_inputType, _outputType, _result)
	Validates the parameters of a conversion whose output elements are of type
	_outputType. Expects `input`, `inputSize`, `target`, `targetSize` and
	`errors` to be in scope and returns from the enclosing function on failure:
	- null input: INVALID_DATA, returns _result;
	- input shorter than one _inputType: stores REPLACEMENT_CHARACTER in
	  *target when target is non-null (NOT_ENOUGH_SPACE if it cannot hold one
	  _outputType), sets INVALID_DATA and returns
	  _result + sizeof(_outputType);
	- non-null target smaller than one _outputType: NOT_ENOUGH_SPACE;
	- input and target identical or overlapping: OVERLAPPING_PARAMETERS.
	Overlap is detected by comparing the distance between the centers of the
	two ranges with half the sum of their sizes.
*/
#define UTF8_VALIDATE_PARAMETERS(_inputType, _outputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
if (target != 0) { \
if (targetSize < sizeof(_outputType)) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
*target = REPLACEMENT_CHARACTER; \
} \
UTF8_SET_ERROR(INVALID_DATA); \
return _result + sizeof(_outputType); \
} \
if (target != 0 && targetSize < sizeof(_outputType)) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == (char*)target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = (char*)target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_BASE_H_ */

View File

@ -0,0 +1,663 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "casemapping.h"
#include "base.h"
#include "codepoint.h"
#include "database.h"
#include "streaming.h"
/*
	Lowercase mapping for U+0041 ('A') through U+007A ('z'), indexed by
	(code point - 0x41). Letters map to their lowercase form; the six
	punctuation marks between the two alphabets map to themselves.
*/
static const char basic_latin_lowercase_table[58] = {
	/* U+0041 - U+005A: 'A' - 'Z' become 'a' - 'z' */
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
	/* U+005B - U+0060: punctuation is left untouched */
	'[', '\\', ']', '^', '_', '`',
	/* U+0061 - U+007A: 'a' - 'z' are already lowercase */
	'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
	'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
};
/*
	Uppercase mapping for U+0041 ('A') through U+007A ('z'), indexed by
	(code point - 0x41). Letters map to their uppercase form; the six
	punctuation marks between the two alphabets map to themselves.
*/
static const char basic_latin_uppercase_table[58] = {
	/* U+0041 - U+005A: 'A' - 'Z' are already uppercase */
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
	/* U+005B - U+0060: punctuation is left untouched */
	'[', '\\', ']', '^', '_', '`',
	/* U+0061 - U+007A: 'a' - 'z' become 'A' - 'Z' */
	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
	'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
};
/*
	Prepares a case mapping state for a new conversion.

	The state is zeroed first, so it is in a defined (empty) condition even
	when initialization fails. The locale is then validated and, when it is
	acceptable, the input/output cursors, the property tables, the quick check
	mask and the locale are stored in the state.

	Returns 1 on success. Returns 0 and reports INVALID_LOCALE through `errors`
	(when non-null) if `locale` is not below UTF8_LOCALE_MAXIMUM.
*/
uint8_t casemapping_initialize(
	CaseMappingState* state,
	const char* input, size_t inputSize,
	char* target, size_t targetSize,
	const uint32_t* propertyIndex1, const uint32_t* propertyIndex2, const uint32_t* propertyData,
	uint8_t quickCheck, size_t locale,
	int32_t* errors)
{
	memset(state, 0, sizeof(*state));

	if (locale < UTF8_LOCALE_MAXIMUM)
	{
		/* Source and destination cursors */
		state->src = input;
		state->src_size = inputSize;
		state->dst = target;
		state->dst_size = targetSize;

		/* Lookup tables driving the mapping */
		state->property_index1 = propertyIndex1;
		state->property_index2 = propertyIndex2;
		state->property_data = propertyData;

		/* Mapping options */
		state->quickcheck_flags = quickCheck;
		state->locale = locale;

		return 1;
	}

	UTF8_SET_ERROR(INVALID_LOCALE);

	return 0;
}
/*
	Case-maps the next code point (or combining sequence) of the input.

	One code point is decoded from state->src with codepoint_read(), mapped
	according to state->locale and the tables stored in the state, and the
	UTF-8 result is written to state->dst. The source and destination cursors
	in the state are advanced past what was consumed and produced.

	The mapping direction is not a parameter: lowercasing is recognized by
	comparing state->property_data with LowercaseDataPtr; any other table
	takes the non-lowercase branches.

	state           State prepared by casemapping_initialize().
	errors          Optional error output, written only on failure.
	no_replacement  When non-zero, a byte that decodes to
	                REPLACEMENT_CHARACTER is consumed one byte at a time and
	                copied to the output as-is instead of being replaced by
	                REPLACEMENT_CHARACTER_STRING.

	Returns the number of bytes the mapped result occupies. On the paths that
	write directly in this function, a null state->dst skips the write and
	only the size is reported. Returns 0 after reporting INVALID_DATA (nothing
	could be read) or NOT_ENOUGH_SPACE (output too small); in both cases
	state->src_size is forced to 0.
*/
size_t casemapping_execute(CaseMappingState* state, int32_t* errors, int no_replacement)
{
uint8_t qc_casemapped = 0;
uint8_t bytes_needed = 0;
const char* resolved = 0;
StreamState stream;
uint8_t i;
/* Read next code point */
state->last_code_point_size = codepoint_read(state->src, state->src_size, &state->last_code_point);
if (state->last_code_point_size == 0)
{
goto invaliddata;
}
/* Check for invalid characters */
if (state->last_code_point == REPLACEMENT_CHARACTER)
{
/* If option set, we want to avoid invalid byte to be replaced. Forces size to 1 to read the next byte. */
if (no_replacement)
{
state->last_code_point_size = 1;
}
else
{
/* Get code point properties */
state->last_canonical_combining_class = CCC_NOT_REORDERED;
state->last_general_category = UTF8_CATEGORY_SYMBOL_OTHER;
resolved = REPLACEMENT_CHARACTER_STRING;
bytes_needed = REPLACEMENT_CHARACTER_STRING_LENGTH;
goto writeresolved;
}
}
if (state->locale == UTF8_LOCALE_TURKISH_AND_AZERI_LATIN)
{
/*
Code point General Category does not need to be modified, because
all mappings result in the same General Category
*/
if (state->property_data == LowercaseDataPtr)
{
if (state->last_code_point == CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
{
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
resolved = "i";
bytes_needed = 1;
}
else if (
state->last_code_point == CP_LATIN_CAPITAL_LETTER_I)
{
/*
NOTE(review): the source cursor has not been advanced yet and
codepoint_read() returns 0 for an empty input, so src_size looks
like it cannot be 0 here - confirm whether this branch is reachable.
*/
if (state->src_size == 0)
{
/* Early-out for easy case */
state->last_code_point = CP_LATIN_SMALL_LETTER_DOTLESS_I;
resolved = "\xC4\xB1";
bytes_needed = 2;
}
else
{
uint8_t found = 0;
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Erase COMBINING DOT ABOVE from sequence */
/* Index 0 holds the base letter and is deliberately not inspected */
for (i = stream.current - 1; i > 0; --i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
stream.canonical_combining_class[i] = CCC_INVALID;
found++;
}
}
/* Stabilize sequence and write to output */
if (!stream.stable ||
found > 0)
{
stream_reorder(&stream);
stream.current -= found;
}
/* A removed dot means the sequence was "I + dot above", which lowercases to a plain 'i' */
stream.codepoint[0] = (found > 0) ? CP_LATIN_SMALL_LETTER_I : CP_LATIN_SMALL_LETTER_DOTLESS_I;
goto writestream;
}
}
}
else
{
if (state->last_code_point == CP_LATIN_SMALL_LETTER_I)
{
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
resolved = "\xC4\xB0";
bytes_needed = 2;
}
else if (
state->last_code_point == CP_LATIN_SMALL_LETTER_DOTLESS_I)
{
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I;
resolved = "I";
bytes_needed = 1;
}
}
/* Check if mapping succeeded */
if (resolved != 0)
{
/* Code point properties */
state->last_general_category = UTF8_CATEGORY_LETTER;
goto writeresolved;
}
/* No locale-specific mapping applied: execution continues at writeregular below */
}
else if (
state->locale == UTF8_LOCALE_LITHUANIAN)
{
if (state->property_data == LowercaseDataPtr)
{
unicode_t cp_additional_accent = 0;
uint8_t write_soft_dot = 1;
switch (state->last_code_point)
{
case CP_LATIN_CAPITAL_LETTER_I:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
break;
case CP_LATIN_CAPITAL_LETTER_J:
state->last_code_point = CP_LATIN_SMALL_LETTER_J;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
state->last_code_point = CP_LATIN_SMALL_LETTER_I_WITH_OGONEK;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_GRAVE_ACCENT;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_ACUTE_ACCENT;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_TILDE_ACCENT;
break;
default:
goto writeregular;
}
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Assign the lowercase code point to the start of the stream */
stream.codepoint[0] = state->last_code_point;
/* Check if COMBINING DOT ABOVE is not yet present */
for (i = stream.current - 1; i > 0; --i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
write_soft_dot = 0;
break;
}
}
/* Stabilize the sequence */
if (!stream.stable)
{
stream_reorder(&stream);
stream.stable = 1;
}
/* Write COMBINING DOT ABOVE */
if (write_soft_dot &&
stream.current < STREAM_BUFFER_MAX)
{
/* Ensure the COMBINING DOT ABOVE comes before other accents with the same CCC */
if (stream.canonical_combining_class[stream.current - 1] == CCC_ABOVE)
{
unicode_t cp_swap = stream.codepoint[stream.current - 1];
stream.codepoint[stream.current - 1] = CP_COMBINING_DOT_ABOVE;
stream.codepoint[stream.current] = cp_swap;
}
else
{
stream.codepoint[stream.current] = CP_COMBINING_DOT_ABOVE;
}
stream.canonical_combining_class[stream.current] = CCC_ABOVE;
/* Check if sequence has become unstable */
stream.stable = stream.canonical_combining_class[stream.current - 1] <= CCC_ABOVE;
stream.current++;
}
/* Write additional accent */
if (cp_additional_accent != 0 &&
stream.current < STREAM_BUFFER_MAX)
{
/* Additional accents are always of the upper variety */
stream.codepoint[stream.current] = cp_additional_accent;
stream.canonical_combining_class[stream.current] = CCC_ABOVE;
/* Check if sequence has become unstable */
if (stream.stable &&
stream.canonical_combining_class[stream.current] < stream.canonical_combining_class[stream.current - 1])
{
stream.stable = 0;
}
stream.current++;
}
/* Stabilize the sequence */
if (!stream.stable)
{
stream_reorder(&stream);
}
}
else
{
uint8_t erase_count = 0;
switch (state->last_code_point)
{
case CP_LATIN_SMALL_LETTER_I:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I;
break;
case CP_LATIN_SMALL_LETTER_J:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_J;
break;
case CP_LATIN_SMALL_LETTER_I_WITH_OGONEK:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK;
break;
default:
goto writeregular;
}
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Assign the uppercase code point to the start of the stream */
stream.codepoint[0] = state->last_code_point;
/* Remove COMBINING DOT ABOVE from sequence */
for (i = 1; i < stream.current; ++i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
stream.canonical_combining_class[i] = CCC_INVALID;
erase_count++;
}
}
/* Stabilize the sequence */
if (!stream.stable ||
erase_count > 0)
{
stream_reorder(&stream);
stream.current -= erase_count;
}
}
goto writestream;
}
/* Default path: locale-independent mapping of the code point that was read */
writeregular:
/* Get code point properties */
state->last_canonical_combining_class = PROPERTY_GET_CCC(state->last_code_point);
state->last_general_category = PROPERTY_GET_GC(state->last_code_point);
/* Move source cursor */
if (state->src_size >= state->last_code_point_size)
{
state->src += state->last_code_point_size;
state->src_size -= state->last_code_point_size;
}
else
{
state->src_size = 0;
}
/* Write to output */
if (state->last_code_point_size == 1)
{
/* Write Basic Latin to output buffer*/
if (state->dst != 0)
{
if (state->dst_size < 1)
{
goto outofspace;
}
/*
Uppercase letters are U+0041 ('A') to U+005A ('Z')
Lowercase letters are U+0061 ('a') to U+007A ('z')
*/
if (state->last_code_point >= 0x41 &&
state->last_code_point <= 0x7A)
{
if (state->property_data == LowercaseDataPtr)
{
*state->dst = basic_latin_lowercase_table[state->last_code_point - 0x41];
}
else
{
*state->dst = basic_latin_uppercase_table[state->last_code_point - 0x41];
}
}
else
{
/* All other code points in Basic Latin are unaffected by case mapping */
if (no_replacement && state->last_code_point == REPLACEMENT_CHARACTER)
{
/* The source cursor was already advanced, so step back to copy the original invalid byte */
*state->dst = (char)*(state->src - state->last_code_point_size);
}
else
{
*state->dst = (char)state->last_code_point;
}
}
state->dst++;
state->dst_size--;
}
bytes_needed = 1;
}
else
{
if (state->property_data == LowercaseDataPtr &&
state->last_code_point == CP_GREEK_CAPITAL_LETTER_SIGMA)
{
/*
If the final letter of a word (defined as "a collection of code
points with the General Category 'Letter'") is a GREEK CAPITAL
LETTER SIGMA and more than one code point was processed, the
lowercase version is U+03C2 GREEK SMALL LETTER FINAL SIGMA
instead of U+03C3 GREEK SMALL LETTER SIGMA.
*/
/* At least one code point should have been read */
uint8_t should_convert = state->total_bytes_needed > 0;
if (state->src_size > 0)
{
unicode_t peeked = 0;
const char* peeked_src = state->src;
size_t peeked_src_size = state->src_size;
/* Look ahead on a private copy of the cursor; the state itself is not advanced */
while (1)
{
uint8_t peeked_read = 0;
/* Peek next code point */
if ((peeked_read = codepoint_read(peeked_src, peeked_src_size, &peeked)) == 0 ||
peeked_src_size < peeked_read)
{
should_convert = 1;
break;
}
/* Convert if the "word" has ended */
if (PROPERTY_GET_CCC(peeked) == CCC_NOT_REORDERED)
{
should_convert = (PROPERTY_GET_GC(peeked) & UTF8_CATEGORY_LETTER) == 0;
break;
}
peeked_src += peeked_read;
peeked_src_size -= peeked_read;
}
}
/* Write the converted code point to the output buffer */
bytes_needed = 2;
if (state->dst != 0)
{
if (state->dst_size < bytes_needed)
{
goto outofspace;
}
memcpy(state->dst, should_convert ? "\xCF\x82" : "\xCF\x83", bytes_needed);
state->dst += bytes_needed;
state->dst_size -= bytes_needed;
}
return bytes_needed;
}
/* Check if the code point is case mapped */
qc_casemapped = PROPERTY_GET_CM(state->last_code_point);
if ((qc_casemapped & state->quickcheck_flags) != 0)
{
/* Attempt to resolve the case mapping */
resolved = database_querydecomposition(state->last_code_point, state->property_index1, state->property_index2, state->property_data, &bytes_needed);
if (resolved != 0)
{
/* Code point properties */
state->last_general_category = UTF8_CATEGORY_LETTER;
/* The source cursor was already moved above, so skip that step */
goto writeresolvedonly;
}
}
/* Write code point unchanged to output */
bytes_needed = codepoint_write(state->last_code_point, &state->dst, &state->dst_size);
if (bytes_needed == 0)
{
goto outofspace;
}
}
return bytes_needed;
/* Consume the code point that was read, then emit the `resolved` string */
writeresolved:
/* Move source cursor */
if (state->src_size >= state->last_code_point_size)
{
state->src += state->last_code_point_size;
state->src_size -= state->last_code_point_size;
}
else
{
state->src_size = 0;
}
/* Emit the `resolved` string without touching the source cursor */
writeresolvedonly:
/* Write resolved string to output */
if (state->dst != 0)
{
if (state->dst_size < bytes_needed)
{
goto outofspace;
}
memcpy(state->dst, resolved, bytes_needed);
state->dst += bytes_needed;
state->dst_size -= bytes_needed;
}
return bytes_needed;
/* Emit the whole sequence held in `stream` and adopt the stream's source cursor */
writestream:
/* Get code point properties */
state->last_code_point = stream.codepoint[stream.current - 1];
state->last_canonical_combining_class = stream.canonical_combining_class[stream.current - 1];
state->last_general_category = PROPERTY_GET_GC(stream.codepoint[0]);
/* Move source cursor */
state->src = stream.src;
state->src_size = stream.src_size;
/* Write result to the output buffer */
if (!stream_write(&stream, &state->dst, &state->dst_size, &bytes_needed))
{
goto outofspace;
}
return bytes_needed;
/* Nothing could be read from the input: report the error and stop the conversion */
invaliddata:
UTF8_SET_ERROR(INVALID_DATA);
state->src_size = 0;
return 0;
/* The output buffer is too small: report the error and stop the conversion */
outofspace:
UTF8_SET_ERROR(NOT_ENOUGH_SPACE);
state->src_size = 0;
return 0;
}

View File

@ -0,0 +1,67 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_CASEMAPPING_H_
#define _UTF8REWIND_INTERNAL_CASEMAPPING_H_
/*!
\file
\brief Case mapping interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
	State shared between casemapping_initialize() and successive calls to
	casemapping_execute(). It holds the input/output cursors, the lookup
	tables that drive the mapping and the properties of the code point that
	was processed last.
*/
typedef struct {
const char* src; /* Read cursor into the input */
char* dst; /* Write cursor into the output; may be null */
size_t src_size; /* Input bytes remaining at src */
size_t dst_size; /* Output bytes remaining at dst */
size_t total_bytes_needed; /* Read by the Greek sigma handling; presumably maintained by the caller - TODO confirm */
unicode_t last_code_point; /* Code point processed last */
size_t locale; /* Locale identifier, below UTF8_LOCALE_MAXIMUM */
const uint32_t* property_index1; /* First index table for the mapping lookup */
const uint32_t* property_index2; /* Second index table for the mapping lookup */
const uint32_t* property_data; /* Mapping data; also identifies the mapping direction */
uint32_t last_general_category; /* General Category of the last code point */
uint8_t last_code_point_size; /* Input bytes consumed by the last code point */
uint8_t last_canonical_combining_class; /* Canonical Combining Class of the last code point */
uint8_t quickcheck_flags; /* Mask tested against the code point's case mapping property */
} CaseMappingState;
/*
	Zeroes `state` and fills it with the input and output buffers, the
	property tables, the quick check mask and the locale of a new case mapping
	run.

	Returns 1 on success, or 0 with INVALID_LOCALE stored in `errors` (when it
	is non-null) if `locale` is not below UTF8_LOCALE_MAXIMUM.
*/
uint8_t casemapping_initialize(
CaseMappingState* state,
const char* input, size_t inputSize,
char* target, size_t targetSize,
const uint32_t* propertyIndex1, const uint32_t* propertyIndex2, const uint32_t* propertyData,
uint8_t quickCheck, size_t locale,
int32_t* errors);
/*
	Case-maps the next code point (or combining sequence) referenced by
	`state` and advances its cursors. Returns the number of bytes of the
	mapped result, or 0 after storing INVALID_DATA or NOT_ENOUGH_SPACE in
	`errors` (when it is non-null). When `no_replacement` is non-zero, invalid
	bytes are passed through unchanged instead of being replaced by the
	replacement character.
*/
size_t casemapping_execute(CaseMappingState* state, int32_t* errors, int no_replacement);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_CASEMAPPING_H_ */

View File

@ -0,0 +1,272 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "codepoint.h"
/*
	Length of a UTF-8 sequence, indexed by the value of its first byte.
	A length of 0 marks a continuation byte appearing where a lead byte was
	expected and 7 marks a byte that never starts a sequence.
*/
const uint8_t codepoint_decoded_length[256] = {
	/* 0x00 - 0x7F: Basic Latin, one byte */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */

	/* 0x80 - 0xBF: malformed continuation byte */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */

	/* 0xC0 - 0xDF: two bytes */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */

	/* 0xE0 - 0xEF: three bytes */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	/* 0xF0 - 0xF7: four bytes */
	4, 4, 4, 4, 4, 4, 4, 4,

	/* 0xF8 - 0xFB: five bytes */
	5, 5, 5, 5,

	/* 0xFC - 0xFD: six bytes */
	6, 6,

	/* 0xFE - 0xFF: invalid */
	7, 7
};
/*
	Encodes a code point as UTF-8.

	encoded     Code point to encode. Values above MAX_LEGAL_UNICODE are
	            written as REPLACEMENT_CHARACTER instead.
	target      Address of the output cursor. When the cursor itself is null,
	            nothing is written and only the length is computed; otherwise
	            the cursor is advanced past the bytes written.
	targetSize  Address of the remaining output size, reduced by the number of
	            bytes written.

	Returns the length of the encoded sequence in bytes, or 0 when the output
	cursor is non-null but has less room than the sequence needs (nothing is
	written in that case).
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize)
{
	uint8_t length;
	char* out;

	/* Work out how many bytes the code point occupies */
	if (encoded > MAX_LEGAL_UNICODE)
	{
		encoded = REPLACEMENT_CHARACTER;
		length = REPLACEMENT_CHARACTER_STRING_LENGTH;
	}
	else if (encoded > MAX_BASIC_MULTILINGUAL_PLANE)
	{
		length = 4;
	}
	else if (encoded > 0x7FF)
	{
		length = 3;
	}
	else if (encoded > MAX_BASIC_LATIN)
	{
		length = 2;
	}
	else
	{
		length = 1;
	}

	/* Size query only */
	if (*target == 0)
	{
		return length;
	}

	if (*targetSize < length)
	{
		return 0;
	}

	out = *target;

	if (length == 1)
	{
		out[0] = (char)encoded;
	}
	else if (length == 2)
	{
		out[0] = (char)(encoded >> 6) | 0xC0;
		out[1] = (char)(encoded & 0x3F) | 0x80;
	}
	else if (length == 3)
	{
		out[0] = (char)(encoded >> 12) | 0xE0;
		out[1] = (char)((encoded >> 6) & 0x3F) | 0x80;
		out[2] = (char)(encoded & 0x3F) | 0x80;
	}
	else if (length == 4)
	{
		out[0] = (char)(encoded >> 18) | 0xF0;
		out[1] = (char)((encoded >> 12) & 0x3F) | 0x80;
		out[2] = (char)((encoded >> 6) & 0x3F) | 0x80;
		out[3] = (char)(encoded & 0x3F) | 0x80;
	}

	/* Advance the caller's cursor */
	*target += length;
	*targetSize -= length;

	return length;
}
/*
	Decodes one code point from a UTF-8 encoded buffer.

	input      Start of the data to decode.
	inputSize  Number of bytes available at input.
	decoded    Receives the decoded code point. REPLACEMENT_CHARACTER is
	           stored for malformed data: a stray continuation byte, an
	           invalid lead byte, a truncated or interrupted sequence, an
	           overlong encoding, a surrogate or a value above
	           MAX_LEGAL_UNICODE.

	Returns the number of input bytes that make up the (possibly malformed)
	sequence, or 0 when input is null or empty, in which case decoded is left
	untouched.

	The function never reads beyond input[inputSize - 1]: the number of bytes
	still available is tracked and checked before every continuation byte is
	inspected.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded)
{
	const uint8_t* src = (const uint8_t*)input;

	if (input == 0 ||
		inputSize == 0)
	{
		/* Invalid data */

		return 0;
	}

	if (*src <= MAX_BASIC_LATIN)
	{
		/* Basic Latin */

		*decoded = (unicode_t)*src;

		return 1;
	}
	else
	{
		/* Multi-byte sequence */

		/* Bits of the lead byte that carry payload, indexed by sequence length */
		static const uint8_t SequenceMask[7] = {
			0x00, 0x7F, 0x1F, 0x0F,
			0x07, 0x03, 0x01
		};

		/* Smallest code point that legitimately needs a sequence of that length */
		static const unicode_t SequenceMinimum[7] = {
			0x0000, 0x0000, 0x0080, 0x0800,
			0x10000, MAX_LEGAL_UNICODE, MAX_LEGAL_UNICODE
		};

		/*
			Bytes available after the lead byte. The lead byte has already
			been consumed, so it must not be counted here; otherwise the
			"not enough data" check below fires one byte too late and a
			truncated sequence causes a read of input[inputSize].
		*/
		size_t src_size = inputSize - 1;
		uint8_t src_index;

		/* Length of sequence is determined by first byte */

		uint8_t decoded_length = codepoint_decoded_length[*src];
		if (decoded_length < 1 ||
			decoded_length > 6)
		{
			/* Not a multi-byte sequence starter */

			*decoded = REPLACEMENT_CHARACTER;
			decoded_length = 1;
		}
		else if (decoded_length > 4)
		{
			/* Always an overlong sequence */

			*decoded = REPLACEMENT_CHARACTER;

			/* All bytes in the sequence must be processed */

			for (src_index = 1; src_index < decoded_length; ++src_index)
			{
				src++;

				/* Check if next byte is valid */

				if (src_size == 0 ||               /* Not enough data */
					(*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
				{
					return src_index;
				}

				src_size--;
			}
		}
		else
		{
			/* Use mask to strip value from first byte */

			*decoded = (unicode_t)(*src & SequenceMask[decoded_length]);

			/* All bytes in the sequence must be processed */

			for (src_index = 1; src_index < decoded_length; ++src_index)
			{
				src++;

				/* Check if next byte is valid */

				if (src_size == 0 ||               /* Not enough data */
					(*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
				{
					*decoded = REPLACEMENT_CHARACTER;

					return src_index;
				}

				src_size--;

				/* Add value of continuation byte to codepoint */

				*decoded = (*decoded << 6) | (*src & 0x3F);
			}

			/* Check for overlong sequences and surrogate pairs */

			if (*decoded < SequenceMinimum[decoded_length] ||
				*decoded > MAX_LEGAL_UNICODE ||
				(*decoded >= SURROGATE_HIGH_START && *decoded <= SURROGATE_LOW_END))
			{
				*decoded = REPLACEMENT_CHARACTER;
			}
		}

		return decoded_length;
	}
}

View File

@ -0,0 +1,291 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_CODEPOINT_H_
#define _UTF8REWIND_INTERNAL_CODEPOINT_H_
/*!
\file
\brief Codepoint interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*!
\addtogroup internal Internal functions and definitions
\{
*/
/*!
\def MAX_BASIC_LATIN
\brief The last codepoint part of Basic Latin (U+0000 - U+007F).
*/
#define MAX_BASIC_LATIN 0x007F
/*!
\def MAX_LATIN_1
\brief The last codepoint part of Latin-1 Supplement (U+0080 - U+00FF).
*/
#define MAX_LATIN_1 0x00FF
/*!
\def MAX_BASIC_MULTILINGUAL_PLANE
\brief The last legal codepoint in the Basic Multilingual Plane (BMP).
*/
#define MAX_BASIC_MULTILINGUAL_PLANE 0xFFFF
/*!
\def MAX_LEGAL_UNICODE
\brief The last legal codepoint in Unicode.
*/
#define MAX_LEGAL_UNICODE 0x10FFFF
/*!
\def REPLACEMENT_CHARACTER
\brief The codepoint used to replace illegal codepoints.
*/
#define REPLACEMENT_CHARACTER 0xFFFD
/*!
\def REPLACEMENT_CHARACTER_STRING
\brief The replacement character as a UTF-8 encoded string.
*/
#define REPLACEMENT_CHARACTER_STRING "\xEF\xBF\xBD"
/*!
\def REPLACEMENT_CHARACTER_STRING_LENGTH
\brief Length of the UTF-8 encoded string of the replacment character.
*/
#define REPLACEMENT_CHARACTER_STRING_LENGTH 3
/*!
\def SURROGATE_HIGH_START
\brief The minimum codepoint for the high member of a surrogate pair.
*/
#define SURROGATE_HIGH_START 0xD800
/*!
\def SURROGATE_HIGH_END
\brief The maximum codepoint for the high member of a surrogate pair.
*/
#define SURROGATE_HIGH_END 0xDBFF
/*!
\def SURROGATE_LOW_START
\brief The minimum codepoint for the low member of a surrogate pair.
*/
#define SURROGATE_LOW_START 0xDC00
/*!
\def SURROGATE_LOW_END
\brief The maximum codepoint for the low member of a surrogate pair.
*/
#define SURROGATE_LOW_END 0xDFFF
/*!
\def HANGUL_JAMO_FIRST
\brief The first codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_FIRST 0x1100
/*!
\def HANGUL_JAMO_LAST
\brief The last codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_LAST 0x11FF
/*!
\def HANGUL_L_FIRST
\brief The first codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_FIRST 0x1100
/*!
\def HANGUL_L_LAST
\brief The last codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_LAST 0x1112
/*!
\def HANGUL_L_COUNT
\brief The number of codepoints in the Hangul Jamo L section.
*/
#define HANGUL_L_COUNT 19
/*!
\def HANGUL_V_FIRST
\brief The first codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_FIRST 0x1161
/*!
\def HANGUL_V_LAST
\brief The last codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_LAST 0x1175
/*!
\def HANGUL_V_COUNT
\brief The number of codepoints in the Hangul Jamo V section.
*/
#define HANGUL_V_COUNT 21
/*!
\def HANGUL_T_FIRST
\brief The first codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_FIRST 0x11A7
/*!
\def HANGUL_T_LAST
\brief The last codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_LAST 0x11C2
/*!
\def HANGUL_T_COUNT
\brief The number of codepoints in the Hangul Jamo T section.
*/
#define HANGUL_T_COUNT 28
/*!
\def HANGUL_N_COUNT
\brief Number of codepoints part of the Hangul Jamo V and T sections.
*/
#define HANGUL_N_COUNT 588 /* VCount * TCount */
/*!
\def HANGUL_S_FIRST
\brief The first codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_FIRST 0xAC00
/*!
\def HANGUL_S_LAST
\brief The last codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_LAST 0xD7A3
/*!
\def HANGUL_S_COUNT
\brief The number of codepoints in the Hangul Syllables block.
*/
#define HANGUL_S_COUNT 11172 /* LCount * NCount */
/*
Named codepoints.
Each macro holds the Unicode codepoint value spelled out by its name
(Latin I/J variants, combining marks and Greek capital sigma).
*/
#define CP_LATIN_CAPITAL_LETTER_I 0x0049
#define CP_LATIN_CAPITAL_LETTER_J 0x004A
#define CP_LATIN_SMALL_LETTER_I 0x0069
#define CP_LATIN_SMALL_LETTER_J 0x006A
#define CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC
#define CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD
#define CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128
#define CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E
#define CP_LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F
#define CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130
#define CP_LATIN_SMALL_LETTER_DOTLESS_I 0x0131
#define CP_COMBINING_GRAVE_ACCENT 0x0300
#define CP_COMBINING_ACUTE_ACCENT 0x0301
#define CP_COMBINING_TILDE_ACCENT 0x0303
#define CP_COMBINING_DOT_ABOVE 0x0307
#define CP_COMBINING_GREEK_YPOGEGRAMMENI 0x0345
#define CP_COMBINING_GRAPHEME_JOINER 0x034F
#define CP_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
/*
Canonical combining class (CCC) values.
CCC_NOT_REORDERED (0) identifies a starter; the composition and decomposition
code compares against it to find the boundaries of a sequence.
CCC_FIXED_POSITION_START and CCC_FIXED_POSITION_END delimit a range of classes.
NOTE(review): CCC_INVALID (255) looks like a sentinel rather than a class
assigned to real codepoints - confirm against its users.
*/
#define CCC_NOT_REORDERED 0
#define CCC_OVERLAY 1
#define CCC_NUKTA 7
#define CCC_KANA_VOICING 8
#define CCC_VIRAMA 9
#define CCC_FIXED_POSITION_START 10
#define CCC_FIXED_POSITION_END 199
#define CCC_ATTACHED_BELOW_LEFT 200
#define CCC_ATTACHED_BELOW 202
#define CCC_ATTACHED_BOTTOM_RIGHT 204
#define CCC_ATTACHED_LEFT 208
#define CCC_ATTACHED_RIGHT 210
#define CCC_ATTACHED_TOP_LEFT 212
#define CCC_ATTACHED_ABOVE 214
#define CCC_ATTACHED_ABOVE_RIGHT 216
#define CCC_BELOW_LEFT 218
#define CCC_BELOW 220
#define CCC_BELOW_RIGHT 222
#define CCC_LEFT 224
#define CCC_RIGHT 226
#define CCC_ABOVE_LEFT 228
#define CCC_ABOVE 230
#define CCC_ABOVE_RIGHT 232
#define CCC_DOUBLE_BELOW 233
#define CCC_DOUBLE_ABOVE 234
#define CCC_IOTA_SUBSCRIPT 240
#define CCC_INVALID 255
/*!
\brief Lookup table giving the number of bytes used for encoding a code point.
The table has 256 entries and is indexed by the value of an encoded byte,
normally the first byte of a sequence.
As used by the seeking code: 1 marks a Basic Latin byte, values greater than 1
and less than 7 mark the lead byte of a multi-byte sequence, 0 marks a
continuation byte and 7 marks an illegal byte.
NOTE(review): the value meanings above are taken from how the seeking functions
interpret the table - confirm against the table definition.
*/
extern const uint8_t codepoint_decoded_length[256];
/*!
\brief Write Unicode code point to UTF-8 encoded string.
Target buffer and size is modified by encoded size.
\param[in] encoded Unicode code point
\param[in,out] target Target buffer
\param[in,out] targetSize Size of output buffer in bytes
\return Bytes needed for encoding or 0 on error.
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize);
/*!
\brief Read Unicode code point from UTF-8 encoded string.
\param[in] input Input buffer
\param[in] inputSize Size of input buffer in bytes
\param[out] decoded Unicode codepoint
\return Bytes read from string or 0 on error.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded);
/*!
\}
*/
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_CODEPOINT_H_ */

View File

@ -0,0 +1,336 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "composition.h"
#include "codepoint.h"
#include "database.h"
/*!
	\brief Prepare a composition state for use.

	The state is zeroed first, so it stays cleared when the call fails.
	On success the output stream is zeroed as well and the quick check
	tables are selected: NFKC when \p compatibility is 1, NFC otherwise.

	\param[out]     state          Composition state to set up.
	\param[in]      input          Stream to read codepoints from.
	\param[in,out]  output         Stream that receives composed codepoints.
	\param[in]      compatibility  1 selects the NFKC tables, any other value NFC.

	\return 1 on success, 0 if either stream is missing.
*/
uint8_t compose_initialize(ComposeState* state, StreamState* input, StreamState* output, uint8_t compatibility)
{
	const uint8_t use_compatibility = (compatibility == 1);

	memset(state, 0, sizeof(*state));

	/* Both streams are required */

	if (!input || !output)
	{
		return 0;
	}

	/* Start from an empty output stream */

	memset(output, 0, sizeof(*output));

	state->input = input;
	state->output = output;

	/* Pick the quick check property tables for the requested form */

	state->qc_index = use_compatibility ? QuickCheckNFKCIndexPtr : QuickCheckNFCIndexPtr;
	state->qc_data = use_compatibility ? QuickCheckNFKCDataPtr : QuickCheckNFCDataPtr;

	return 1;
}
/*!
	\brief Move the next codepoint from the input stream to the output stream.

	When every codepoint of the current input sequence has been consumed,
	a new sequence is requested through stream_read() first. The codepoint
	is stored together with its quick check value and canonical combining
	class, and the output length is increased by one.

	\param[in,out]  state  Composition state.
	\param[in]      index  Slot in the output stream to write to.

	\return 1 if a codepoint was stored, 0 if the input ran out of data.
*/
uint8_t compose_readcodepoint(ComposeState* state, uint8_t index)
{
	StreamState* source = state->input;
	StreamState* target = state->output;

	/* Refill the input once the current sequence is used up */

	if (source->index == source->current)
	{
		if (!stream_read(source, state->qc_index, state->qc_data))
		{
			/* End of data */

			return 0;
		}
	}

	/* Copy all properties of the codepoint under the input cursor */

	target->codepoint[index] = source->codepoint[source->index];
	target->quick_check[index] = source->quick_check[source->index];
	target->canonical_combining_class[index] = source->canonical_combining_class[source->index];

	/* Advance both streams */

	source->index++;
	target->current++;

	return 1;
}
/*!
\brief Compose codepoints from the input stream into the output stream.
Codepoints are pulled in one at a time through compose_readcodepoint(). For
every starter in the output, the codepoints following it are tried as
composition partners: Hangul L+V and LV+T pairs are composed arithmetically,
everything else is looked up with database_querycomposition(). A codepoint that
was merged into its starter is replaced by a NUL cell, and those cells are
squeezed out again after each starter has been processed.
When the output holds at most one codepoint after a starter was processed,
state->input is cleared, which makes the next call return 0.
\param[in,out] state Composition state set up by compose_initialize().
\return 1 if the output stream was filled, 0 if no input is attached or no
codepoint could be read.
*/
uint8_t compose_execute(ComposeState* state)
{
uint8_t output_index;
uint8_t cursor_current;
uint8_t cursor_next;
/* Check if input is available */
if (state->input == 0)
{
return 0;
}
/* Reset output */
state->output->current = 0;
/* Read first codepoint */
if (!compose_readcodepoint(state, 0))
{
return 0;
}
for (output_index = 0; output_index < state->output->current; ++output_index)
{
/* Ensure current codepoint is a starter */
cursor_current = output_index;
while (state->output->canonical_combining_class[cursor_current] != CCC_NOT_REORDERED)
{
cursor_current++;
if (cursor_current == state->output->current &&
!compose_readcodepoint(state, cursor_current))
{
/* Only non-starters left */
return 1;
}
}
/* Get next codepoint */
cursor_next = cursor_current + 1;
/* Codepoints already in the output are re-used; new ones are only read when the cursor runs past the end */
while (
cursor_next < state->output->current ||
compose_readcodepoint(state, cursor_next))
{
/*
Two codepoints can be composed if the current codepoint is a starter
and the next codepoint isn't blocked by a previous codepoint.
*/
if (state->output->canonical_combining_class[cursor_next] > state->output->canonical_combining_class[cursor_next - 1] || /* Can be composed based on CCC */
/* Quick check value can override composition block by previous codepoint */
(state->output->quick_check[cursor_next] != QuickCheckResult_Yes && state->output->canonical_combining_class[cursor_next - 1] == CCC_NOT_REORDERED))
{
unicode_t composed = 0;
/*
Hangul composition
Algorithm adapted from Unicode Technical Report #15:
http://www.unicode.org/reports/tr15/tr15-18.html#Hangul
*/
if (state->output->codepoint[cursor_current] >= HANGUL_L_FIRST &&
state->output->codepoint[cursor_current] <= HANGUL_L_LAST)
{
/* Check for Hangul LV pair */
if (state->output->codepoint[cursor_next] >= HANGUL_V_FIRST &&
state->output->codepoint[cursor_next] <= HANGUL_V_LAST)
{
unicode_t l_index = state->output->codepoint[cursor_current] - HANGUL_L_FIRST;
unicode_t v_index = state->output->codepoint[cursor_next] - HANGUL_V_FIRST;
composed = HANGUL_S_FIRST + (((l_index * HANGUL_V_COUNT) + v_index) * HANGUL_T_COUNT);
}
}
else if (
state->output->codepoint[cursor_current] >= HANGUL_S_FIRST &&
state->output->codepoint[cursor_current] <= HANGUL_S_LAST)
{
/* Check for Hangul LV and T pair */
if (state->output->codepoint[cursor_next] >= HANGUL_T_FIRST &&
state->output->codepoint[cursor_next] <= HANGUL_T_LAST)
{
unicode_t t_index = state->output->codepoint[cursor_next] - HANGUL_T_FIRST;
composed = state->output->codepoint[cursor_current] + t_index;
}
}
else
{
/* Attempt to compose codepoints using the database */
composed = database_querycomposition(
state->output->codepoint[cursor_current],
state->output->codepoint[cursor_next]);
}
/* Check if composition succeeded */
if (composed != 0)
{
/*
When we successfully compose two codepoints, the second must be removed
from the sequence. The way this is accomplished is by marking the cell
empty with a NUL codepoint.
Decomposed:
codepoint U+0044 U+0307 U+0031
index 0 1 2
Composed:
codepoint U+1E0A U+0000 U+0031
index 0 1 2
If the second codepoint was at the end of the sequence, the output
sequence is shortened by one.
*/
/* Add composition to output */
state->output->codepoint[cursor_current] = composed;
state->output->quick_check[cursor_current] = PROPERTY_GET(state->qc_index, state->qc_data, composed);
state->output->canonical_combining_class[cursor_current] = PROPERTY_GET_CCC(composed);
/* Clear next codepoint from output */
state->output->codepoint[cursor_next] = 0;
state->output->quick_check[cursor_next] = QuickCheckResult_Yes;
state->output->canonical_combining_class[cursor_next] = CCC_NOT_REORDERED;
if (cursor_next == state->output->current - 1)
{
/* Next codepoint was at end of output */
state->output->current--;
}
/* Reset cursor to current output index */
/* The increment at the bottom of the loop moves cursor_next to output_index + 1 */
cursor_current = output_index;
cursor_next = output_index;
}
}
else if (
state->output->canonical_combining_class[cursor_next] == CCC_NOT_REORDERED)
{
/* Attempt to compose starters, but do not read from the next sequence */
break;
}
/* Evaluate next codepoint */
cursor_next++;
}
/* Fill up "holes" left by composing codepoints not at the end of the sequence */
if (state->output->current > 1)
{
uint8_t write_index = 0;
uint8_t read_index = 1;
/*
We want to move valid codepoints to the left as much as possible in order to fill up
holes left by the composition process.
Note that the process does not clear unused codepoints at the end, this is a small
optimization in order to avoid unnecessary clears. The length member is adjusted to
the new size.
Before reordering:
codepoint A B 0 0 0 D
index 0 1 2 3 4 5
length 6
After reordering:
codepoint A B D 0 0 D
index 0 1 2 3 4 5
length 3
*/
/*
NOTE(review): write_index trails read_index by one and a cell is only copied
directly after a run of empty cells was skipped. Traced by hand on the layout
in the diagram above, D appears to be written over B and the length is not
reduced - confirm whether holes in the middle of the output can occur here.
*/
/* Evaluate all codepoints in output sequence */
while (write_index < state->output->current)
{
/* Check if read cursor is on an empty cell */
if (read_index < state->output->current &&
state->output->codepoint[read_index] == 0)
{
/* Skip all empty cells */
while (
read_index < state->output->current &&
state->output->codepoint[read_index] == 0)
{
read_index++;
}
if (read_index == state->output->current)
{
/* Reached end of data */
break;
}
/* Copy cell at read cursor to write cursor */
state->output->codepoint[write_index] = state->output->codepoint[read_index];
state->output->quick_check[write_index] = state->output->quick_check[read_index];
state->output->canonical_combining_class[write_index] = state->output->canonical_combining_class[read_index];
}
/* Move cursors */
write_index++;
read_index++;
}
/* Adjust length of output sequence */
state->output->current = write_index;
}
else
{
/* Evaluated all sequences in output */
/* Detaching the input makes the next call return 0 */
state->input = 0;
break;
}
}
return 1;
}

View File

@ -0,0 +1,54 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_COMPOSITION_H_
#define _UTF8REWIND_INTERNAL_COMPOSITION_H_
/*!
\file
\brief Composition interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "streaming.h"
/*!
\brief State shared by the composition functions.
All members are filled in by compose_initialize().
*/
typedef struct {
/* Stream codepoints are read from; cleared by compose_execute() once it is done */
StreamState* input;
/* Stream that receives the composed codepoints */
StreamState* output;
/* Index table of the quick check property (NFC or NFKC) */
const size_t* qc_index;
/* Data table of the quick check property (NFC or NFKC) */
const uint8_t* qc_data;
} ComposeState;
/*!
\brief Zero the state and the output stream, attach both streams and select the quick check tables.
\param[in] compatibility 1 selects the NFKC tables, any other value the NFC tables.
\return 1 on success, 0 if input or output is missing.
*/
uint8_t compose_initialize(ComposeState* state, StreamState* input, StreamState* output, uint8_t compatibility);
/*!
\brief Copy the next input codepoint and its properties to slot \p index of the output stream.
\return 1 if a codepoint was copied, 0 at the end of the data.
*/
uint8_t compose_readcodepoint(ComposeState* state, uint8_t index);
/*!
\brief Compose codepoints from the input stream into the output stream.
\return 1 if the output stream was filled, 0 if no input is attached or nothing could be read.
*/
uint8_t compose_execute(ComposeState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_COMPOSITION_H_ */

View File

@ -0,0 +1,113 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "database.h"
#include "../unicodedatabase.h"
#include "codepoint.h"
/*
Bit positions used by database_querydecomposition() to split a codepoint into
three lookup stages: bits 12 and up select the first-stage entry, bits 5-11 the
second-stage entry and bits 0-4 the position inside the data block.
*/
#define DECOMPOSE_INDEX1_SHIFT (12)
#define DECOMPOSE_INDEX2_SHIFT (5)
/* NOTE(review): not referenced by the functions visible in this file - confirm it is still needed */
static const unicode_t DECOMPOSE_INDEX1_MASK = MAX_LEGAL_UNICODE;
/* Keeps the low 12 bits, i.e. everything below the first-stage index */
static const unicode_t DECOMPOSE_INDEX2_MASK = (1 << DECOMPOSE_INDEX1_SHIFT) - 1;
/* Keeps the low 5 bits, i.e. the offset inside a data block */
static const unicode_t DECOMPOSE_DATA_MASK = (1 << DECOMPOSE_INDEX2_SHIFT) - 1;
/*!
	\brief Look up the decomposition of a codepoint.

	The codepoint is resolved through a three-stage table: the bits above
	DECOMPOSE_INDEX1_SHIFT select an entry in \p index1Array, the bits
	between both shifts select an entry in \p index2Array and the lowest
	bits are added to form the index into \p dataArray. A data record
	stores the length of the decomposition in its top 8 bits and the
	offset into CompressedStringData in its low 24 bits.

	\param[in]   codepoint    Codepoint to look up.
	\param[in]   index1Array  First-stage index table.
	\param[in]   index2Array  Second-stage index table.
	\param[in]   dataArray    Table with data records.
	\param[out]  length       Length of the returned string, 0 if nothing was found.

	\return Pointer into CompressedStringData, or 0 if the codepoint has no
	record.
*/
const char* database_querydecomposition(unicode_t codepoint, const uint32_t* index1Array, const uint32_t* index2Array, const uint32_t* dataArray, uint8_t* length)
{
	const uint32_t stage1 = index1Array[codepoint >> DECOMPOSE_INDEX1_SHIFT];
	const uint32_t stage2 = index2Array[stage1 + ((codepoint & DECOMPOSE_INDEX2_MASK) >> DECOMPOSE_INDEX2_SHIFT)];
	const uint32_t record_index = stage2 + (codepoint & DECOMPOSE_DATA_MASK);
	uint32_t record = 0;

	/* Index zero means "no record"; the data table is not consulted for it */

	if (record_index != 0)
	{
		record = dataArray[record_index];
	}

	if (record == 0)
	{
		*length = 0;

		return 0;
	}

	/* Top byte is the length, the remaining 24 bits are the string offset */

	*length = (uint8_t)(record >> 24);

	return CompressedStringData + (record & 0x00FFFFFF);
}
/*!
	\brief Look up the codepoint two codepoints compose to.

	Both codepoints are packed into a 64-bit key (left in the upper half,
	right in the lower half) and searched for in the composition records.
	The window is halved until it spans at most 32 records and the
	remainder is scanned linearly.

	\param[in]  left   First codepoint of the pair.
	\param[in]  right  Second codepoint of the pair.

	\return The composed codepoint, or 0 if the pair does not compose.
*/
unicode_t database_querycomposition(unicode_t left, unicode_t right)
{
	const uint64_t wanted = ((uint64_t)left << 32) + (uint64_t)right;
	size_t low = 0;
	size_t high = UnicodeCompositionRecordCount - 1;
	size_t scan;

	/* Keys outside of the table's range cannot match */

	if (wanted < UnicodeCompositionRecordPtr[low].key ||
		wanted > UnicodeCompositionRecordPtr[high].key)
	{
		return 0;
	}

	for (;;)
	{
		const size_t middle = low + ((high - low) / 2);

		/* Both edges and the middle of the window are tested on every pass */

		if (wanted == UnicodeCompositionRecordPtr[low].key)
		{
			return UnicodeCompositionRecordPtr[low].value;
		}

		if (wanted == UnicodeCompositionRecordPtr[high].key)
		{
			return UnicodeCompositionRecordPtr[high].value;
		}

		if (wanted == UnicodeCompositionRecordPtr[middle].key)
		{
			return UnicodeCompositionRecordPtr[middle].value;
		}

		/* Keep the half that can still contain the key */

		if (wanted > UnicodeCompositionRecordPtr[middle].key)
		{
			low = middle;
		}
		else
		{
			high = middle;
		}

		if (high - low <= 32)
		{
			break;
		}
	}

	/* Small window left, check every record in it */

	for (scan = low; scan <= high; ++scan)
	{
		if (wanted == UnicodeCompositionRecordPtr[scan].key)
		{
			return UnicodeCompositionRecordPtr[scan].value;
		}
	}

	return 0;
}

View File

@ -0,0 +1,91 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_DATABASE_H_
#define _UTF8REWIND_INTERNAL_DATABASE_H_
/*!
\file
\brief Database interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "../unicodedatabase.h"
/*!
\brief Case mapping flags; each enumerator occupies its own bit so values can be combined.
NOTE(review): presumably these are the values stored in the table read by
PROPERTY_GET_CM - confirm against the table generator.
*/
enum QuickCheckCaseMapped
{
QuickCheckCaseMapped_Uppercase = 0x01,
QuickCheckCaseMapped_Lowercase = 0x02,
QuickCheckCaseMapped_Titlecase = 0x04,
QuickCheckCaseMapped_Casefolded = 0x08,
};
/*!
\brief Result of a normalization quick check for a codepoint.
The composition and decomposition code treats QuickCheckResult_Yes as
"no further work needed" and any other value as a reason to attempt
composition or to look up a decomposition.
*/
enum QuickCheckResult
{
QuickCheckResult_Yes,
QuickCheckResult_Maybe,
QuickCheckResult_No,
};
/* Number of low bits of a codepoint that address a value inside a data block */
#define PROPERTY_INDEX_SHIFT (5)
/* Keeps the low PROPERTY_INDEX_SHIFT bits of a codepoint */
static const unicode_t PROPERTY_DATA_MASK = (1 << PROPERTY_INDEX_SHIFT) - 1;
/*
Two-stage property lookup: the codepoint shifted right by PROPERTY_INDEX_SHIFT
selects a block offset in the index array, and the low bits of the codepoint
are added to that offset to select the value in the data array.
Note that _cp is evaluated twice.
*/
#define PROPERTY_GET(_indexArray, _dataArray, _cp) \
(_dataArray)[ \
(_indexArray)[(_cp) >> PROPERTY_INDEX_SHIFT] + \
((_cp) & PROPERTY_DATA_MASK)]
/* Lookup in the general category tables */
#define PROPERTY_GET_GC(_cp) \
PROPERTY_GET(GeneralCategoryIndexPtr, GeneralCategoryDataPtr, _cp)
/* Lookup in the canonical combining class tables */
#define PROPERTY_GET_CCC(_cp) \
PROPERTY_GET(CanonicalCombiningClassIndexPtr, CanonicalCombiningClassDataPtr, _cp)
/* Lookup in the case mapping quick check tables */
#define PROPERTY_GET_CM(_cp) \
PROPERTY_GET(QuickCheckCaseMappedIndexPtr, QuickCheckCaseMappedDataPtr, _cp)
/* Lookup in the NFC quick check tables */
#define PROPERTY_GET_NFC(_cp) \
PROPERTY_GET(QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr, _cp)
/* Lookup in the NFD quick check tables */
#define PROPERTY_GET_NFD(_cp) \
PROPERTY_GET(QuickCheckNFDIndexPtr, QuickCheckNFDDataPtr, _cp)
/* Lookup in the NFKC quick check tables */
#define PROPERTY_GET_NFKC(_cp) \
PROPERTY_GET(QuickCheckNFKCIndexPtr, QuickCheckNFKCDataPtr, _cp)
/* Lookup in the NFKD quick check tables */
#define PROPERTY_GET_NFKD(_cp) \
PROPERTY_GET(QuickCheckNFKDIndexPtr, QuickCheckNFKDDataPtr, _cp)
/*!
\brief Look up the decomposition of a codepoint in a three-stage table.
\param[in] codepoint Codepoint to look up
\param[in] index1Array First-stage index table
\param[in] index2Array Second-stage index table
\param[in] dataArray Table with data records
\param[out] length Length of the returned string, 0 if nothing was found
\return Pointer into the compressed string data or 0 if the codepoint has no record.
*/
const char* database_querydecomposition(unicode_t codepoint, const uint32_t* index1Array, const uint32_t* index2Array, const uint32_t* dataArray, uint8_t* length);
/*!
\brief Look up the codepoint a pair of codepoints composes to.
\param[in] left First codepoint of the pair
\param[in] right Second codepoint of the pair
\return Composed codepoint or 0 if the pair does not compose.
*/
unicode_t database_querycomposition(unicode_t left, unicode_t right);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_DATABASE_H_ */

View File

@ -0,0 +1,339 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "decomposition.h"
#include "codepoint.h"
#include "database.h"
/*!
	\brief Prepare a decomposition state for use.

	The state is zeroed first, so it stays cleared when the call fails.
	On success the output stream is zeroed as well and the decomposition
	and quick check tables are selected: NFKD when \p compatibility is 1,
	NFD otherwise.

	\param[out]     state          Decomposition state to set up.
	\param[in]      input          Stream to read codepoints from.
	\param[in,out]  output         Stream that receives decomposed codepoints.
	\param[in]      compatibility  1 selects the NFKD tables, any other value NFD.

	\return 1 on success, 0 if either stream is missing.
*/
uint8_t decompose_initialize(
	DecomposeState* state,
	StreamState* input, StreamState* output,
	uint8_t compatibility)
{
	memset(state, 0, sizeof(*state));

	/* Both streams are required */

	if (!input || !output)
	{
		return 0;
	}

	/* Start from an empty output stream */

	memset(output, 0, sizeof(*output));

	state->input = input;
	state->output = output;

	/* Select the tables for the requested normalization form */

	if (compatibility != 1)
	{
		/* Canonical decomposition */

		state->property_index1 = NFDIndex1Ptr;
		state->property_index2 = NFDIndex2Ptr;
		state->property_data = NFDDataPtr;

		state->qc_index = QuickCheckNFDIndexPtr;
		state->qc_data = QuickCheckNFDDataPtr;
	}
	else
	{
		/* Compatibility decomposition */

		state->property_index1 = NFKDIndex1Ptr;
		state->property_index2 = NFKDIndex2Ptr;
		state->property_data = NFKDDataPtr;

		state->qc_index = QuickCheckNFKDIndexPtr;
		state->qc_data = QuickCheckNFKDDataPtr;
	}

	return 1;
}
/*!
\brief Decompose the next sequence from the input stream into the output stream.
Codepoints left in the cache by a previous call are emitted first. After that
the next input sequence is read and every codepoint is either copied (Basic
Latin and codepoints whose quick check value is Yes), split arithmetically
(Hangul syllables) or replaced by its decomposition from the database.
While the local flag "uncached" is set, results go straight to the output;
once it has been cleared, all further results of this call are stored in the
cache and emitted by later calls.
The output's "stable" flag is cleared when the canonical combining classes of
the output are not in non-decreasing order.
\param[in,out] state Decomposition state set up by decompose_initialize().
\return Number of codepoints in the output stream, or 0 if no input is attached
or the input has run out of data.
*/
uint8_t decompose_execute(DecomposeState* state)
{
unicode_t* src_codepoint;
unicode_t* dst_codepoint;
uint8_t* dst_canonical_combining_class;
uint8_t* dst_quick_check;
/* 1 while results are written to the output, 0 once they are diverted to the cache */
uint8_t uncached = 1;
/* Check if input is valid */
if (state->input == 0)
{
return 0;
}
/* Set up output */
state->output->current = 0;
state->output->index = 0;
state->output->stable = 1;
dst_codepoint = state->output->codepoint;
dst_canonical_combining_class = state->output->canonical_combining_class;
dst_quick_check = state->output->quick_check;
/* Check cache for stored sequences */
if (state->cache_current < state->cache_filled)
{
/* Read from cache */
while (state->cache_current < state->cache_filled)
{
if (state->output->current > 0 &&
state->cache_canonical_combining_class[state->cache_current] == CCC_NOT_REORDERED)
{
/* Sequence ends on next starter or end of data */
break;
}
*dst_codepoint++ = state->cache_codepoint[state->cache_current];
*dst_canonical_combining_class++ = state->cache_canonical_combining_class[state->cache_current];
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
state->cache_current++;
}
/* Check if cache has been emptied */
if (state->cache_current == state->cache_filled)
{
state->cache_current = 0;
state->cache_filled = 0;
}
/* Check for additional input */
if (state->input->index == state->input->current)
{
/* Don't compare canonical combining classes, output will always be stable */
return state->output->current;
}
}
/* Read next sequence from input */
if (state->input->index == state->input->current &&
!stream_read(state->input, state->qc_index, state->qc_data))
{
/* End of data */
/* Detaching the input makes every later call return 0 */
state->input = 0;
return 0;
}
/* Read from source */
src_codepoint = state->input->codepoint + state->input->index;
while (state->input->index < state->input->current)
{
if (*src_codepoint <= MAX_BASIC_LATIN)
{
/* Basic Latin codepoints are already decomposed */
if (uncached)
{
*dst_codepoint++ = *src_codepoint;
*dst_canonical_combining_class++ = CCC_NOT_REORDERED;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
state->cache_codepoint[state->cache_filled] = *src_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
}
else if (
*src_codepoint >= HANGUL_S_FIRST &&
*src_codepoint <= HANGUL_S_LAST)
{
/*
Hangul decomposition
Algorithm adapted from Unicode Technical Report #15:
http://www.unicode.org/reports/tr15/tr15-18.html#Hangul
*/
unicode_t s_index = *src_codepoint - HANGUL_S_FIRST;
/* Leading consonant (L) goes to the output unless caching has already started */
if (uncached)
{
*dst_codepoint++ = HANGUL_L_FIRST + (s_index / HANGUL_N_COUNT);
*dst_canonical_combining_class++ = CCC_NOT_REORDERED;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
state->cache_codepoint[state->cache_filled] = HANGUL_L_FIRST + (s_index / HANGUL_N_COUNT);
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
/* Store the vowel (V), the optional trailing consonant (T) and everything after them in the cache */
uncached = 0;
state->cache_codepoint[state->cache_filled] = HANGUL_V_FIRST + (s_index % HANGUL_N_COUNT) / HANGUL_T_COUNT;
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
/* A T index of zero means the syllable has no trailing consonant */
if ((s_index % HANGUL_T_COUNT) != 0)
{
state->cache_codepoint[state->cache_filled] = HANGUL_T_FIRST + (s_index % HANGUL_T_COUNT);
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
}
else
{
/* Use quick check to skip stable codepoints */
unicode_t decoded_codepoint = *src_codepoint;
uint8_t decoded_quick_check = PROPERTY_GET(state->qc_index, state->qc_data, decoded_codepoint);
uint8_t decoded_canonical_combining_class;
uint8_t decoded_size;
if (decoded_quick_check != QuickCheckResult_Yes)
{
/* Check database for decomposition */
/* NOTE(review): if the database returns a length of 0 the codepoint produces no output at all - confirm the tables rule this out */
uint8_t src_size;
const char* src = database_querydecomposition(
decoded_codepoint,
state->property_index1, state->property_index2, state->property_data,
&src_size);
while (src_size > 0)
{
/* Decode current codepoint */
decoded_size = codepoint_read(src, src_size, &decoded_codepoint);
if (decoded_size == 0)
{
break;
}
decoded_canonical_combining_class = PROPERTY_GET_CCC(decoded_codepoint);
/* Check for end of sequence */
/* A starter after the first output codepoint begins a new sequence, so caching starts here */
if (uncached &&
state->output->current > 0 &&
decoded_canonical_combining_class == CCC_NOT_REORDERED)
{
uncached = 0;
}
if (uncached)
{
/* Write codepoint to output */
*dst_codepoint++ = decoded_codepoint;
*dst_canonical_combining_class++ = decoded_canonical_combining_class;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
/* Store in cache */
state->cache_codepoint[state->cache_filled] = decoded_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = decoded_canonical_combining_class;
state->cache_filled++;
}
src += decoded_size;
src_size -= decoded_size;
}
}
else
{
decoded_canonical_combining_class = PROPERTY_GET_CCC(decoded_codepoint);
if (uncached)
{
/* Write codepoint to output */
*dst_codepoint++ = decoded_codepoint;
*dst_canonical_combining_class++ = decoded_canonical_combining_class;
*dst_quick_check++ = decoded_quick_check;
state->output->current++;
}
else
{
/* Store in cache */
state->cache_codepoint[state->cache_filled] = decoded_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = decoded_canonical_combining_class;
state->cache_filled++;
}
}
}
src_codepoint++;
state->input->index++;
}
if (state->output->current > 1)
{
/* Check if output is stable by comparing canonical combining classes */
uint8_t i;
for (i = 1; i < state->output->current; ++i)
{
if (state->output->canonical_combining_class[i] < state->output->canonical_combining_class[i - 1])
{
state->output->stable = 0;
break;
}
}
}
return state->output->current;
}

View File

@ -0,0 +1,59 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
#define _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
/*!
\file
\brief Decomposition interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "streaming.h"
/*!
\brief State shared by the decomposition functions.
The stream and table members are filled in by decompose_initialize(); the cache
members are maintained by decompose_execute().
*/
typedef struct {
/* Stream codepoints are read from; cleared by decompose_execute() at the end of the data */
StreamState* input;
/* Stream that receives the decomposed codepoints */
StreamState* output;
/* Index table of the quick check property (NFD or NFKD) */
const size_t* qc_index;
/* Data table of the quick check property (NFD or NFKD) */
const uint8_t* qc_data;
/* First-stage index table of the decomposition data */
const uint32_t* property_index1;
/* Second-stage index table of the decomposition data */
const uint32_t* property_index2;
/* Data records of the decomposition data */
const uint32_t* property_data;
/* Decomposed codepoints waiting to be emitted by a later call */
unicode_t cache_codepoint[STREAM_BUFFER_MAX];
/* Canonical combining class of every cached codepoint */
uint8_t cache_canonical_combining_class[STREAM_BUFFER_MAX];
/* Position of the next cached codepoint to emit */
uint8_t cache_current;
/* Number of codepoints stored in the cache */
uint8_t cache_filled;
} DecomposeState;
/*!
\brief Zero the state and the output stream, attach both streams and select the decomposition tables.
\param[in] compatibility 1 selects the NFKD tables, any other value the NFD tables.
\return 1 on success, 0 if input or output is missing.
*/
uint8_t decompose_initialize(DecomposeState* state, StreamState* input, StreamState* output, uint8_t compatibility);
/*!
\brief Decompose the next sequence from the input stream into the output stream.
\return Number of codepoints in the output stream, or 0 if no input is attached or the data has run out.
*/
uint8_t decompose_execute(DecomposeState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_DECOMPOSITION_H_ */

View File

@ -0,0 +1,187 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "seeking.h"
#include "codepoint.h"
const char* seeking_forward(const char* input, const char* inputEnd, size_t inputSize, off_t offset)
{
if (inputEnd <= input || /* Swapped parameters */
offset <= 0 || /* Invalid offset */
inputSize == 0) /* Nothing to do */
{
return input;
}
else if (
offset >= (off_t)inputSize) /* Out of bounds */
{
return inputEnd;
}
do
{
/* Get decoded length of next sequence */
uint8_t codepoint_length = codepoint_decoded_length[(uint8_t)*input];
if (codepoint_length > 1 &&
codepoint_length < 7)
{
/* Check all bytes of multi-byte sequence */
uint8_t i;
for (i = 0; i < codepoint_length; ++i)
{
/* Next byte of sequence */
input++;
if (input == inputEnd || /* End of data */
codepoint_decoded_length[(uint8_t)*input] != 0) /* Not a continuation byte */
{
break;
}
}
}
else
{
/* Skip to next sequence */
input++;
}
}
while (input < inputEnd &&
--offset > 0);
return input;
}
/*
Step a cursor backward through UTF-8 encoded data.

inputStart  Lowest address the cursor may reach.
input       Current cursor position; must lie after inputStart.
inputSize   Byte count, compared against -offset for the out-of-bounds shortcut.
offset      Negative step count; it is incremented once per loop pass until it reaches zero.

Returns input unchanged when inputStart >= input or offset >= 0, inputStart
when -offset >= inputSize, and otherwise the position the cursor ended on.
*/
const char* seeking_rewind(const char* inputStart, const char* input, size_t inputSize, off_t offset)
{
/* marker scans backward for the byte that starts the sequence under the cursor */
const char* marker;
/* marker_valid is the last byte that sequence may cover, given the length declared by its first byte */
const char* marker_valid;
if (inputStart >= input || /* Swapped parameters */
offset >= 0) /* Invalid offset */
{
return input;
}
else if (
-offset >= (off_t)inputSize) /* Out of bounds */
{
return inputStart;
}
/* Set up the marker */
marker = input - 1;
marker_valid = marker;
do
{
/* Move the cursor */
input--;
/* Move the marker until we encounter a valid sequence */
/* Only entered when the marker was reset onto the cursor's byte, i.e. a new sequence has to be located */
while (marker_valid == input)
{
uint8_t codepoint_length = codepoint_decoded_length[(uint8_t)*marker];
if (codepoint_length == 1 || /* Basic Latin */
codepoint_length == 7) /* Illegal byte */
{
/* Single-byte step: the sequence is just the byte at the marker */
marker_valid = marker;
break;
}
else if (
codepoint_length > 1)
{
/* First byte of a multi-byte sequence found */
if (marker == inputStart &&
/* Not overlong */
marker_valid - inputStart == codepoint_length - 1)
{
/* Last sequence */
return marker;
}
else
{
/* Multi-byte sequence */
/* Record how far the declared length reaches from the marker */
marker_valid = marker + codepoint_length - 1;
break;
}
}
else if (
marker <= inputStart)
{
/* Continuation bytes only */
/* Reached the start of the data without finding a first byte */
marker_valid = marker;
break;
}
else
{
/* Move marker to next byte */
/* Table value 0 (continuation byte): keep scanning toward inputStart */
marker--;
}
}
/* Read the next part of a sequence */
/* The cursor lies within the span declared by the sequence found at the marker */
if (input <= marker_valid)
{
if (marker == inputStart)
{
/* Last sequence */
return marker;
}
else
{
/* Move the cursor to the start of the sequence */
input = marker;
/* Reset the marker on the next byte */
marker--;
marker_valid = marker;
}
}
/* Otherwise the cursor stays on the byte it just stepped onto */
}
while (input >= inputStart &&
++offset < 0);
return input;
}

View File

@ -0,0 +1,44 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_SEEKING_H_
#define _UTF8REWIND_INTERNAL_SEEKING_H_
/*!
\file
\brief Seeking interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*!
\brief Move a cursor forward through UTF-8 encoded data.

\param[in] input      Current cursor position.
\param[in] inputEnd   End of the data; must lie after \p input.
\param[in] inputSize  Size of the data in bytes.
\param[in] offset     Positive number of sequences to skip.

\return \p input when the parameters are swapped, \p offset is not positive
or \p inputSize is zero; \p inputEnd when \p offset is at least \p inputSize;
otherwise the advanced cursor.
*/
const char* seeking_forward(const char* input, const char* inputEnd, size_t inputSize, off_t offset);
/*!
\brief Move a cursor backward through UTF-8 encoded data.

\param[in] inputStart Start of the data; must lie before \p input.
\param[in] input      Current cursor position.
\param[in] inputSize  Size of the data in bytes.
\param[in] offset     Negative number of steps to take.

\return \p input when the parameters are swapped or \p offset is not
negative; \p inputStart when -\p offset is at least \p inputSize; otherwise
the rewound cursor.
*/
const char* seeking_rewind(const char* inputStart, const char* input, size_t inputSize, off_t offset);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_SEEKING_H_ */

View File

@ -0,0 +1,236 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "streaming.h"
#include "codepoint.h"
#include "database.h"
/*
    Clear a stream state and attach it to an input buffer.

    state      State to reset; always zeroed, even on failure.
    input      Start of the input data.
    inputSize  Size of the input data in bytes.

    Returns 1 when the state was bound to the input and marked stable,
    0 when `input` is null or `inputSize` is zero.
*/
uint8_t stream_initialize(StreamState* state, const char* input, size_t inputSize)
{
    uint8_t usable = (uint8_t)(input != 0 && inputSize != 0);

    /* Start from a fully cleared state regardless of the outcome */
    memset(state, 0, sizeof(*state));

    if (usable)
    {
        state->src = input;
        state->src_size = inputSize;
        state->stable = 1;
    }

    return usable;
}
/*
Read the next sequence of codepoints from the input into the state buffer.

state          Stream state previously set up by stream_initialize().
propertyIndex  Index table handed to PROPERTY_GET for the quick-check lookup.
propertyData   Data table handed to PROPERTY_GET for the quick-check lookup.

Returns 0 when no input is left or either property table is null, and 1
after codepoints have been buffered. On return state->current counts the
codepoints that stream_write() and stream_reorder() will process, while
state->filled counts every populated slot, which can include one trailing
codepoint beyond the sequence (carried into slot 0 by the next call).
*/
uint8_t stream_read(StreamState* state, const size_t* propertyIndex, const uint8_t* propertyData)
{
/* Ensure input is available */
if (state->src_size == 0 ||
propertyIndex == 0 ||
propertyData == 0)
{
return 0;
}
/* Reset sequence after the first pass */
if (state->filled > 0)
{
/* Check for end of data */
/* Nothing is left over in the buffer and the remaining input was already consumed by the last read */
if (state->filled == state->current &&
state->src_size <= state->last_length)
{
state->src_size = 0;
state->index = 0;
state->current = 0;
state->filled = 0;
return 0;
}
/* Copy last peeked codepoint to new sequence */
state->codepoint[0] = state->codepoint[state->filled - 1];
state->canonical_combining_class[0] = state->canonical_combining_class[state->filled - 1];
state->quick_check[0] = state->quick_check[state->filled - 1];
/* New sequence always starts as stable */
state->stable = 1;
/* Reset buffer members */
state->index = 0;
state->current = 1;
state->filled = 1;
}
/* Read codepoints */
while (state->filled < STREAM_SAFE_MAX)
{
/* Move the input cursor after peeking */
/* last_length is what the previous codepoint_read() returned; the cursor is only advanced past it now */
if (state->last_length > 0)
{
if (state->src_size <= state->last_length)
{
/* The previous read reached the end of the input */
state->src += state->src_size;
state->src_size = 0;
break;
}
state->src += state->last_length;
state->src_size -= state->last_length;
}
/* Peek the next codepoint */
/* NOTE(review): codepoint_read() presumably returns the number of bytes it decoded - confirm in codepoint.h */
state->last_length = codepoint_read(state->src, state->src_size, &state->codepoint[state->filled]);
state->quick_check[state->filled] = PROPERTY_GET(propertyIndex, propertyData, state->codepoint[state->filled]);
state->canonical_combining_class[state->filled] = PROPERTY_GET_CCC(state->codepoint[state->filled]);
state->filled++;
/* The first codepoint of a sequence is accepted unconditionally */
if (state->current > 0)
{
/* Sequences end on the next starter and can consist of only non-starters */
if (state->canonical_combining_class[state->current] == 0)
{
/* current is left pointing at the starter, so it stays outside the sequence */
break;
}
/* Check if sequence is unstable by comparing canonical combining classes */
if (state->stable &&
state->canonical_combining_class[state->current] < state->canonical_combining_class[state->current - 1])
{
state->stable = 0;
}
}
state->current++;
}
/* The loop ran until the buffer held STREAM_SAFE_MAX codepoints */
if (state->filled == STREAM_SAFE_MAX)
{
/* Insert COMBINING GRAPHEME JOINER into output */
state->codepoint[state->filled] = CP_COMBINING_GRAPHEME_JOINER;
state->quick_check[state->filled] = QuickCheckResult_Yes;
state->canonical_combining_class[state->filled] = CCC_NOT_REORDERED;
state->filled++;
}
return 1;
}
/*
    Encode the codepoints of the current sequence as UTF-8.

    state         Stream state; entries [0, state->current) are written.
    output        Output cursor, forwarded to codepoint_write().
    outputSize    Remaining output size, forwarded to codepoint_write().
    bytesWritten  Byte counter. It is set to 0 when the sequence is empty;
                  otherwise each encoded size is ADDED to the value already
                  stored, so the caller decides the starting value.

    Returns 1 on success (including the empty sequence) and 0 as soon as
    codepoint_write() reports 0 bytes, which the original code treats as
    "not enough space". Sizes added before the failure stay in
    *bytesWritten.
*/
uint8_t stream_write(StreamState* state, char** output, size_t* outputSize, uint8_t* bytesWritten)
{
    uint8_t position = 0;

    if (state->current == 0)
    {
        /* Nothing to write */
        *bytesWritten = 0;
        return 1;
    }

    while (position < state->current)
    {
        uint8_t written = codepoint_write(state->codepoint[position], output, outputSize);

        if (written == 0)
        {
            /* Not enough space */
            return 0;
        }

        *bytesWritten = (uint8_t)(*bytesWritten + written);
        position++;
    }

    return 1;
}
/*
    Sort the current sequence by canonical combining class.

    Entries [0, state->current) of the codepoint, quick_check and
    canonical_combining_class arrays are reordered together, smallest
    class first. Only adjacent entries that are strictly out of order are
    exchanged, so entries with equal classes keep their relative order.

    Returns 0 when the sequence is empty and 1 once it is ordered.
*/
uint8_t stream_reorder(StreamState* state)
{
    uint8_t swapped;

    if (state->current == 0)
    {
        /* Nothing to do */
        return 0;
    }

    /* Repeat passes until the entire sequence is stable */
    do
    {
        uint8_t at;

        swapped = 0;

        for (at = 1; at < state->current; at++)
        {
            uint8_t before = (uint8_t)(at - 1);

            if (state->canonical_combining_class[before] > state->canonical_combining_class[at])
            {
                /* Exchange all three parallel entries of the adjacent pair */
                unicode_t held_cp = state->codepoint[at];
                uint8_t held_qc = state->quick_check[at];
                uint8_t held_ccc = state->canonical_combining_class[at];

                state->codepoint[at] = state->codepoint[before];
                state->quick_check[at] = state->quick_check[before];
                state->canonical_combining_class[at] = state->canonical_combining_class[before];

                state->codepoint[before] = held_cp;
                state->quick_check[before] = held_qc;
                state->canonical_combining_class[before] = held_ccc;

                swapped = 1;
            }
        }
    }
    while (swapped != 0);

    return 1;
}

View File

@ -0,0 +1,84 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_STREAMING_H_
#define _UTF8REWIND_INTERNAL_STREAMING_H_
/*!
\file
\brief Streaming interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
UAX15-D4. Stream-Safe Text Process
This is the process of producing a Unicode string in Stream-Safe Text Format by processing that string
from start to finish, inserting U+034F COMBINING GRAPHEME JOINER (CGJ) within long sequences of
non-starters. The exact position of the inserted CGJs are determined according to the following algorithm,
which describes the generation of an output string from an input string:
* If the input string is empty, return an empty output string.
* Set nonStarterCount to zero.
* For each code point C in the input string:
* Produce the NFKD decomposition S.
* If nonStarterCount plus the number of initial non-starters in S is greater than 30, append a CGJ to
the output string and set the nonStarterCount to zero.
* Append C to the output string.
* If there are no starters in S, increment nonStarterCount by the number of code points in S; otherwise,
set nonStarterCount to the number of trailing non-starters in S (which may be zero).
* Return the output string.
*/
/* Number of codepoints stream_read() buffers before it stops reading; when the buffer reaches this count it appends a COMBINING GRAPHEME JOINER. */
#define STREAM_SAFE_MAX 30
/* Capacity of the per-codepoint arrays in StreamState. */
#define STREAM_BUFFER_MAX 32
/* State shared by stream_initialize(), stream_read(), stream_write() and stream_reorder(). */
typedef struct {
/* Read cursor into the input; set by stream_initialize() and advanced by stream_read(). */
const char* src;
/* Bytes of input left at src; stream_read() returns 0 once this is zero. */
size_t src_size;
/* Reset to 0 by stream_read(). NOTE(review): presumably a cursor for code outside streaming.c - confirm. */
uint8_t index;
/* Number of buffered codepoints forming the current sequence; stream_write() and stream_reorder() process entries [0, current). */
uint8_t current;
/* Number of populated slots in the arrays below. */
uint8_t filled;
/* Set to 1 at the start of each sequence; cleared by stream_read() when a combining class is lower than the one before it. */
uint8_t stable;
/* Value returned by the most recent codepoint_read() call; stream_read() uses it to advance src before the next read. */
uint8_t last_length;
/* Parallel arrays, one entry per buffered codepoint. */
unicode_t codepoint[STREAM_BUFFER_MAX];
/* Value looked up with PROPERTY_GET for each codepoint. */
uint8_t quick_check[STREAM_BUFFER_MAX];
/* Value looked up with PROPERTY_GET_CCC for each codepoint. */
uint8_t canonical_combining_class[STREAM_BUFFER_MAX];
} StreamState;
/*!
\brief Zero \p state and bind it to an input buffer.

\return 1 on success, 0 when \p input is null or \p inputSize is zero.
*/
uint8_t stream_initialize(StreamState* state, const char* input, size_t inputSize);
/*!
\brief Buffer the next sequence of codepoints together with their
quick-check and canonical combining class values.

\return 1 when codepoints were buffered, 0 when no input is left or either
property table is null.
*/
uint8_t stream_read(StreamState* state, const size_t* propertyIndex, const uint8_t* propertyData);
/*!
\brief Encode the current sequence as UTF-8 into \p output.

Encoded sizes are added to \p bytesWritten; it is only reset to 0 when the
sequence is empty.

\return 1 on success, 0 when a codepoint could not be written.
*/
uint8_t stream_write(StreamState* state, char** output, size_t* outputSize, uint8_t* bytesWritten);
/*!
\brief Sort the current sequence by canonical combining class, smallest
first.

\return 0 when the sequence is empty, 1 otherwise.
*/
uint8_t stream_reorder(StreamState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_STREAMING_H_ */

11739
third_party/utf8rewind/unicodedatabase.c vendored Normal file

File diff suppressed because it is too large Load Diff

119
third_party/utf8rewind/unicodedatabase.h vendored Normal file
View File

@ -0,0 +1,119 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_UNICODEDATABASE_H_
#define _UTF8REWIND_UNICODEDATABASE_H_
/*!
\file
\brief Unicode property database.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
Entry of the record tables declared below (NFD, NFKD, uppercase, lowercase, titlecase).
NOTE(review): field meanings are inferred from their names only - confirm against the lookup code.
*/
typedef struct {
/* Presumably the codepoint this record applies to. */
unicode_t codepoint;
/* Presumably a length and an offset packed into one 32-bit value; the bit layout is not visible here. */
uint32_t length_and_offset;
} DecompositionRecord;
/*
Entry of the composition table declared below (UnicodeCompositionRecordPtr).
NOTE(review): field meanings are inferred from their names only - confirm against the lookup code.
*/
typedef struct {
/* 64-bit lookup key; how it is built is not visible here. */
uint64_t key;
/* Codepoint associated with the key. */
unicode_t value;
} CompositionRecord;
/*
Property tables, declared as one index pointer and one data pointer per property.
NOTE(review): the definitions are not visible here; presumably these pairs are what the PROPERTY_GET macros consume - confirm.
*/
extern const size_t* GeneralCategoryIndexPtr;
extern const uint32_t* GeneralCategoryDataPtr;
extern const size_t* CanonicalCombiningClassIndexPtr;
extern const uint8_t* CanonicalCombiningClassDataPtr;
extern const size_t* QuickCheckCaseMappedIndexPtr;
extern const uint8_t* QuickCheckCaseMappedDataPtr;
extern const size_t* QuickCheckNFCIndexPtr;
extern const uint8_t* QuickCheckNFCDataPtr;
extern const size_t* QuickCheckNFDIndexPtr;
extern const uint8_t* QuickCheckNFDDataPtr;
extern const size_t* QuickCheckNFKCIndexPtr;
extern const uint8_t* QuickCheckNFKCDataPtr;
extern const size_t* QuickCheckNFKDIndexPtr;
extern const uint8_t* QuickCheckNFKDDataPtr;
/* Record tables, each declared as an element count plus a pointer to the first record. */
extern const size_t UnicodeNFDRecordCount;
extern const DecompositionRecord* UnicodeNFDRecordPtr;
extern const size_t UnicodeNFKDRecordCount;
extern const DecompositionRecord* UnicodeNFKDRecordPtr;
extern const size_t UnicodeUppercaseRecordCount;
extern const DecompositionRecord* UnicodeUppercaseRecordPtr;
extern const size_t UnicodeLowercaseRecordCount;
extern const DecompositionRecord* UnicodeLowercaseRecordPtr;
extern const size_t UnicodeTitlecaseRecordCount;
extern const DecompositionRecord* UnicodeTitlecaseRecordPtr;
extern const size_t UnicodeCompositionRecordCount;
extern const CompositionRecord* UnicodeCompositionRecordPtr;
/* Tables declared as Index1 / Index2 / Data triples. NOTE(review): presumably two-stage lookup tables - confirm against the code that reads them. */
extern const uint32_t* NFDIndex1Ptr;
extern const uint32_t* NFDIndex2Ptr;
extern const uint32_t* NFDDataPtr;
extern const uint32_t* NFKDIndex1Ptr;
extern const uint32_t* NFKDIndex2Ptr;
extern const uint32_t* NFKDDataPtr;
extern const uint32_t* UppercaseIndex1Ptr;
extern const uint32_t* UppercaseIndex2Ptr;
extern const uint32_t* UppercaseDataPtr;
extern const uint32_t* LowercaseIndex1Ptr;
extern const uint32_t* LowercaseIndex2Ptr;
extern const uint32_t* LowercaseDataPtr;
extern const uint32_t* TitlecaseIndex1Ptr;
extern const uint32_t* TitlecaseIndex2Ptr;
extern const uint32_t* TitlecaseDataPtr;
extern const uint32_t* CaseFoldingIndex1Ptr;
extern const uint32_t* CaseFoldingIndex2Ptr;
extern const uint32_t* CaseFoldingDataPtr;
/* Raw character data blobs, each declared with its length. */
extern const char* CompressedStringData;
extern const size_t CompressedStringDataLength;
extern const char* DecompositionData;
extern const size_t DecompositionDataLength;
/*! \endcond */
#endif /* _UTF8REWIND_UNICODEDATABASE_H_ */

1429
third_party/utf8rewind/utf8rewind.c vendored Normal file

File diff suppressed because it is too large Load Diff

1870
third_party/utf8rewind/utf8rewind.h vendored Normal file

File diff suppressed because it is too large Load Diff