1877 lines
56 KiB
C
1877 lines
56 KiB
C
/*
|
|
Copyright (C) 2014-2016 Quinten Lansu
|
|
|
|
Permission is hereby granted, free of charge, to any person
|
|
obtaining a copy of this software and associated documentation
|
|
files (the "Software"), to deal in the Software without
|
|
restriction, including without limitation the rights to use,
|
|
copy, modify, merge, publish, distribute, sublicense, and/or
|
|
sell copies of the Software, and to permit persons to whom the
|
|
Software is furnished to do so, subject to the following
|
|
conditions:
|
|
|
|
The above copyright notice and this permission notice shall be
|
|
included in all copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef _UTF8REWIND_H_
|
|
#define _UTF8REWIND_H_
|
|
|
|
/*!
|
|
\file
|
|
\brief Public interface for UTF-8 functions.
|
|
|
|
`utf8rewind` is a system library written in C designed to extend the default
|
|
string handling functions with support for UTF-8 encoded text.
|
|
*/
|
|
|
|
/*!
|
|
\defgroup public Public interface
|
|
The public interface for the library.
|
|
|
|
\defgroup version Version information
|
|
Macros used to identify the version of the library.
|
|
|
|
\defgroup global-config Global configuration
|
|
Defines used for determining the global configuration of the system and your
|
|
application.
|
|
|
|
\defgroup errors Error codes
|
|
Values returned by functions on error.
|
|
|
|
\defgroup locales Locales
|
|
Values used by functions that change behavior based on the input locale.
|
|
|
|
\defgroup normalization Normalization flags
|
|
Flags used as input for #utf8normalize and the result of #utf8isnormalized.
|
|
|
|
\defgroup category Category flags
|
|
Flags to be used with #utf8iscategory, to check whether code points in a
|
|
string are part of that category.
|
|
|
|
\defgroup types Types
|
|
Custom type definitions used throughout the library.
|
|
*/
|
|
|
|
#include <locale.h>
|
|
#include <stddef.h>
|
|
#include <stdio.h>
|
|
#include <stdint.h>
|
|
#include <string.h>
|
|
#include <wchar.h>
|
|
|
|
#if defined(LINUX) || defined(OSX)
|
|
#include <strings.h>
|
|
#include <unistd.h>
|
|
#include <sys/types.h>
|
|
#endif
|
|
|
|
/*!
|
|
\addtogroup version
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_VERSION_MAKE
|
|
\brief Macro for creating a version number from a major, minor and bugfix
|
|
number.
|
|
*/
|
|
/*
	Packs a version triple into one integer: major * 10000 + minor * 100 +
	bugfix, e.g. (1, 5, 1) becomes 10501.

	The whole expansion is parenthesized so the result acts as a single
	operand wherever the macro is used. Without the outer parentheses an
	expression such as `UTF8_VERSION_MAKE(1, 5, 1) * 2` would only multiply
	the bugfix term.
*/
#define UTF8_VERSION_MAKE(_major, _minor, _bugfix) \
	(((_major) * 10000) + ((_minor) * 100) + (_bugfix))
|
|
|
|
/*!
|
|
\def UTF8_VERSION_MAJOR
|
|
\brief The major version number of this release.
|
|
*/
|
|
#define UTF8_VERSION_MAJOR 1
|
|
|
|
/*!
|
|
\def UTF8_VERSION_MINOR
|
|
\brief The minor version number of this release.
|
|
*/
|
|
#define UTF8_VERSION_MINOR 5
|
|
|
|
/*!
|
|
\def UTF8_VERSION_BUGFIX
|
|
\brief The bugfix version number of this release.
|
|
*/
|
|
#define UTF8_VERSION_BUGFIX 1
|
|
|
|
/*!
|
|
\def UTF8_VERSION
|
|
\brief The version number as an integer.
|
|
*/
|
|
#define UTF8_VERSION \
|
|
UTF8_VERSION_MAKE(UTF8_VERSION_MAJOR, UTF8_VERSION_MINOR, UTF8_VERSION_BUGFIX)
|
|
|
|
/*!
|
|
\def UTF8_VERSION_STRING
|
|
\brief The version number as a string.
|
|
*/
|
|
#define UTF8_VERSION_STRING "1.5.1"
|
|
|
|
/*!
|
|
\def UTF8_VERSION_GUARD
|
|
\brief Check if feature is supported by the current release.
|
|
*/
|
|
#define UTF8_VERSION_GUARD(_major, _minor, _bugfix) \
|
|
(UTF8_VERSION >= UTF8_VERSION_MAKE(_major, _minor, _bugfix))
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup errors
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_ERR_NONE
|
|
\brief No errors.
|
|
*/
|
|
#define UTF8_ERR_NONE (0)
|
|
|
|
/*!
|
|
\def UTF8_ERR_INVALID_DATA
|
|
\brief Input data is invalid.
|
|
*/
|
|
#define UTF8_ERR_INVALID_DATA (-1)
|
|
|
|
/*!
|
|
\def UTF8_ERR_INVALID_FLAG
|
|
\brief Input flag is invalid.
|
|
*/
|
|
#define UTF8_ERR_INVALID_FLAG (-2)
|
|
|
|
/*!
|
|
\def UTF8_ERR_NOT_ENOUGH_SPACE
|
|
\brief Not enough space in buffer to store result.
|
|
*/
|
|
#define UTF8_ERR_NOT_ENOUGH_SPACE (-3)
|
|
|
|
/*!
|
|
\def UTF8_ERR_OVERLAPPING_PARAMETERS
|
|
\brief Input and output buffers overlap in memory.
|
|
*/
|
|
#define UTF8_ERR_OVERLAPPING_PARAMETERS (-4)
|
|
|
|
/*!
|
|
\def UTF8_ERR_INVALID_LOCALE
|
|
\brief Invalid locale specified.
|
|
*/
|
|
#define UTF8_ERR_INVALID_LOCALE (-5)
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup locales
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_LOCALE_DEFAULT
|
|
\brief Used for text unaffected by changes in locale.
|
|
*/
|
|
#define UTF8_LOCALE_DEFAULT 0
|
|
|
|
/*!
|
|
\def UTF8_LOCALE_LITHUANIAN
|
|
\brief Changes behavior of the case mapping implementation when processing
|
|
specific code points. For more information, see here:
|
|
ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
|
*/
|
|
#define UTF8_LOCALE_LITHUANIAN 1
|
|
|
|
/*!
|
|
\def UTF8_LOCALE_TURKISH_AND_AZERI_LATIN
|
|
\brief Changes behavior of the case mapping implementation when processing
|
|
specific code points. For more information, see here:
|
|
ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
|
*/
|
|
#define UTF8_LOCALE_TURKISH_AND_AZERI_LATIN 2
|
|
|
|
/*!
|
|
\def UTF8_LOCALE_MAXIMUM
|
|
\brief Terminal value for locales. Valid locales do not exceed this value.
|
|
*/
|
|
#define UTF8_LOCALE_MAXIMUM 3
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup normalization
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZE_COMPOSE
|
|
\brief Normalize input to Normalization Form C (NFC).
|
|
*/
|
|
#define UTF8_NORMALIZE_COMPOSE 0x00000001
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZE_DECOMPOSE
|
|
\brief Normalize input to Normalization Form D (NFD).
|
|
*/
|
|
#define UTF8_NORMALIZE_DECOMPOSE 0x00000002
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZE_COMPATIBILITY
|
|
\brief Changes Normalization Form from NFC to NFKC or from NFD to NFKD.
|
|
*/
|
|
#define UTF8_NORMALIZE_COMPATIBILITY 0x00000004
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZATION_RESULT_YES
|
|
\brief Text is stable and does not need to be normalized.
|
|
*/
|
|
#define UTF8_NORMALIZATION_RESULT_YES (0)
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZATION_RESULT_MAYBE
|
|
\brief Text is unstable, but normalization may be skipped.
|
|
*/
|
|
#define UTF8_NORMALIZATION_RESULT_MAYBE (1)
|
|
|
|
/*!
|
|
\def UTF8_NORMALIZATION_RESULT_NO
|
|
\brief Text is unstable and must be normalized.
|
|
*/
|
|
#define UTF8_NORMALIZATION_RESULT_NO (2)
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup category
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER_UPPERCASE
|
|
\brief Uppercase letter code points, Lu in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER_UPPERCASE 0x00000001
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER_LOWERCASE
|
|
\brief Lowercase letter code points, Ll in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER_LOWERCASE 0x00000002
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER_TITLECASE
|
|
\brief Titlecase letter code points, Lt in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER_TITLECASE 0x00000004
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER_MODIFIER
|
|
\brief Modifier letter code points, Lm in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER_MODIFIER 0x00000008
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER_OTHER
|
|
\brief Other letter code points, Lo in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER_OTHER 0x00000010
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_LETTER
|
|
\brief Combined flag for all letter categories.
|
|
*/
|
|
#define UTF8_CATEGORY_LETTER \
|
|
(UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
|
|
UTF8_CATEGORY_LETTER_TITLECASE | UTF8_CATEGORY_LETTER_MODIFIER | \
|
|
UTF8_CATEGORY_LETTER_OTHER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_CASE_MAPPED
|
|
\brief Combined flag for all letter categories with case mapping.
|
|
*/
|
|
#define UTF8_CATEGORY_CASE_MAPPED \
|
|
(UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
|
|
UTF8_CATEGORY_LETTER_TITLECASE)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_MARK_NON_SPACING
|
|
\brief Non-spacing mark code points, Mn in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_MARK_NON_SPACING 0x00000020
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_MARK_SPACING
|
|
\brief Spacing mark code points, Mc in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_MARK_SPACING 0x00000040
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_MARK_ENCLOSING
|
|
\brief Enclosing mark code points, Me in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_MARK_ENCLOSING 0x00000080
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_MARK
|
|
\brief Combined flag for all mark categories.
|
|
*/
|
|
#define UTF8_CATEGORY_MARK \
|
|
(UTF8_CATEGORY_MARK_NON_SPACING | UTF8_CATEGORY_MARK_SPACING | \
|
|
UTF8_CATEGORY_MARK_ENCLOSING)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_NUMBER_DECIMAL
|
|
\brief Decimal number code points, Nd in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_NUMBER_DECIMAL 0x00000100
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_NUMBER_LETTER
|
|
\brief Letter number code points, Nl in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_NUMBER_LETTER 0x00000200
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_NUMBER_OTHER
|
|
\brief Other number code points, No in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_NUMBER_OTHER 0x00000400
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_NUMBER
|
|
\brief Combined flag for all number categories.
|
|
*/
|
|
#define UTF8_CATEGORY_NUMBER \
|
|
(UTF8_CATEGORY_NUMBER_DECIMAL | UTF8_CATEGORY_NUMBER_LETTER | \
|
|
UTF8_CATEGORY_NUMBER_OTHER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_CONNECTOR
|
|
\brief Connector punctuation category, Pc in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_CONNECTOR 0x00000800
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_DASH
|
|
\brief Dash punctuation category, Pd in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_DASH 0x00001000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_OPEN
|
|
\brief Open punctuation category, Ps in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_OPEN 0x00002000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_CLOSE
|
|
\brief Close punctuation category, Pe in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_CLOSE 0x00004000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_INITIAL
|
|
\brief Initial punctuation category, Pi in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_INITIAL 0x00008000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_FINAL
|
|
\brief Final punctuation category, Pf in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_FINAL 0x00010000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION_OTHER
|
|
\brief Other punctuation category, Po in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION_OTHER 0x00020000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PUNCTUATION
|
|
\brief Combined flag for all punctuation categories.
|
|
*/
|
|
#define UTF8_CATEGORY_PUNCTUATION \
|
|
(UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_PUNCTUATION_DASH | \
|
|
UTF8_CATEGORY_PUNCTUATION_OPEN | UTF8_CATEGORY_PUNCTUATION_CLOSE | \
|
|
UTF8_CATEGORY_PUNCTUATION_INITIAL | UTF8_CATEGORY_PUNCTUATION_FINAL | \
|
|
UTF8_CATEGORY_PUNCTUATION_OTHER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SYMBOL_MATH
|
|
\brief Math symbol category, Sm in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SYMBOL_MATH 0x00040000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SYMBOL_CURRENCY
|
|
\brief Currency symbol category, Sc in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SYMBOL_CURRENCY 0x00080000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SYMBOL_MODIFIER
|
|
\brief Modifier symbol category, Sk in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SYMBOL_MODIFIER 0x00100000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SYMBOL_OTHER
|
|
\brief Other symbol category, So in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SYMBOL_OTHER 0x00200000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SYMBOL
|
|
\brief Combined flag for all symbol categories.
|
|
*/
|
|
#define UTF8_CATEGORY_SYMBOL \
|
|
(UTF8_CATEGORY_SYMBOL_MATH | UTF8_CATEGORY_SYMBOL_CURRENCY | \
|
|
UTF8_CATEGORY_SYMBOL_MODIFIER | UTF8_CATEGORY_SYMBOL_OTHER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SEPARATOR_SPACE
|
|
\brief Space separator category, Zs in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SEPARATOR_SPACE 0x00400000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SEPARATOR_LINE
|
|
\brief Line separator category, Zl in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SEPARATOR_LINE 0x00800000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SEPARATOR_PARAGRAPH
|
|
\brief Paragraph separator category, Zp in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SEPARATOR_PARAGRAPH 0x01000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SEPARATOR
|
|
\brief Combined flag for all separator categories.
|
|
*/
|
|
#define UTF8_CATEGORY_SEPARATOR \
|
|
(UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_SEPARATOR_LINE | \
|
|
UTF8_CATEGORY_SEPARATOR_PARAGRAPH)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_CONTROL
|
|
\brief Control category, Cc in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_CONTROL 0x02000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_FORMAT
|
|
\brief Format category, Cf in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_FORMAT 0x04000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_SURROGATE
|
|
\brief Surrogate category, Cs in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_SURROGATE 0x08000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_PRIVATE_USE
|
|
\brief Private use category, Co in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_PRIVATE_USE 0x10000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_UNASSIGNED
|
|
\brief Unassigned category, Cn in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_UNASSIGNED 0x20000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_COMPATIBILITY
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
functions, not found in the Unicode database.
|
|
*/
|
|
#define UTF8_CATEGORY_COMPATIBILITY 0x40000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER
|
|
\brief Flag used for checking only the general category of code points at
|
|
the start of a grapheme cluster.
|
|
*/
|
|
#define UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER 0x80000000
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISCNTRL
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`iscntrl` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISCNTRL \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_CONTROL)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISPRINT
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isprint` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISPRINT \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
|
|
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL | \
|
|
UTF8_CATEGORY_SEPARATOR)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISSPACE
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isspace` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISSPACE \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_SEPARATOR_SPACE)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISBLANK
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isblank` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISBLANK \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_PRIVATE_USE)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISGRAPH
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isgraph` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISGRAPH \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
|
|
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISPUNCT
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`ispunct` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISPUNCT \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISALNUM
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isalnum` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISALNUM \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISALPHA
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isalpha` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISALPHA \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISUPPER
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isupper` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISUPPER \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER_UPPERCASE)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISLOWER
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`islower` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISLOWER \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_LETTER_LOWERCASE)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISDIGIT
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isdigit` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISDIGIT \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_NUMBER)
|
|
|
|
/*!
|
|
\def UTF8_CATEGORY_ISXDIGIT
|
|
\brief Flag used for maintaining backwards compatibility with POSIX
|
|
`isxdigit` function.
|
|
*/
|
|
#define UTF8_CATEGORY_ISXDIGIT \
|
|
(UTF8_CATEGORY_COMPATIBILITY | \
|
|
UTF8_CATEGORY_NUMBER | UTF8_CATEGORY_PRIVATE_USE)
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup global-config
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\def UTF8_WCHAR_SIZE
|
|
\brief Specifies the size of the `wchar_t` type. On Windows this is two
|
|
bytes, on POSIX systems it is four. If not specified on the command line,
|
|
the compiler tries to automatically determine the size of the `wchar_t` type
|
|
based on the environment.
|
|
*/
|
|
|
|
#ifndef UTF8_WCHAR_SIZE
	/*
		Only reached when the build did not define UTF8_WCHAR_SIZE itself.
		Selects 4 when any of the compiler/library hints reports a `wchar_t`
		wider than 16 bits, and 2 otherwise.

		The 16-bit limit is spelled as the literal 0xFFFF instead of
		UINT16_MAX. Inside `#if`, an identifier that is not defined as a
		macro is replaced by 0, so if UINT16_MAX is not visible (for example
		when <stdint.h> only exposes its limit macros to C++ code that
		defines __STDC_LIMIT_MACROS) the comparison would degrade to
		`WCHAR_MAX > 0` and wrongly select 4 on a 16-bit `wchar_t`.
	*/
	#if (__SIZEOF_WCHAR_T__ == 4) || (WCHAR_MAX > 0xFFFF) || (__WCHAR_MAX__ > 0xFFFF)
		#define UTF8_WCHAR_SIZE (4)
	#else
		#define UTF8_WCHAR_SIZE (2)
	#endif
#endif
|
|
|
|
#if (UTF8_WCHAR_SIZE == 4)
|
|
/*!
|
|
\def UTF8_WCHAR_UTF32
|
|
\brief The `wchar_t` type is treated as UTF-32 (four byte fixed
|
|
encoding).
|
|
*/
|
|
#define UTF8_WCHAR_UTF32 (1)
|
|
#elif (UTF8_WCHAR_SIZE == 2)
|
|
/*!
|
|
\def UTF8_WCHAR_UTF16
|
|
\brief The `wchar_t` type is treated as UTF-16 (two byte variable
|
|
length encoding).
|
|
*/
|
|
#define UTF8_WCHAR_UTF16 (1)
|
|
#else
|
|
#error Invalid size for wchar_t type.
|
|
#endif
|
|
|
|
/*!
|
|
\def UTF8_API
|
|
\brief Calling convention for public functions.
|
|
*/
|
|
|
|
#ifndef UTF8_API
|
|
#ifdef __cplusplus
|
|
#define UTF8_API extern "C"
|
|
#else
|
|
#define UTF8_API
|
|
#endif
|
|
#endif
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup types
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\var utf16_t
|
|
\brief UTF-16 encoded code point.
|
|
*/
|
|
typedef uint16_t utf16_t;
|
|
|
|
/*!
|
|
\var unicode_t
|
|
\brief UTF-32 encoded code point.
|
|
*/
|
|
typedef uint32_t unicode_t;
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
/*!
|
|
\addtogroup public
|
|
\{
|
|
*/
|
|
|
|
/*!
|
|
\brief Get the length in code points of a UTF-8 encoded string.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
uint8_t CheckPassword(const char* password)
|
|
{
|
|
size_t length = utf8len(password);
|
|
return (length == utf8len("hunter2"));
|
|
}
|
|
\endcode
|
|
|
|
\param[in] text UTF-8 encoded string.
|
|
|
|
\return Length in code points.
|
|
*/
|
|
UTF8_API size_t utf8len(const char* text);
|
|
|
|
/*!
|
|
\brief Convert a UTF-16 encoded string to a UTF-8 encoded string.
|
|
|
|
\note This function should only be called directly if you are positive that
|
|
you are working with UTF-16 encoded text. If you're working with wide
|
|
strings, take a look at #widetoutf8 instead.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
uint8_t Player_SetNameUtf16(const utf16_t* name, size_t nameSize)
|
|
{
|
|
char buffer[256];
|
|
size_t buffer_size = 255;
|
|
size_t converted_size;
|
|
int32_t errors;
|
|
|
|
if ((converted_size = utf16toutf8(name, nameSize, buffer, buffer_size, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
return 0;
|
|
}
|
|
buffer[converted_size] = 0;
|
|
|
|
return Player_SetName(buffer);
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-16 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf32toutf8
|
|
\sa widetoutf8
|
|
*/
|
|
UTF8_API size_t utf16toutf8(const utf16_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Convert a UTF-32 encoded string to a UTF-8 encoded string.
|
|
|
|
\note This function should only be called directly if you are positive that
|
|
you are working with UTF-32 encoded text. If you're working with wide
|
|
strings, take a look at #widetoutf8 instead.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
uint8_t Database_ExecuteQuery_Unicode(const unicode_t* query, size_t querySize)
|
|
{
|
|
char* converted = NULL;
|
|
size_t converted_size;
|
|
uint8_t result = 0;
|
|
int32_t errors;
|
|
|
|
if ((converted_size = utf32toutf8(query, querySize, NULL, 0, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted = (char*)malloc(converted_size + 1);
|
|
utf32toutf8(query, querySize, converted, converted_size, NULL);
|
|
converted[converted_size] = 0;
|
|
|
|
result = Database_ExecuteQuery(converted);
|
|
|
|
cleanup:
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-32 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf16toutf8
|
|
\sa widetoutf8
|
|
*/
|
|
UTF8_API size_t utf32toutf8(const unicode_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Convert a wide string to a UTF-8 encoded string.
|
|
|
|
Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded.
|
|
This function takes a wide string as input and automatically calls the
|
|
correct conversion function.
|
|
|
|
This allows for a cross-platform treatment of wide text and is preferable to
|
|
using the UTF-16 or UTF-32 versions directly.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
texture_t Texture_Load_Wide(const wchar_t* input)
|
|
{
|
|
char* converted = NULL;
|
|
size_t converted_size;
|
|
size_t input_size = wcslen(input) * sizeof(wchar_t);
|
|
texture_t result = NULL;
|
|
int32_t errors;
|
|
|
|
if ((converted_size = widetoutf8(input, input_size, NULL, 0, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted = (char*)malloc(converted_size + 1);
|
|
widetoutf8(input, input_size, converted, converted_size, NULL);
|
|
converted[converted_size] = 0;
|
|
|
|
result = Texture_Load(converted);
|
|
|
|
cleanup:
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input Wide-encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8towide
|
|
\sa utf16toutf8
|
|
\sa utf32toutf8
|
|
*/
|
|
UTF8_API size_t widetoutf8(const wchar_t* input, size_t inputSize, char* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Convert a UTF-8 encoded string to a UTF-16 encoded string.
|
|
|
|
\note This function should only be called directly if you are positive that
|
|
you *must* convert to UTF-16, independent of platform. If you're working
|
|
with wide strings, take a look at #utf8towide instead.
|
|
|
|
Erroneous byte sequences such as missing or illegal bytes or overlong
|
|
encoding of code points (e.g. using five bytes to encode a sequence that
|
|
can be represented by two bytes) are converted to the replacement
|
|
character U+FFFD.
|
|
|
|
Code points outside the Basic Multilingual Plane (BMP) will be converted to
|
|
surrogate pairs, which use four bytes instead of two.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void Font_DrawText(int x, int y, const char* text)
|
|
{
|
|
utf16_t buffer[256];
|
|
size_t buffer_size = 255 * sizeof(utf16_t);
|
|
int32_t errors;
|
|
|
|
size_t converted_size = utf8toutf16(text, strlen(text), buffer, buffer_size, &errors);
|
|
if (converted_size > 0 &&
|
|
errors == UTF8_ERR_NONE)
|
|
{
|
|
Legacy_DrawText(g_FontCurrent, x, y, (unsigned short*)buffer, converted_size / sizeof(utf16_t));
|
|
}
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8towide
|
|
\sa utf8toutf32
|
|
*/
|
|
UTF8_API size_t utf8toutf16(const char* input, size_t inputSize, utf16_t* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Convert a UTF-8 encoded string to a UTF-32 encoded string.
|
|
|
|
\note This function should only be called directly if you are positive that
|
|
you *must* convert to UTF-32, independent of platform. If you're working
|
|
with wide strings, take a look at #utf8towide instead.
|
|
|
|
Erroneous byte sequences such as missing or illegal bytes or overlong
|
|
encoding of code points (e.g. using five bytes to encode a sequence that
|
|
can be represented by two bytes) are converted to the replacement
|
|
character U+FFFD.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void TextField_AddCharacter(const char* encoded)
|
|
{
|
|
unicode_t code_point = 0;
|
|
int32_t errors;
|
|
|
|
utf8toutf32(encoded, strlen(encoded), &code_point, sizeof(unicode_t), &errors);
|
|
if (errors == UTF8_ERR_NONE)
|
|
{
|
|
TextField_AddCodePoint(code_point);
|
|
}
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8towide
|
|
\sa utf8toutf16
|
|
*/
|
|
UTF8_API size_t utf8toutf32(const char* input, size_t inputSize, unicode_t* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Convert a UTF-8 encoded string to a wide string.
|
|
|
|
Depending on the platform, wide strings are either UTF-16 or UTF-32 encoded.
|
|
This function takes a UTF-8 encoded string as input and automatically calls
|
|
the correct conversion function.
|
|
|
|
This allows for a cross-platform treatment of wide text and is preferable to
|
|
using the UTF-16 or UTF-32 versions directly.
|
|
|
|
Erroneous byte sequences such as missing or illegal bytes or overlong
|
|
encoding of code points (e.g. using five bytes to encode a sequence that
|
|
can be represented by two bytes) are converted to the replacement
|
|
character U+FFFD.
|
|
|
|
\note Code points outside the Basic Multilingual Plane (BMP) are converted
|
|
to surrogate pairs when using UTF-16. This means that strings containing
|
|
characters outside the BMP converted on a platform with UTF-32 wide strings
|
|
are *not* compatible with platforms with UTF-16 wide strings.
|
|
|
|
\par Hence, it is preferable to store all data as UTF-8 and only convert to
|
|
wide strings when required by a third-party interface.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void Window_SetTitle(void* windowHandle, const char* text)
|
|
{
|
|
size_t input_size = strlen(text);
|
|
wchar_t* converted = NULL;
|
|
size_t converted_size;
|
|
int32_t errors;
|
|
|
|
converted_size = utf8towide(text, input_size, NULL, 0, &errors);
|
|
if (converted_size == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted = (wchar_t*)malloc(converted_size + sizeof(wchar_t));
|
|
utf8towide(text, input_size, converted, converted_size, NULL);
|
|
converted[converted_size / sizeof(wchar_t)] = 0;
|
|
|
|
SetWindowTextW((HWND)windowHandle, converted);
|
|
|
|
cleanup:
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa widetoutf8
|
|
\sa utf8toutf16
|
|
\sa utf8toutf32
|
|
*/
|
|
UTF8_API size_t utf8towide(const char* input, size_t inputSize, wchar_t* target, size_t targetSize, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Seek into a UTF-8 encoded string.
|
|
|
|
Working with UTF-8 encoded strings can be tricky due to the nature of the
|
|
variable-length encoding. Because one character no longer equals one byte,
|
|
it can be difficult to skip around in a UTF-8 encoded string without
|
|
decoding the code points.
|
|
|
|
This function provides an interface similar to `fseek` in order to enable
|
|
skipping to another part of the string.
|
|
|
|
\note `textStart` must come before `text` in memory when seeking from the
|
|
current or end position.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
const char* text = "Press \xE0\x80\x13 to continue.";
|
|
char fixed[1024];
|
|
const char* commandStart;
|
|
const char* commandEnd;
|
|
|
|
memset(fixed, 0, sizeof(fixed));
|
|
|
|
commandStart = strstr(text, "\xE0\x80\x13");
|
|
if (commandStart == 0)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
strncpy(fixed, text, commandStart - text);
|
|
strcat(fixed, "ENTER");
|
|
|
|
commandEnd = utf8seek(commandStart, strlen(commandStart), text, 1, SEEK_CUR);
|
|
if (commandEnd != commandStart)
|
|
{
|
|
strcat(fixed, commandEnd);
|
|
}
|
|
\endcode
|
|
|
|
\param[in] text Input string.
|
|
\param[in] textSize Size of the complete input string in bytes, starting from `textStart`.
|
|
\param[in] textStart Start of input string.
|
|
\param[in] offset Requested offset in code points.
|
|
\param[in] direction Direction to seek in.
|
|
\arg `SEEK_SET` Offset is from the start of the string.
|
|
\arg `SEEK_CUR` Offset is from the current position of the string.
|
|
\arg `SEEK_END` Offset is from the end of the string.
|
|
|
|
\return Pointer to offset string or no change on error.
|
|
|
|
\sa utf8iscategory
|
|
*/
|
|
UTF8_API const char* utf8seek(const char* text, size_t textSize, const char* textStart, off_t offset, int direction);
|
|
|
|
/*!
|
|
\brief Returns the environment's locale as an enum value.
|
|
|
|
This function retrieves the (thread-local) environment locale as an enum
|
|
value in the \ref locales "list of locales". This value can be used with
|
|
functions in this library that change behavior on certain inputs, depending
|
|
on the specified locale.
|
|
|
|
Unfortunately, no cross-platform way of setting and retrieving the system
|
|
locale is available without adding dependencies to the library. Please refer
|
|
to your operating system's manual to determine how to setup the system
|
|
locale on your target system.
|
|
|
|
\warning This function should not be used as a replacement for platform-
|
|
specific methods for retrieving the locale. Its intended usage is to
|
|
"guess" the desired locale by looking at the system locale.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
uint8_t Employee_PrintNames(const char** names, size_t nameCount)
|
|
{
|
|
size_t locale = utf8envlocale();
|
|
char buffer[256];
|
|
size_t buffer_size = 255;
|
|
int32_t errors;
|
|
size_t i;
|
|
|
|
for (i = 0; i < nameCount; ++i)
|
|
{
|
|
size_t buffer_filled;
|
|
|
|
memset(buffer, 0, buffer_size);
|
|
|
|
if ((buffer_filled = utf8toupper(names[i], strlen(names[i]), buffer, buffer_size, locale, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
Log_Print(buffer, buffer_filled);
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
\endcode
|
|
|
|
\return A specific \ref locales "locale" or #UTF8_LOCALE_DEFAULT.
|
|
|
|
\sa utf8toupper
|
|
\sa utf8tolower
|
|
\sa utf8totitle
|
|
\sa utf8casefold
|
|
*/
|
|
/* Declared with an explicit (void) so that this is a real prototype in C; an empty parameter list would leave the arguments unchecked. */
UTF8_API size_t utf8envlocale(void);
|
|
|
|
/*!
|
|
\brief Convert UTF-8 encoded text to uppercase.
|
|
|
|
This function allows conversion of UTF-8 encoded strings to uppercase
|
|
without first changing the encoding to UTF-32. Conversion is fully compliant
|
|
with the Unicode 7.0 standard.
|
|
|
|
Although most code points can be converted in-place, there are notable
|
|
exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to
|
|
"U+0053 U+0053" (LATIN CAPITAL LETTER S and LATIN CAPITAL LETTER S) when
|
|
converted to uppercase. Therefore, it is advised to first determine the size
|
|
in bytes of the output by calling the function with a NULL output buffer.
|
|
|
|
Only a handful of scripts make a distinction between upper- and lowercase.
|
|
In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic,
|
|
a few historic or archaic scripts have case. The vast majority of scripts
|
|
do not have case distinctions.
|
|
|
|
\note Case mapping is not reversible. That is, `toUpper(toLower(x))
|
|
!= toLower(toUpper(x))`.
|
|
|
|
\warning Certain code points (or combinations of code points) apply rules
|
|
based on the locale. For more information about these exceptional
|
|
code points, please refer to the Unicode standard:
|
|
ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
|
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void Button_Draw(int32_t x, int32_t y, const char* text)
|
|
{
|
|
size_t input_size = strlen(text);
|
|
char* converted = NULL;
|
|
size_t converted_size;
|
|
int32_t text_box_width, text_box_height;
|
|
int32_t errors;
|
|
|
|
if ((converted_size = utf8toupper(text, input_size, NULL, 0, UTF8_LOCALE_DEFAULT, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted = (char*)malloc(converted_size + 1);
|
|
|
|
if (converted == NULL ||
|
|
utf8toupper(text, input_size, converted, converted_size, UTF8_LOCALE_DEFAULT, &errors) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted[converted_size] = 0;
|
|
|
|
Font_GetTextDimensions(converted, &text_box_width, &text_box_height);
|
|
|
|
Draw_BoxFilled(x - 4, y - 4, text_box_width + 8, text_box_height + 8, 0x088A08);
|
|
Draw_BoxOutline(x - 4, y - 4, text_box_width + 8, text_box_height + 8, 0xA9F5A9);
|
|
Font_DrawText(x + 2, y + 1, converted, 0x000000);
|
|
Font_DrawText(x, y, converted, 0xFFFFFF);
|
|
|
|
cleanup:
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[in] locale Enables locale-specific behavior in the implementation. \ref locales "List of valid locales."
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed to contain output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_INVALID_LOCALE Invalid locale specified.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8tolower
|
|
\sa utf8totitle
|
|
\sa utf8casefold
|
|
*/
|
|
/* NOTE(review): `no_replacement` has no entry in the parameter list of the documentation block above, and the example there calls utf8toupper with six arguments. Confirm its meaning and document it. */
UTF8_API size_t utf8toupper(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors, int no_replacement);
|
|
|
|
/*!
|
|
\brief Convert UTF-8 encoded text to lowercase.
|
|
|
|
This function allows conversion of UTF-8 encoded strings to lowercase
|
|
without first changing the encoding to UTF-32. Conversion is fully compliant
|
|
with the Unicode 7.0 standard.
|
|
|
|
Although most code points can be converted to lowercase in-place, there are
|
|
notable exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT
|
|
ABOVE) maps to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT
|
|
ABOVE) when converted to lowercase. Therefore, it is advised to first
|
|
determine the size in bytes of the output by calling the function with a
|
|
NULL output buffer.
|
|
|
|
Only a handful of scripts make a distinction between upper- and lowercase.
|
|
In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic,
|
|
a few historic or archaic scripts have case. The vast majority of scripts do
|
|
not have case distinctions.
|
|
|
|
\note Case mapping is not reversible. That is, `toUpper(toLower(x))
|
|
!= toLower(toUpper(x))`.
|
|
|
|
\warning Certain code points (or combinations of code points) apply rules
|
|
based on the locale. For more information about these exceptional
|
|
code points, please refer to the Unicode standard:
|
|
ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
author_t* Author_ByName(const char* name)
|
|
{
|
|
author_t* result = NULL;
|
|
size_t name_size = strlen(name);
|
|
char* converted = NULL;
|
|
size_t converted_size;
|
|
int32_t errors;
|
|
size_t i;
|
|
|
|
if ((converted_size = utf8tolower(name, name_size, NULL, 0, UTF8_LOCALE_DEFAULT, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted = (char*)malloc(converted_size + 1);
|
|
|
|
if (converted == NULL ||
|
|
utf8tolower(name, name_size, converted, converted_size, UTF8_LOCALE_DEFAULT, &errors) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
goto cleanup;
|
|
}
|
|
|
|
converted[converted_size] = 0;
|
|
|
|
for (i = 0; i < g_AuthorCount; ++i)
|
|
{
|
|
if (!strcmp(g_Author[i].name, converted))
|
|
{
|
|
result = &g_Author[i];
|
|
break;
|
|
}
|
|
}
|
|
|
|
cleanup:
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
return result;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[in] locale Enables locale-specific behavior in the implementation. \ref locales "List of valid locales."
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_INVALID_LOCALE Invalid locale specified.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8toupper
|
|
\sa utf8totitle
|
|
\sa utf8casefold
|
|
*/
|
|
/* NOTE(review): `no_replacement` has no entry in the parameter list of the documentation block above, and the example there calls utf8tolower with six arguments. Confirm its meaning and document it. */
UTF8_API size_t utf8tolower(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors, int no_replacement);
|
|
|
|
/*!
|
|
\brief Convert UTF-8 encoded text to titlecase.
|
|
|
|
This function allows conversion of UTF-8 encoded strings to titlecase
|
|
without first changing the encoding to UTF-32. Conversion is fully compliant
|
|
with the Unicode 7.0 standard.
|
|
|
|
Titlecase requires a bit more explanation than uppercase and lowercase,
|
|
because it is not a common text transformation. Titlecase uses uppercase
|
|
for the first letter of each word and lowercase for the rest. Words are
|
|
defined as "collections of code points with general category Lu, Ll, Lt, Lm
|
|
or Lo according to the Unicode database".
|
|
|
|
Effectively, any type of punctuation can break up a word, even if this is
|
|
not grammatically valid. This happens because the titlecasing algorithm
|
|
does not and cannot take grammar rules into account.
|
|
|
|
Text | Titlecase
|
|
-------------------------------------|-------------------------------------
|
|
The running man | The Running Man
|
|
NATO Alliance | Nato Alliance
|
|
You're amazing at building libraries | You'Re Amazing At Building Libraries
|
|
|
|
Although most code points can be converted to titlecase in-place, there are
|
|
notable exceptions. For example, U+00DF (LATIN SMALL LETTER SHARP S) maps to
|
|
"U+0053 U+0073" (LATIN CAPITAL LETTER S and LATIN SMALL LETTER S) when
|
|
converted to titlecase. Therefore, it is advised to first determine the size
|
|
in bytes of the output by calling the function with a NULL output buffer.
|
|
|
|
Only a handful of scripts make a distinction between upper- and lowercase.
|
|
In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic,
|
|
a few historic or archaic scripts have case. The vast majority of scripts
|
|
do not have case distinctions.
|
|
|
|
\note Case mapping is not reversible. That is, `toUpper(toLower(x))
|
|
!= toLower(toUpper(x))`.
|
|
|
|
\warning Certain code points (or combinations of code points) apply rules
|
|
based on the locale. For more information about these exceptional
|
|
code points, please refer to the Unicode standard:
|
|
ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void Book_SetTitle(book_t* book, const char* title)
|
|
{
|
|
size_t converted_size;
|
|
int32_t errors;
|
|
size_t i;
|
|
|
|
if ((converted_size = utf8totitle(title, strlen(title), book->title, sizeof(book->title) - 1, UTF8_LOCALE_DEFAULT, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
memset(book->title, 0, sizeof(book->title));
|
|
|
|
return;
|
|
}
|
|
book->title[converted_size] = 0;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[in] locale Enables locale-specific behavior in the implementation. \ref locales "List of valid locales."
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_INVALID_LOCALE Invalid locale specified.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8tolower
|
|
\sa utf8toupper
|
|
\sa utf8casefold
|
|
*/
|
|
/* NOTE(review): `no_replacement` has no entry in the parameter list of the documentation block above, and the example there calls utf8totitle with six arguments. Confirm its meaning and document it. */
UTF8_API size_t utf8totitle(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors, int no_replacement);
|
|
|
|
/*!
|
|
\brief Remove case distinction from UTF-8 encoded text.
|
|
|
|
Case folding is the process of eliminating differences between code points
|
|
concerning case mapping. It is most commonly used for comparing strings in a
|
|
case-insensitive manner. Conversion is fully compliant with the Unicode 7.0
|
|
standard.
|
|
|
|
Although similar to lowercasing text, there are significant differences.
|
|
For one, case folding does _not_ take locale into account when converting.
|
|
In some cases, case folding can be up to 20% faster than lowercasing the
|
|
same text, but the result cannot be treated as correct lowercased text.
|
|
|
|
Only two locale-specific exceptions are made when case folding text.
|
|
In Turkish, U+0049 LATIN CAPITAL LETTER I maps to U+0131 LATIN SMALL LETTER
|
|
DOTLESS I and U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE maps to U+0069
|
|
LATIN SMALL LETTER I.
|
|
|
|
Although most code points can be case folded in-place, there are notable
|
|
exceptions. For example, U+0130 (LATIN CAPITAL LETTER I WITH DOT ABOVE) maps
|
|
to "U+0069 U+0307" (LATIN SMALL LETTER I and COMBINING DOT ABOVE) when
|
|
case folded. Therefore, it is advised to first determine the size
|
|
in bytes of the output by calling the function with a NULL output buffer.
|
|
|
|
Only a handful of scripts make a distinction between upper- and lowercase.
|
|
In addition to modern scripts, such as Latin, Greek, Armenian and Cyrillic,
|
|
a few historic or archaic scripts have case. The vast majority of scripts
|
|
do not have case distinctions.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
int32_t Command_ParseCommand(const char* argument)
|
|
{
|
|
char* buffer = NULL;
|
|
size_t buffer_size = 0;
|
|
int32_t errors;
|
|
int32_t result = 0;
|
|
|
|
if ((buffer_size = utf8casefold(argument, strlen(argument), NULL, 0, UTF8_LOCALE_DEFAULT, &errors)) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
result = -1;
|
|
|
|
goto cleanup;
|
|
}
|
|
|
|
buffer = (char*)calloc(buffer_size + 1, 1);
|
|
|
|
if (buffer == NULL ||
|
|
utf8casefold(argument, strlen(argument), buffer, buffer_size, UTF8_LOCALE_DEFAULT, &errors) == 0 ||
|
|
errors != UTF8_ERR_NONE)
|
|
{
|
|
result = -1;
|
|
|
|
goto cleanup;
|
|
}
|
|
|
|
if (!strncmp(buffer, "-username", strlen("-username")))
|
|
{
|
|
result = eCommand_Username;
|
|
}
|
|
else if (
|
|
!strncmp(buffer, "-password", strlen("-password")))
|
|
{
|
|
result = eCommand_Password;
|
|
}
|
|
else if (
|
|
!strncmp(buffer, "-message", strlen("-message")))
|
|
{
|
|
result = eCommand_Message;
|
|
}
|
|
|
|
cleanup:
|
|
if (buffer != NULL)
|
|
{
|
|
free(buffer);
|
|
buffer = NULL;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[in] locale Enables locale-specific behavior in the implementation. \ref locales "List of valid locales."
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_INVALID_LOCALE Invalid locale specified.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8tolower
|
|
\sa utf8toupper
|
|
\sa utf8totitle
|
|
*/
|
|
/* NOTE(review): `no_replacement` has no entry in the parameter list of the documentation block above, and the example there calls utf8casefold with six arguments. Confirm its meaning and document it. */
UTF8_API size_t utf8casefold(const char* input, size_t inputSize, char* target, size_t targetSize, size_t locale, int32_t* errors, int no_replacement);
|
|
|
|
/*!
|
|
\brief Check if a string is stable in the specified Unicode Normalization
|
|
Form.
|
|
|
|
This function can be used as a preprocessing step, before attempting to
|
|
normalize a string. Normalization is a very expensive process; it is often
|
|
cheaper to first determine if the string is unstable in the requested
|
|
normalization form.
|
|
|
|
The result of the check will be YES if the string is stable and MAYBE or NO
|
|
if it is unstable. If the result is MAYBE, the string does not necessarily
|
|
have to be normalized.
|
|
|
|
If the result is unstable, the offset parameter is set to the offset for the
|
|
first unstable code point. If the string is stable, the offset is equivalent
|
|
to the length of the string in bytes.
|
|
|
|
You must specify the desired Unicode Normalization Form by using a
|
|
combination of flags:
|
|
|
|
Unicode | Flags
|
|
---------------------------- | ---------------------------------------------------------
|
|
Normalization Form C (NFC) | #UTF8_NORMALIZE_COMPOSE
|
|
Normalization Form KC (NFKC) | #UTF8_NORMALIZE_COMPOSE + #UTF8_NORMALIZE_COMPATIBILITY
|
|
Normalization Form D (NFD) | #UTF8_NORMALIZE_DECOMPOSE
|
|
Normalization Form KD (NFKD) | #UTF8_NORMALIZE_DECOMPOSE + #UTF8_NORMALIZE_COMPATIBILITY
|
|
|
|
For more information, please review [Unicode Standard Annex #15 - Unicode
|
|
Normalization Forms](http://www.unicode.org/reports/tr15/).
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
uint8_t Text_InspectComposed(const char* text)
|
|
{
|
|
const char* src = text;
|
|
size_t src_size = strlen(text);
|
|
size_t offset;
|
|
size_t total_offset;
|
|
|
|
if (utf8isnormalized(src, src_size, UTF8_NORMALIZE_COMPOSE, &offset) == UTF8_NORMALIZATION_RESULT_YES)
|
|
{
|
|
printf("Clean!\n");
|
|
|
|
return 1;
|
|
}
|
|
|
|
total_offset = offset;
|
|
|
|
do
|
|
{
|
|
const char* next;
|
|
|
|
printf("Unstable at byte %lu\n", (unsigned long)total_offset);
|
|
|
|
next = utf8seek(src, src_size, text, 1, SEEK_CUR);
|
|
if (next == src)
|
|
{
|
|
break;
|
|
}
|
|
|
|
total_offset += offset;
|
|
|
|
src_size -= next - src;
|
|
src = next;
|
|
}
|
|
while (utf8isnormalized(src, src_size, UTF8_NORMALIZE_COMPOSE, &offset) != UTF8_NORMALIZATION_RESULT_YES);
|
|
|
|
return 0;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[in] flags Desired normalization form. Must be a combination of \ref normalization "normalization flags".
|
|
\param[out] offset Offset to first unstable code point or length of input in bytes if stable.
|
|
|
|
\retval #UTF8_NORMALIZATION_RESULT_YES Input is stable and does not have to be normalized.
|
|
\retval #UTF8_NORMALIZATION_RESULT_MAYBE Input is unstable, but normalization may be skipped.
|
|
\retval #UTF8_NORMALIZATION_RESULT_NO Input is unstable and must be normalized.
|
|
|
|
\sa utf8normalize
|
|
*/
|
|
UTF8_API uint8_t utf8isnormalized(const char* input, size_t inputSize, size_t flags, size_t* offset);
|
|
|
|
/*!
|
|
\brief Normalize a string to the specified Unicode Normalization Form.
|
|
|
|
The Unicode standard defines two standards for equivalence between
|
|
characters: canonical and compatibility equivalence. Canonically equivalent
|
|
characters and sequences represent the same abstract character and must be
|
|
rendered with the same appearance and behavior. Compatibility equivalent
|
|
characters have a weaker equivalence and may be rendered differently.
|
|
|
|
Unicode Normalization Forms are formally defined standards that can be used
|
|
to test whether any two strings of characters are equivalent to each other.
|
|
This equivalence may be canonical or compatibility.
|
|
|
|
The algorithm puts all combining marks into a specified order and uses the
|
|
rules for decomposition and composition to transform the string into one of
|
|
four Unicode Normalization Forms. A binary comparison can then be used to
|
|
determine equivalence.
|
|
|
|
These are the Unicode Normalization Forms:
|
|
|
|
Form | Description
|
|
---------------------------- | ---------------------------------------------
|
|
Normalization Form D (NFD) | Canonical decomposition
|
|
Normalization Form C (NFC) | Canonical decomposition, followed by canonical composition
|
|
Normalization Form KD (NFKD) | Compatibility decomposition
|
|
Normalization Form KC (NFKC) | Compatibility decomposition, followed by canonical composition
|
|
|
|
`utf8normalize` can be used to transform text into one of these forms. You
|
|
must specify the desired Unicode Normalization Form by using a combination
|
|
of flags:
|
|
|
|
Form | Flags
|
|
----------------------------- | ---------------------------------------------------------
|
|
Normalization Form D (NFD) | #UTF8_NORMALIZE_DECOMPOSE
|
|
Normalization Form C (NFC) | #UTF8_NORMALIZE_COMPOSE
|
|
Normalization Form KD (NFKD) | #UTF8_NORMALIZE_DECOMPOSE + #UTF8_NORMALIZE_COMPATIBILITY
|
|
Normalization Form KC (NFKC) | #UTF8_NORMALIZE_COMPOSE + #UTF8_NORMALIZE_COMPATIBILITY
|
|
|
|
For more information, please review [Unicode Standard Annex #15 - Unicode
|
|
Normalization Forms](http://www.unicode.org/reports/tr15/).
|
|
|
|
\note Unnormalized text is rare in the wild. As an example, *all* text
|
|
found on the Internet as HTML source code must be encoded as NFC, as
|
|
specified by the W3C.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
void Font_RenderTextNormalized(const char* input)
|
|
{
|
|
const char* src = NULL;
|
|
const char* src_start;
|
|
size_t src_size;
|
|
char* converted = NULL;
|
|
size_t converted_size = 0;
|
|
size_t input_size = strlen(input);
|
|
|
|
if (utf8isnormalized(input, input_size, UTF8_NORMALIZE_COMPOSE, NULL) != UTF8_NORMALIZATION_RESULT_YES)
|
|
{
|
|
int32_t errors;
|
|
|
|
converted_size = utf8normalize(input, input_size, NULL, 0, UTF8_NORMALIZE_COMPOSE, &errors);
|
|
if (converted_size > 0 &&
|
|
errors == UTF8_ERR_NONE)
|
|
{
|
|
converted = (char*)malloc(converted_size + 1);
|
|
utf8normalize(input, input_size, converted, converted_size, UTF8_NORMALIZE_COMPOSE, NULL);
|
|
converted[converted_size] = 0;
|
|
|
|
src = (const char*)converted;
|
|
src_size = converted_size;
|
|
}
|
|
}
|
|
|
|
if (src == NULL)
|
|
{
|
|
src = (const char*)input;
|
|
src_size = input_size;
|
|
}
|
|
|
|
src_start = src;
|
|
|
|
while (src_size > 0)
|
|
{
|
|
const char* next;
|
|
int32_t errors;
|
|
|
|
next = utf8seek(src, src_size, src_start, 1, SEEK_CUR);
|
|
if (next == src)
|
|
{
|
|
break;
|
|
}
|
|
|
|
unicode_t code_point;
|
|
utf8toutf32(src, (size_t)(next - src), &code_point, sizeof(unicode_t), &errors);
|
|
if (errors != UTF8_ERR_NONE)
|
|
{
|
|
break;
|
|
}
|
|
|
|
Font_RenderCodePoint(code_point);
|
|
|
|
src_size -= next - src;
|
|
src = next;
|
|
}
|
|
|
|
if (converted != NULL)
|
|
{
|
|
free(converted);
|
|
converted = NULL;
|
|
}
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[out] target Output buffer for the result, can be NULL.
|
|
\param[in] targetSize Size of the output buffer in bytes.
|
|
\param[in] flags Desired normalization form. Must be a combination of #UTF8_NORMALIZE_COMPOSE, #UTF8_NORMALIZE_DECOMPOSE and #UTF8_NORMALIZE_COMPATIBILITY.
|
|
\param[out] errors Output for errors.
|
|
|
|
\return Amount of bytes needed for storing output.
|
|
|
|
\retval #UTF8_ERR_NONE No errors.
|
|
\retval #UTF8_ERR_INVALID_FLAG Invalid combination of flags was specified.
|
|
\retval #UTF8_ERR_INVALID_DATA Failed to decode data.
|
|
\retval #UTF8_ERR_OVERLAPPING_PARAMETERS Input and output buffers overlap in memory.
|
|
\retval #UTF8_ERR_NOT_ENOUGH_SPACE Target buffer size is insufficient for result.
|
|
|
|
\sa utf8isnormalized
|
|
*/
|
|
UTF8_API size_t utf8normalize(const char* input, size_t inputSize, char* target, size_t targetSize, size_t flags, int32_t* errors);
|
|
|
|
/*!
|
|
\brief Check if the input string conforms to the category specified by the
|
|
flags.
|
|
|
|
This function can be used to check if the code points in a string are part
|
|
of a category. Valid flags are members of the
|
|
\ref category "list of categories". The category for a code point is
|
|
defined as part of the entry in UnicodeData.txt, the data file for the
|
|
Unicode code point database.
|
|
|
|
\note The function is _greedy_. This means it will try to match as many
|
|
code points with the matching category flags as possible and return the
|
|
offset in the input in bytes. If this is undesired behavior, use `utf8seek`
|
|
to seek in the input first before matching it with the category flags.
|
|
|
|
By default, the function will treat grapheme clusters as a single code
|
|
point. This means that the following string:
|
|
|
|
Code point | Canonical combining class | General category | Name
|
|
---------- | ------------------------- | --------------------- | ----------------------
|
|
U+0045 | 0 | Lu (Uppercase letter) | LATIN CAPITAL LETTER E
|
|
U+0300 | 230 | Mn (Non-spacing mark) | COMBINING GRAVE ACCENT
|
|
|
|
Will match with #UTF8_CATEGORY_LETTER_UPPERCASE in its entirety, because
|
|
the COMBINING GRAVE ACCENT is treated as part of the grapheme cluster. This
|
|
is useful when e.g. creating a text parser, because you do not have to
|
|
normalize the text first.
|
|
|
|
If this is undesired behavior, specify the
|
|
#UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER flag.
|
|
|
|
\warning In order to maintain backwards compatibility with POSIX functions
|
|
like `isdigit` and `isspace`, compatibility flags have been provided. Note,
|
|
however, that the result is only guaranteed to be correct for code points
|
|
in the Basic Latin range, between U+0000 and U+007F. Combining a
|
|
compatibility flag with a regular category flag will result in undefined
|
|
behavior.
|
|
|
|
Example:
|
|
|
|
\code{.c}
|
|
const char* Parser_NextIdentifier(char** output, size_t* outputSize, const char* input, size_t inputSize)
|
|
{
|
|
const char* src = input;
|
|
size_t src_size = inputSize;
|
|
size_t whitespace_size;
|
|
size_t identifier_size;
|
|
|
|
whitespace_size = utf8iscategory(src, src_size, UTF8_CATEGORY_SEPARATOR_SPACE);
|
|
if (whitespace_size == 0)
|
|
{
|
|
whitespace_size = utf8iscategory(src, src_size, UTF8_CATEGORY_ISSPACE);
|
|
}
|
|
|
|
if (whitespace_size > 0)
|
|
{
|
|
if (whitespace_size >= src_size)
|
|
{
|
|
return src + src_size;
|
|
}
|
|
|
|
src += whitespace_size;
|
|
src_size -= whitespace_size;
|
|
}
|
|
|
|
identifier_size = utf8iscategory(src, src_size, UTF8_CATEGORY_LETTER | UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_PUNCTUATION_DASH);
|
|
if (identifier_size == 0)
|
|
{
|
|
return src;
|
|
}
|
|
|
|
*output = (char*)malloc(identifier_size + 1);
|
|
memcpy(*output, src, identifier_size);
|
|
(*output)[identifier_size] = 0;
|
|
*outputSize = identifier_size;
|
|
|
|
if (identifier_size >= src_size)
|
|
{
|
|
return src + src_size;
|
|
}
|
|
|
|
return src + identifier_size;
|
|
}
|
|
\endcode
|
|
|
|
\param[in] input UTF-8 encoded string.
|
|
\param[in] inputSize Size of the input in bytes.
|
|
\param[in] flags Requested category. Must be a combination of \ref category "category flags" or a single compatibility flag.
|
|
|
|
\return Number of bytes in the input that conform to the specified category flags.
|
|
|
|
\sa utf8seek
|
|
*/
|
|
UTF8_API size_t utf8iscategory(const char* input, size_t inputSize, size_t flags);
|
|
|
|
/*!
|
|
\}
|
|
*/
|
|
|
|
#endif /* _UTF8REWIND_H_ */
|