Improve UTF-8 support in some natives (bug 6475) (#407)

* Compile as static library, update AMBuildScript and link to core

* Update VS project files to include the library

* Add UTF-8 Rewind library (v1.5.1) to third_party directory

* Update ACKNOWLEDGEMENTS.txt

* Move AMXX buffer in its own function

* Move constants from string.inc to string_const.inc and update project files

* Move stocks from string.inc to string_stocks.inc and update project files

* Improve UTF-8 support in containi() and update documentation

* Improve UTF-8 support in strcmp() and update documentation

* Improve UTF-8 support in strfind() and update documentation

It is worth noting that this native was not working properly with ignorecase set; it was so broken that no one even reported the issue.
This also adds a safety check so that the "pos" parameter cannot go below 0.

* Improve UTF-8 support in strncmp() and update documentation

* Improve UTF-8 support in equali() and update documentation

* Add an option to some UTF-8 Rewind functions for avoiding invalid data to be replaced

By default, it replaces any invalid byte or sequence of bytes with 0xFFFD (3 bytes). This can be problematic when the input buffer is not changed (from a plugin) and some natives need to calculate a position from the converted string. With such a replacement, the position is displaced because the final string is longer.

This compiles the library as C++, because I added some silly parameter with a default value, which is not supported by C.

* Improve UTF-8 support in replace_string/ex() and update documentation

* Add is_string_category() and update documentation

* Update a little testsuite plugin (and fix linux compilation)

* Add mb_strtolower/upper() and update documentation

* Add mb_ucfirst() and update documentation

* Add mb_strtotitle() and update documentation

* Improve UTF-8 support in get_players() and find_player() with name/case insensitive flags set

* Fix KliPPy's complaint
This commit is contained in:
Vincent Herbet 2017-08-05 10:32:16 +02:00 committed by GitHub
parent 07c3d49cfa
commit ab854ec035
34 changed files with 20166 additions and 532 deletions

View File

@ -17,6 +17,7 @@ class AMXXConfig(object):
self.versionlib = None
self.zlib = None
self.hashing = None
self.utf8rewind = None
self.csx_app = None
self.stdcxx_path = None
@ -164,6 +165,7 @@ class AMXXConfig(object):
cxx.includes += [os.path.join(builder.sourcePath, 'third_party')]
cxx.includes += [os.path.join(builder.sourcePath, 'third_party', 'hashing')]
cxx.includes += [os.path.join(builder.sourcePath, 'third_party', 'zlib')]
cxx.includes += [os.path.join(builder.sourcePath, 'third_party', 'utf8rewind')]
def configure_gcc(self, cxx):
cxx.cflags += [
@ -388,6 +390,10 @@ AMXX.hashing = builder.RunScript(
'third_party/hashing/AMBuilder'
)
AMXX.utf8rewind = builder.RunScript(
'third_party/utf8rewind/AMBuilder'
)
builder.RunBuildScripts(
[
'amxmodx/AMBuilder',

View File

@ -32,7 +32,7 @@ elif builder.target_platform == 'windows':
]
binary.compiler.linkflags += jit_objects
binary.compiler.linkflags += [AMXX.zlib.binary, AMXX.hashing.binary]
binary.compiler.linkflags += [AMXX.zlib.binary, AMXX.hashing.binary, AMXX.utf8rewind.binary]
if builder.target_platform == 'mac':
binary.compiler.postlink += [

View File

@ -2267,7 +2267,7 @@ static cell AMX_NATIVE_CALL get_players(AMX *amx, cell *params) /* 4 param */
{
if (flags & 64)
{
if (stristr(pPlayer->name.chars(), sptemp) == NULL)
if (utf8stristr(pPlayer->name.chars(), sptemp) == NULL)
continue;
}
else if (strstr(pPlayer->name.chars(), sptemp) == NULL)
@ -2301,7 +2301,7 @@ static cell AMX_NATIVE_CALL find_player(AMX *amx, cell *params) /* 1 param */
// Switch for the l flag
if (flags & 2048)
func = strcasecmp;
func = utf8strcasecmp;
else
func = strcmp;
@ -2327,7 +2327,7 @@ static cell AMX_NATIVE_CALL find_player(AMX *amx, cell *params) /* 1 param */
{
if (flags & 2048)
{
if (stristr(pPlayer->name.chars(), sptemp) == NULL)
if (utf8stristr(pPlayer->name.chars(), sptemp) == NULL)
continue;
}
else if (strstr(pPlayer->name.chars(), sptemp) == NULL)

View File

@ -113,6 +113,8 @@ extern AMX_NATIVE_INFO g_GameConfigNatives[];
#define SETCLIENTLISTENING (*g_engfuncs.pfnVoice_SetClientListening)
#define SETCLIENTMAXSPEED (*g_engfuncs.pfnSetClientMaxspeed)
#define MAX_BUFFER_LENGTH 16384
char* UTIL_SplitHudMessage(register const char *src);
int UTIL_ReadFlags(const char* c);
@ -130,11 +132,16 @@ void UTIL_TeamInfo(edict_t *pEntity, int playerIndex, const char *pszTeamName);
template <typename D> int UTIL_CheckValidChar(D *c);
template <typename D, typename S> unsigned int strncopy(D *dest, const S *src, size_t count);
unsigned int UTIL_GetUTF8CharBytes(const char *stream);
unsigned int UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, const char *replace, bool caseSensitive);
size_t UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, const char *replace, bool caseSensitive);
size_t UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, size_t searchLen, const char *replace, size_t replaceLen, bool caseSensitive);
char *UTIL_ReplaceEx(char *subject, size_t maxLen, const char *search, size_t searchLen, const char *replace, size_t replaceLen, bool caseSensitive);
void UTIL_TrimLeft(char *buffer);
void UTIL_TrimRight(char *buffer);
char* utf8stristr(const char *string1, const char *string2);
int utf8strncasecmp(const char *string1, const char *string2, size_t n);
int utf8strcasecmp(const char *string1, const char *string2);
#define GET_PLAYER_POINTER(e) (&g_players[ENTINDEX(e)])
//#define GET_PLAYER_POINTER(e) (&g_players[(((int)e-g_edict_point)/sizeof(edict_t))])
#define GET_PLAYER_POINTER_I(i) (&g_players[i])

View File

@ -60,8 +60,8 @@
</Midl>
<ClCompile>
<Optimization>Disabled</Optimization>
<AdditionalIncludeDirectories>..\;..\..\public;..\..\public\memtools;..\..\third_party;..\..\third_party\zlib;..\..\third_party\hashing;..\..\public\sdk;..\..\public\amtl;..\..\third_party;..\..\third_party\hashing;$(METAMOD)\metamod;$(HLSDK)\common;$(HLSDK)\engine;$(HLSDK)\dlls;$(HLSDK)\pm_shared;$(HLSDK)\public;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_NONSTDC_NO_DEPRECATE;_DEBUG;_WINDOWS;_USRDLL;amxmodx_EXPORTS;PAWN_CELL_SIZE=32;ASM32;JIT;_CRT_SECURE_NO_DEPRECATE;HAVE_STDINT_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>..\;..\..\public;..\..\public\memtools;..\..\third_party;..\..\third_party\zlib;..\..\third_party\hashing;..\..\third_party\utf8rewind;..\..\public\sdk;..\..\public\amtl;..\..\third_party;..\..\third_party\hashing;$(METAMOD)\metamod;$(HLSDK)\common;$(HLSDK)\engine;$(HLSDK)\dlls;$(HLSDK)\pm_shared;$(HLSDK)\public;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_NONSTDC_NO_DEPRECATE;UTF8PROC_EXPORTS;_DEBUG;_WINDOWS;_USRDLL;amxmodx_EXPORTS;PAWN_CELL_SIZE=32;ASM32;JIT;_CRT_SECURE_NO_DEPRECATE;HAVE_STDINT_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
<StructMemberAlignment>4Bytes</StructMemberAlignment>
@ -110,8 +110,8 @@
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<OmitFramePointers>true</OmitFramePointers>
<AdditionalIncludeDirectories>..\;..\..\public;..\..\public\memtools;..\..\third_party;..\..\third_party\zlib;..\..\third_party\hashing;..\..\third_party;..\..\third_party\hashing;..\..\public\sdk;..\..\public\amtl;..\..\third_party;..\..\third_party\hashing;$(METAMOD)\metamod;$(HLSDK)\common;$(HLSDK)\engine;$(HLSDK)\dlls;$(HLSDK)\pm_shared;$(HLSDK)\public;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_NONSTDC_NO_DEPRECATE;NDEBUG;_WINDOWS;_USRDLL;amxmodx_EXPORTS;JIT;ASM32;PAWN_CELL_SIZE=32;HAVE_STDINT_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>..\;..\..\public;..\..\public\memtools;..\..\third_party;..\..\third_party\zlib;..\..\third_party\hashing;..\..\third_party\utf8rewind;..\..\third_party;..\..\third_party\hashing;..\..\public\sdk;..\..\public\amtl;..\..\third_party;..\..\third_party\hashing;$(METAMOD)\metamod;$(HLSDK)\common;$(HLSDK)\engine;$(HLSDK)\dlls;$(HLSDK)\pm_shared;$(HLSDK)\public;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_NONSTDC_NO_DEPRECATE;UTF8PROC_EXPORTS;NDEBUG;_WINDOWS;_USRDLL;amxmodx_EXPORTS;JIT;ASM32;PAWN_CELL_SIZE=32;HAVE_STDINT_H;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<IgnoreStandardIncludePath>false</IgnoreStandardIncludePath>
<StringPooling>true</StringPooling>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
@ -180,6 +180,42 @@
<ObjectFileName Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">$(IntDir)hashing\</ObjectFileName>
<ObjectFileName Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">$(IntDir)hashing\</ObjectFileName>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\casemapping.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\codepoint.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\composition.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\database.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\decomposition.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\seeking.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\streaming.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\unicodedatabase.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\utf8rewind.c">
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">CompileAsCpp</CompileAs>
<CompileAs Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">CompileAsCpp</CompileAs>
</ClCompile>
<ClCompile Include="..\..\third_party\zlib\adler32.c">
<ObjectFileName Condition="'$(Configuration)|$(Platform)'=='JITDebug|Win32'">$(IntDir)zlib\</ObjectFileName>
<ObjectFileName Condition="'$(Configuration)|$(Platform)'=='JITRelease|Win32'">$(IntDir)zlib\</ObjectFileName>
@ -326,6 +362,16 @@
<ClInclude Include="..\..\third_party\hashing\hashers\sha256.h" />
<ClInclude Include="..\..\third_party\hashing\hashers\sha3.h" />
<ClInclude Include="..\..\third_party\hashing\hashing.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\base.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\casemapping.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\codepoint.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\composition.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\database.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\decomposition.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\seeking.h" />
<ClInclude Include="..\..\third_party\utf8rewind\internal\streaming.h" />
<ClInclude Include="..\..\third_party\utf8rewind\unicodedatabase.h" />
<ClInclude Include="..\..\third_party\utf8rewind\utf8rewind.h" />
<ClInclude Include="..\..\third_party\zlib\crc32.h" />
<ClInclude Include="..\..\third_party\zlib\deflate.h" />
<ClInclude Include="..\..\third_party\zlib\gzguts.h" />
@ -390,6 +436,8 @@
<None Include="..\..\plugins\include\cvars.inc" />
<None Include="..\..\plugins\include\datapack.inc" />
<None Include="..\..\plugins\include\gameconfig.inc" />
<None Include="..\..\plugins\include\string_const.inc" />
<None Include="..\..\plugins\include\string_stocks.inc" />
<None Include="..\..\plugins\include\textparse_ini.inc" />
<None Include="..\..\plugins\include\textparse_smc.inc" />
<None Include="..\amxdefn.asm" />

View File

@ -54,6 +54,12 @@
</Filter>
<Filter Include="ReSDK\engine">
<UniqueIdentifier>{04fab577-6f56-40d0-8f69-7ce1b8bf3bb9}</UniqueIdentifier>
</Filter>
<Filter Include="Third Party\UTF8Rewind">
<UniqueIdentifier>{270f3524-564f-4154-bb35-242a6faac09e}</UniqueIdentifier>
</Filter>
<Filter Include="Third Party\UTF8Rewind\internal">
<UniqueIdentifier>{295b670a-1aa3-4b80-bbf6-4ba422672274}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
@ -303,6 +309,33 @@
<ClCompile Include="..\..\public\resdk\mod_rehlds_api.cpp">
<Filter>ReSDK</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\unicodedatabase.c">
<Filter>Third Party\UTF8Rewind</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\utf8rewind.c">
<Filter>Third Party\UTF8Rewind</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\casemapping.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\codepoint.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\composition.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\database.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\decomposition.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\seeking.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
<ClCompile Include="..\..\third_party\utf8rewind\internal\streaming.c">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\amx.h">
@ -506,8 +539,35 @@
<ClInclude Include="..\CoreConfig.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\CFrameAction.h">
<Filter>Header Files</Filter>
<ClInclude Include="..\..\third_party\utf8rewind\unicodedatabase.h">
<Filter>Third Party\UTF8Rewind</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\base.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\casemapping.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\codepoint.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\composition.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\database.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\decomposition.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\seeking.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\internal\streaming.h">
<Filter>Third Party\UTF8Rewind\internal</Filter>
</ClInclude>
<ClInclude Include="..\..\third_party\utf8rewind\utf8rewind.h">
<Filter>Third Party\UTF8Rewind</Filter>
</ClInclude>
<ClInclude Include="..\..\public\resdk\common\hookchains.h">
<Filter>ReSDK\common</Filter>
@ -521,6 +581,9 @@
<ClInclude Include="..\..\public\resdk\mod_rehlds_api.h">
<Filter>ReSDK</Filter>
</ClInclude>
<ClInclude Include="..\CFrameAction.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="..\version.rc">
@ -624,6 +687,12 @@
<None Include="..\..\plugins\include\cstrike_const.inc">
<Filter>Pawn Includes</Filter>
</None>
<None Include="..\..\plugins\include\string_const.inc">
<Filter>Pawn Includes</Filter>
</None>
<None Include="..\..\plugins\include\string_stocks.inc">
<Filter>Pawn Includes</Filter>
</None>
</ItemGroup>
<ItemGroup>
<Object Include="..\Jit\helpers-x86.obj">

View File

@ -11,6 +11,7 @@
#include "amxmodx.h"
#include "format.h"
#include "binlog.h"
#include <utf8rewind.h>
const char* stristr(const char* str, const char* substr)
{
@ -169,11 +170,17 @@ extern "C" size_t get_amxstring_r(AMX *amx, cell amx_addr, char *destination, in
return dest - start;
}
char *get_amxbuffer(int id)
{
static char buffer[4][MAX_BUFFER_LENGTH];
return buffer[id];
}
// Copies the AMX string stored at amx_addr into scratch slot `id`.
//
// @param amx        AMX instance the address belongs to.
// @param amx_addr   Address of the source string, passed through to get_amxstring_r().
// @param id         Scratch slot handed to get_amxbuffer().
// @param len        Receives the value returned by get_amxstring_r().
// @return           The scratch slot obtained from get_amxbuffer(id).
char *get_amxstring(AMX *amx, cell amx_addr, int id, int& len)
{
	auto buffer = get_amxbuffer(id);

	// One byte of the slot is held back from the size given to the copy routine.
	len = get_amxstring_r(amx, amx_addr, buffer, MAX_BUFFER_LENGTH - 1);

	return buffer;
}
char *get_amxstring_null(AMX *amx, cell amx_addr, int id, int& len)
@ -302,58 +309,68 @@ static cell AMX_NATIVE_CALL replace(AMX *amx, cell *params) /* 4 param */
return 0;
}
// native replace_string(text[], maxlength, const search[], const replace[], bool:caseSensitive = true);
//
// Replaces occurrences of `search` in `text` through UTIL_ReplaceAll() and writes
// the result back to the plugin's `text` array.
//
// Returns the value reported by UTIL_ReplaceAll(), or -1 after logging a native
// error when `search` is an empty string.
static cell AMX_NATIVE_CALL replace_string(AMX *amx, cell *params)
{
	enum args { arg_count, arg_text, arg_maxlength, arg_search, arg_replace, arg_casesensitive };

	auto textLength    = 0;
	auto searchLength  = 0;
	auto replaceLength = 0;

	// Each argument gets its own scratch slot so the three strings do not overwrite each other.
	auto text    = get_amxstring(amx, params[arg_text]   , 0, textLength);
	auto search  = get_amxstring(amx, params[arg_search] , 1, searchLength);
	auto replace = get_amxstring(amx, params[arg_replace], 2, replaceLength);

	auto textMaxLength = params[arg_maxlength];
	auto caseSensitive = params[arg_casesensitive] != 0;

	if (!*search)
	{
		LogError(amx, AMX_ERR_NATIVE, "Cannot replace searches of empty strings.");
		return -1;
	}

	auto count = UTIL_ReplaceAll(text, textMaxLength + 1, search, searchLength, replace, replaceLength, caseSensitive); // + EOS

	set_amxstring(amx, params[arg_text], text, textMaxLength);

	return count;
}
// native replace_stringex(text[], maxlength, const search[], const replace[], searchLen = -1, replaceLen = -1, bool:caseSensitive = true);
//
// Performs a single replacement through UTIL_ReplaceEx() and writes the result
// back to the plugin's `text` array.
//
// searchLen / replaceLen override the measured string lengths unless they are -1.
//
// Returns the offset of the pointer returned by UTIL_ReplaceEx() relative to the
// start of the text, or -1 when UTIL_ReplaceEx() returns null or when the
// effective search length is not positive (a native error is logged in that case).
static cell AMX_NATIVE_CALL replace_stringex(AMX *amx, cell *params)
{
	enum args { arg_count, arg_text, arg_maxlength, arg_search, arg_replace, arg_searchlen, arg_replacelen, arg_casesensitive };

	auto textLength    = 0;
	auto searchLength  = 0;
	auto replaceLength = 0;

	// Each argument gets its own scratch slot so the three strings do not overwrite each other.
	auto text    = get_amxstring(amx, params[arg_text]   , 0, textLength);
	auto search  = get_amxstring(amx, params[arg_search] , 1, searchLength);
	auto replace = get_amxstring(amx, params[arg_replace], 2, replaceLength);

	auto textMaxLength = params[arg_maxlength];
	auto caseSensitive = params[arg_casesensitive] != 0;

	// -1 means "use the length measured while fetching the string".
	if (params[arg_searchlen]  != -1) { searchLength  = params[arg_searchlen];  }
	if (params[arg_replacelen] != -1) { replaceLength = params[arg_replacelen]; }

	if (searchLength <= 0)
	{
		LogError(amx, AMX_ERR_NATIVE, "Cannot replace searches of empty strings.");
		return -1;
	}

	auto ptr = UTIL_ReplaceEx(text, textMaxLength + 1, search, searchLength, replace, replaceLength, caseSensitive); // + EOS

	if (!ptr)
	{
		return -1;
	}

	set_amxstring(amx, params[arg_text], text, textMaxLength);

	return ptr - text;
}
@ -382,27 +399,36 @@ static cell AMX_NATIVE_CALL contain(AMX *amx, cell *params) /* 2 param */
return -1;
}
// native containi(const source[], const string[]);
//
// Case-insensitive substring search: both inputs are case-folded with
// utf8casefold() and then searched with strstr().
//
// Returns the byte offset of the first match inside the case-folded copy of
// `source`, or -1 when either input is empty or nothing matches.
// NOTE(review): the offset is measured in the folded copy; presumably it can
// differ from the offset in the original text when folding changes a
// character's byte length - confirm against callers.
static cell AMX_NATIVE_CALL containi(AMX *amx, cell *params)
{
	enum args { arg_count, arg_source, arg_search };

	auto sourceLength = 0;
	auto searchLength = 0;

	auto source = get_amxstring(amx, params[arg_source], 0, sourceLength);
	auto search = get_amxstring(amx, params[arg_search], 1, searchLength);

	if (sourceLength && searchLength)
	{
		// Slots 0 and 1 hold the raw inputs, so the folded copies go to slots 2 and 3.
		auto sourceFolded = get_amxbuffer(2);
		auto searchFolded = get_amxbuffer(3);

		sourceLength = utf8casefold(source, sourceLength, sourceFolded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
		searchLength = utf8casefold(search, searchLength, searchFolded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

		// The code terminates the folded output itself, at the length the call returned.
		sourceFolded[sourceLength] = '\0';
		searchFolded[searchLength] = '\0';

		auto result = strstr(sourceFolded, searchFolded);

		if (result)
		{
			return result - sourceFolded;
		}
	}

	return -1;
}
@ -609,30 +635,34 @@ static cell AMX_NATIVE_CALL equal(AMX *amx, cell *params) /* 3 param */
return ret ? 0 : 1;
}
// native equali(const a[], const b[], c = 0);
//
// Case-insensitive equality test: both inputs are case-folded with
// utf8casefold() and then compared byte-wise.
//
// When `c` is greater than 0 only the first `c` bytes of the folded strings are
// compared (strncmp); otherwise the whole strings are compared (strcmp).
//
// Returns 1 when the compared ranges are equal, 0 otherwise.
static cell AMX_NATIVE_CALL equali(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string1, arg_string2, arg_numbytes };

	auto string1Length = 0;
	auto string2Length = 0;

	auto string1 = get_amxstring(amx, params[arg_string1], 0, string1Length);
	auto string2 = get_amxstring(amx, params[arg_string2], 1, string2Length);

	// Slots 0 and 1 hold the raw inputs, so the folded copies go to slots 2 and 3.
	auto string1Folded = get_amxbuffer(2);
	auto string2Folded = get_amxbuffer(3);

	string1Length = utf8casefold(string1, string1Length, string1Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
	string2Length = utf8casefold(string2, string2Length, string2Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	// Each folded buffer must be terminated at ITS OWN length. Using the other
	// string's length truncates the longer string and leaves stale bytes from a
	// previous call readable in the shorter one.
	string1Folded[string1Length] = '\0';
	string2Folded[string2Length] = '\0';

	if (params[arg_numbytes] > 0)
	{
		return static_cast<cell>(strncmp(string1Folded, string2Folded, params[arg_numbytes]) == 0);
	}
	else
	{
		return static_cast<cell>(strcmp(string1Folded, string2Folded) == 0);
	}
}
static cell g_cpbuf[4096];
@ -730,6 +760,29 @@ static cell AMX_NATIVE_CALL strtolower(AMX *amx, cell *params) /* 1 param */
return cptr - begin;
}
// native mb_strtolower(source[], maxlength = 0);
//
// Converts the plugin string to lowercase with utf8tolower() and writes the
// converted text back into the same array. Returns whatever
// set_amxstring_utf8() reports for that write.
static cell AMX_NATIVE_CALL mb_strtolower(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string, arg_maxlength };

	auto inputBytes = 0;
	auto input = get_amxstring(amx, params[arg_string], 0, inputBytes);

	// A non-positive maxlength means "limit the write to the input's byte length".
	auto writeLimit = (params[arg_maxlength] > 0) ? params[arg_maxlength] : inputBytes;

	// The input lives in slot 0, so the conversion goes to slot 1.
	auto lowered = get_amxbuffer(1);
	auto loweredBytes = utf8tolower(input, inputBytes, lowered, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	lowered[loweredBytes] = '\0';

	return set_amxstring_utf8(amx, params[arg_string], lowered, loweredBytes, writeLimit);
}
static cell AMX_NATIVE_CALL strtoupper(AMX *amx, cell *params) /* 1 param */
{
cell *cptr = get_amxaddr(amx, params[1]);
@ -744,6 +797,29 @@ static cell AMX_NATIVE_CALL strtoupper(AMX *amx, cell *params) /* 1 param */
return cptr - begin;
}
// native mb_strtoupper(source[], maxlength = 0);
//
// Converts the plugin string to uppercase with utf8toupper() and writes the
// converted text back into the same array. Returns whatever
// set_amxstring_utf8() reports for that write.
static cell AMX_NATIVE_CALL mb_strtoupper(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string, arg_maxlength };

	auto inputBytes = 0;
	auto input = get_amxstring(amx, params[arg_string], 0, inputBytes);

	// A non-positive maxlength means "limit the write to the input's byte length".
	auto writeLimit = (params[arg_maxlength] > 0) ? params[arg_maxlength] : inputBytes;

	// The input lives in slot 0, so the conversion goes to slot 1.
	auto raised = get_amxbuffer(1);
	auto raisedBytes = utf8toupper(input, inputBytes, raised, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	raised[raisedBytes] = '\0';

	return set_amxstring_utf8(amx, params[arg_string], raised, raisedBytes, writeLimit);
}
int fo_numargs(AMX *amx)
{
unsigned char *data = amx->base + (int)((AMX_HEADER *)amx->base)->dat;
@ -1241,6 +1317,46 @@ static cell AMX_NATIVE_CALL amx_ucfirst(AMX *amx, cell *params)
return 1;
}
// native mb_ucfirst(string[], maxlength = 0);
//
// Uppercases the first UTF-8 character of the plugin string in place (inside
// scratch slot 0) and writes the result back into the same array.
//
// A non-positive maxlength limits the write to the input's byte length.
// Returns whatever set_amxstring_utf8() reports for the write.
static cell AMX_NATIVE_CALL mb_ucfirst(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string, arg_maxlength };

	auto sourceLength = 0;
	auto source = get_amxstring(amx, params[arg_string], 0, sourceLength);

	auto outputMaxLength = params[arg_maxlength];

	if (outputMaxLength <= 0)
	{
		outputMaxLength = sourceLength;
	}

	// Retrieves the first character length in bytes.
	auto firstChLength = static_cast<int>(utf8seek(source, sourceLength, source, 1, SEEK_CUR) - source);

	if (firstChLength > 0)
	{
		char output[8] = {};

		// The target size has to describe `output` itself; advertising the size of
		// the large scratch slots would let the conversion write past this array.
		auto outputLength = static_cast<int>(utf8toupper(source, firstChLength, output, sizeof(output) - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE));

		// Nothing converted: leave the string untouched instead of dropping its first character.
		if (outputLength > 0)
		{
			// The converted character is either larger or smaller in bytes.
			if (firstChLength != outputLength)
			{
				// Calculates the new string length, bounded by both the caller's limit
				// and the scratch slot that `source` points into.
				sourceLength = ke::Min<int>(sourceLength + (outputLength - firstChLength), outputMaxLength);
				sourceLength = ke::Min<int>(sourceLength, MAX_BUFFER_LENGTH - 1);

				// Bytes following the first character that still fit. This can be <= 0
				// when the limit is smaller than the converted character; a negative
				// count must never reach memmove, where it would become a huge size.
				auto tailLength = sourceLength - outputLength;

				if (tailLength > 0)
				{
					// Move data forward or backward minus the first character (whatever its size).
					memmove(source + outputLength, source + firstChLength, tailLength * sizeof(char));
				}
			}

			// Copy the new character at the start of the string.
			memcpy(source, output, outputLength);
		}
	}

	return set_amxstring_utf8(amx, params[arg_string], source, sourceLength, outputMaxLength);
}
static cell AMX_NATIVE_CALL amx_strlen(AMX *amx, cell *params)
{
int len;
@ -1292,62 +1408,105 @@ static cell AMX_NATIVE_CALL n_strcat(AMX *amx, cell *params)
return params[3] - num;
}
// native strcmp(const string1[], const string2[], bool:ignorecase = false);
//
// Byte-wise comparison of two plugin strings. With ignorecase set, both strings
// are case-folded with utf8casefold() first and the folded copies are compared.
//
// Returns the result of strcmp() on the (possibly folded) strings.
static cell AMX_NATIVE_CALL n_strcmp(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string1, arg_string2, arg_ignorecase };

	auto string1Length = 0;
	auto string2Length = 0;

	auto string1 = get_amxstring(amx, params[arg_string1], 0, string1Length);
	auto string2 = get_amxstring(amx, params[arg_string2], 1, string2Length);

	if (params[arg_ignorecase] != 0)
	{
		// Slots 0 and 1 hold the raw inputs, so the folded copies go to slots 2 and 3.
		auto string1Folded = get_amxbuffer(2);
		auto string2Folded = get_amxbuffer(3);

		string1Length = utf8casefold(string1, string1Length, string1Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
		string2Length = utf8casefold(string2, string2Length, string2Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

		// Each folded buffer must be terminated at ITS OWN length. Using the other
		// string's length truncates the longer string and leaves stale bytes from a
		// previous call readable in the shorter one.
		string1Folded[string1Length] = '\0';
		string2Folded[string2Length] = '\0';

		string1 = string1Folded;
		string2 = string2Folded;
	}

	return strcmp(string1, string2);
}
// native strncmp(const string1[], const string2[], num, bool:ignorecase = false);
//
// Byte-wise comparison of at most `num` bytes of two plugin strings. With
// ignorecase set, both strings are case-folded with utf8casefold() first and the
// folded copies are compared.
//
// Returns the result of strncmp() on the (possibly folded) strings.
static cell AMX_NATIVE_CALL n_strncmp(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string1, arg_string2, arg_numbytes, arg_ignorecase };

	auto string1Length = 0;
	auto string2Length = 0;

	auto string1 = get_amxstring(amx, params[arg_string1], 0, string1Length);
	auto string2 = get_amxstring(amx, params[arg_string2], 1, string2Length);

	if (params[arg_ignorecase] != 0)
	{
		// Slots 0 and 1 hold the raw inputs, so the folded copies go to slots 2 and 3.
		auto string1Folded = get_amxbuffer(2);
		auto string2Folded = get_amxbuffer(3);

		string1Length = utf8casefold(string1, string1Length, string1Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
		string2Length = utf8casefold(string2, string2Length, string2Folded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

		// Each folded buffer must be terminated at ITS OWN length. Using the other
		// string's length truncates the longer string and leaves stale bytes from a
		// previous call readable in the shorter one.
		string1Folded[string1Length] = '\0';
		string2Folded[string2Length] = '\0';

		string1 = string1Folded;
		string2 = string2Folded;
	}

	return strncmp(string1, string2, params[arg_numbytes]);
}
// native strfind(const string[], const sub[], bool:ignorecase = false, pos = 0);
//
// Searches for `sub` inside `string`, starting at byte offset `pos`. With
// ignorecase set, both strings are case-folded with utf8casefold() first and
// the search runs on the folded copies.
//
// Returns the byte offset of the first match, measured from the start of the
// string that was actually searched (the folded copy when ignorecase is set),
// or -1 when `pos` is negative, `pos` is past the end, or nothing matches.
static cell AMX_NATIVE_CALL n_strfind(AMX *amx, cell *params)
{
	enum args { arg_count, arg_source, arg_search, arg_ignorecase, arg_startpos };

	auto sourceLength = 0;
	auto searchLength = 0;

	auto source = get_amxstring(amx, params[arg_source], 0, sourceLength);
	auto search = get_amxstring(amx, params[arg_search], 1, searchLength);

	if (params[arg_ignorecase] != 0)
	{
		// Slots 0 and 1 hold the raw inputs, so the folded copies go to slots 2 and 3.
		auto sourceFolded = get_amxbuffer(2);
		auto searchFolded = get_amxbuffer(3);

		sourceLength = utf8casefold(source, sourceLength, sourceFolded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
		searchLength = utf8casefold(search, searchLength, searchFolded, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

		sourceFolded[sourceLength] = '\0';
		searchFolded[searchLength] = '\0';

		source = sourceFolded;
		search = searchFolded;
	}

	auto position = params[arg_startpos];

	// Reject start offsets outside [0, sourceLength] before doing pointer arithmetic.
	if (position < 0 || position > sourceLength)
	{
		return -1;
	}

	auto find = strstr(source + position, search);

	if (!find)
	{
		return -1;
	}

	return (find - source);
}
static cell AMX_NATIVE_CALL vformat(AMX *amx, cell *params)
@ -1424,6 +1583,60 @@ static cell AMX_NATIVE_CALL fmt(AMX *amx, cell *params)
return 1;
};
// native mb_strtotitle(source[], maxlength = 0);
//
// Converts the plugin string to title case with utf8totitle() and writes the
// converted text back into the same array. Returns whatever
// set_amxstring_utf8() reports for that write.
static cell AMX_NATIVE_CALL mb_strtotitle(AMX *amx, cell *params)
{
	enum args { arg_count, arg_string, arg_maxlength };

	auto inputBytes = 0;
	auto input = get_amxstring(amx, params[arg_string], 0, inputBytes);

	// A non-positive maxlength means "limit the write to the input's byte length".
	auto writeLimit = (params[arg_maxlength] > 0) ? params[arg_maxlength] : inputBytes;

	// The input lives in slot 0, so the conversion goes to slot 1.
	auto titled = get_amxbuffer(1);
	auto titledBytes = utf8totitle(input, inputBytes, titled, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	titled[titledBytes] = '\0';

	return set_amxstring_utf8(amx, params[arg_string], titled, titledBytes, writeLimit);
}
// native bool:is_string_category(const input[], input_size, flags, &output_size = 0);
//
// Checks how many leading bytes of `input` belong to the category described by
// `flags`, stores that count in `output_size` and returns true only when the
// whole requested range matched.
static cell AMX_NATIVE_CALL is_string_category(AMX *amx, cell *params)
{
	enum args { arg_count, arg_input, arg_inputsize, arg_flags, arg_outputsize };

	auto textLength = 0;
	auto text = get_amxstring(amx, params[arg_input], 0, textLength);

	// Never look past the end of the actual string.
	auto bytesToCheck = ke::Min(params[arg_inputsize], textLength);
	auto matchedBytes = get_amxaddr(amx, params[arg_outputsize]);

	// A size of 1 (or less) requests a single character, however many bytes it takes.
	if (bytesToCheck <= 1)
	{
		// Byte length of the first character, measured by seeking one code point forward.
		bytesToCheck = utf8seek(text, textLength, text, 1, SEEK_CUR) - text;

		// The character does not fit in the available input: treat it as truncated.
		if (bytesToCheck > textLength)
		{
			*matchedBytes = 0;
			return FALSE;
		}
	}

	// Number of bytes that conform to the requested category flags.
	*matchedBytes = utf8iscategory(text, bytesToCheck, params[arg_flags]);

	// Success means the check consumed the entire requested range.
	return static_cast<cell>(*matchedBytes == bytesToCheck);
}
AMX_NATIVE_INFO string_Natives[] =
{
@ -1445,7 +1658,12 @@ AMX_NATIVE_INFO string_Natives[] =
{"is_char_upper", is_char_upper},
{"is_char_lower", is_char_lower},
{"is_char_mb", is_char_mb},
{"is_string_category", is_string_category },
{"get_char_bytes", get_char_bytes},
{"mb_strtotitle", mb_strtotitle},
{"mb_strtolower", mb_strtolower},
{"mb_strtoupper", mb_strtoupper},
{"mb_ucfirst", mb_ucfirst},
{"num_to_str", numtostr},
{"numtostr", numtostr},
{"parse", parse},

View File

@ -9,6 +9,7 @@
#include <time.h>
#include "amxmodx.h"
#include <utf8rewind.h>
int UTIL_ReadFlags(const char* c)
{
@ -454,11 +455,38 @@ int UTIL_CheckValidChar(D *c)
return 0;
}
unsigned int UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, const char *replace, bool caseSensitive)
{
size_t searchLen = strlen(search);
size_t replaceLen = strlen(replace);
static char OutputBuffer1[MAX_BUFFER_LENGTH];
static char OutputBuffer2[MAX_BUFFER_LENGTH];
// Case-insensitive substring search for UTF-8 strings.
//
// Both inputs are case-folded into the shared static buffers, then searched
// with strstr(). The returned pointer (or null when there is no match) points
// INTO OutputBuffer1, i.e. into the folded copy of string1, not into string1
// itself, and is overwritten by the next call that reuses these buffers.
char* utf8stristr(const char *string1, const char *string2)
{
	auto foldedHaystackLength = utf8casefold(string1, strlen(string1), OutputBuffer1, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
	auto foldedNeedleLength = utf8casefold(string2, strlen(string2), OutputBuffer2, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	// Terminate both folded copies before handing them to strstr().
	OutputBuffer1[foldedHaystackLength] = '\0';
	OutputBuffer2[foldedNeedleLength] = '\0';

	char *match = strstr(OutputBuffer1, OutputBuffer2);

	return match;
}
// Case-insensitive comparison for UTF-8 strings.
//
// Both inputs are case-folded into the shared static buffers and compared
// byte-wise. `n` limits the comparison to the first n bytes of the folded
// strings; n == 0 compares the folded strings in full.
// Returns the strncmp()/strcmp() result.
int utf8strncasecmp(const char *string1, const char *string2, size_t n)
{
	auto foldedLength1 = utf8casefold(string1, strlen(string1), OutputBuffer1, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);
	auto foldedLength2 = utf8casefold(string2, strlen(string2), OutputBuffer2, MAX_BUFFER_LENGTH - 1, UTF8_LOCALE_DEFAULT, nullptr, TRUE);

	OutputBuffer1[foldedLength1] = '\0';
	OutputBuffer2[foldedLength2] = '\0';

	// Zero means "no limit": compare the whole folded strings.
	if (n == 0)
	{
		return strcmp(OutputBuffer1, OutputBuffer2);
	}

	return strncmp(OutputBuffer1, OutputBuffer2, n);
}
// Case-insensitive comparison of two whole UTF-8 strings.
// Delegates to utf8strncasecmp() with a length of 0, which that function
// treats as "compare everything".
int utf8strcasecmp(const char *string1, const char *string2)
{
	const size_t compareEverything = 0;

	return utf8strncasecmp(string1, string2, compareEverything);
}
size_t UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, size_t searchLen, const char *replace, size_t replaceLen, bool caseSensitive)
{
char *newptr, *ptr = subject;
unsigned int total = 0;
while ((newptr = UTIL_ReplaceEx(ptr, maxlength, search, searchLen, replace, replaceLen, caseSensitive)) != NULL)
@ -476,6 +504,11 @@ unsigned int UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search
return total;
}
// Convenience overload: measures `search` and `replace` with strlen() and
// forwards to the overload that takes explicit lengths.
size_t UTIL_ReplaceAll(char *subject, size_t maxlength, const char *search, const char *replace, bool caseSensitive)
{
	const size_t searchLen = strlen(search);
	const size_t replaceLen = strlen(replace);

	return UTIL_ReplaceAll(subject, maxlength, search, searchLen, replace, replaceLen, caseSensitive);
}
template unsigned int strncopy<char, char>(char *, const char *, size_t);
template unsigned int strncopy<char, cell>(char *, const cell *, size_t);
template unsigned int strncopy<cell, char>(cell *, const char *, size_t);
@ -534,7 +567,7 @@ char *UTIL_ReplaceEx(char *subject, size_t maxLen, const char *search, size_t se
/* If the search matches and the replace length is 0,
* we can just terminate the string and be done.
*/
if ((caseSensitive ? strcmp(subject, search) : strcasecmp(subject, search)) == 0 && replaceLen == 0)
if ((caseSensitive ? strcmp(subject, search) : utf8strcasecmp(subject, search)) == 0 && replaceLen == 0)
{
*subject = '\0';
return subject;
@ -551,7 +584,7 @@ char *UTIL_ReplaceEx(char *subject, size_t maxLen, const char *search, size_t se
while (*ptr != '\0' && (browsed <= textLen - searchLen))
{
/* See if we get a comparison */
if ((caseSensitive ? strncmp(ptr, search, searchLen) : strncasecmp(ptr, search, searchLen)) == 0)
if ((caseSensitive ? strncmp(ptr, search, searchLen) : utf8strncasecmp(ptr, search, searchLen)) == 0)
{
if (replaceLen > searchLen)
{

View File

@ -16,7 +16,7 @@
#endif
#define _string_included
#define charsmax(%1) (sizeof(%1)-1)
#include <string_const>
/**
* @global Unless otherwise noted, all string functions which take in a
@ -25,11 +25,6 @@
* copy(string, charsmax(string), ...)
*/
/**
* Buffer size used by fmt().
*/
#define MAX_FMT_LENGTH 256
/**
* Calculates the length of a string.
*
@ -52,13 +47,15 @@ native contain(const source[], const string[]);
/**
* Tests whether a string is found inside another string with case ignoring.
*
* @note This supports multi-byte characters (UTF-8) on comparison.
*
* @param source String to search in.
* @param string Substring to find inside the original string.
*
* @return -1 on failure (no match found). Any other value
* indicates a position in the string where the match starts.
*/
native containi(const source[],const string[]);
native containi(const source[], const string[]);
/**
* Given a string, replaces the first occurrence of a search string with a
@ -83,6 +80,7 @@ native replace(text[], len, const what[], const with[]);
* that pushes old data out.
*
* @note Only available in 1.8.3 and above.
* @note This supports multi-byte characters (UTF-8) on case insensitive comparison.
*
* @param text String to perform search and replacements on.
* @param maxlength Maximum length of the string buffer.
@ -92,7 +90,7 @@ native replace(text[], len, const what[], const with[]);
*
* @return Number of replacements that were performed.
*/
native replace_string(text[], maxlength, const search[], const replace[], bool:caseSensitive=true);
native replace_string(text[], maxlength, const search[], const replace[], bool:caseSensitive = true);
/**
* Given a string, replaces the first occurrence of a search string with a
@ -104,6 +102,7 @@ native replace_string(text[], maxlength, const search[], const replace[], bool:c
* that pushes old data out.
*
* @note Only available in 1.8.3 and above.
* @note This supports multi-byte characters (UTF-8) on case insensitive comparison.
*
* @param text String to perform search and replacements on.
* @param maxlength Maximum length of the string buffer.
@ -119,7 +118,7 @@ native replace_string(text[], maxlength, const search[], const replace[], bool:c
* the last replacement ended, or -1 if no replacements were
* made.
*/
native replace_stringex(text[], maxlength, const search[], const replace[], searchLen=-1, replaceLen=-1, bool:caseSensitive=true);
native replace_stringex(text[], maxlength, const search[], const replace[], searchLen = -1, replaceLen = -1, bool:caseSensitive = true);
/**
* Concatenates one string onto another.
@ -361,13 +360,15 @@ native equal(const a[],const b[],c=0);
/**
* Returns whether two strings are equal with case ignoring.
*
* @note This supports multi-byte characters (UTF-8) on comparison.
*
* @param a First string (left).
* @param b Second string (right).
* @param c Number of characters to compare.
*
* @return True if equal, false otherwise.
*/
native equali(const a[],const b[],c=0);
native equali(const a[], const b[], c = 0);
/**
* Copies one string to another string.
@ -445,49 +446,6 @@ native parse(const text[], ... );
* @noreturn
*/
native strtok(const text[], Left[], leftLen, Right[], rightLen, token=' ', trimSpaces=0);
/**
* Below are the trim flags for strtok2
*
* You can specify how the left and right buffers will
* be trimmed by strtok2. LTRIM trims spaces from the
* left side. RTRIM trims from the right side.
*
* The defines TRIM_INNER, TRIM_OUTER and TRIM_FULL are
* shorthands for commonly used flag combinations.
*
* When the initial string is trimmed, using TRIM_INNER
* for all subsequent strtok2 calls will ensure that left
* and right are always trimmed from both sides.
*
* Examples:
* str1[] = " This is * some text "
* strtok2(str1, left, 24, right, 24, '*', TRIM_FULL)
* left will be "This is", right will be "some text"
*
* str2[] = " Here is | an | example "
* trim(str2)
* strtok2(str2, left, 24, right, 24, '|', TRIM_INNER)
* left will be "Here is", right will be "an | example"
* strtok2(right, left, 24, right, 24, '|', TRIM_INNER)
* left will be "an", right will be "example"
*
* str3[] = " One - more "
* strtok2(str3, left, 24, right, 24, '-', TRIM_OUTER)
* left will be "One ", right will be " more"
*
* str4[] = " Final . example "
* strtok2(str4, left, 24, right, 24, '.', LTRIM_LEFT|LTRIM_RIGHT)
* left will be "Final ", right will be "example "
*/
// Individual trim flags: which side (L/R) of which buffer (LEFT/RIGHT) is trimmed.
#define LTRIM_LEFT (1<<0)
#define RTRIM_LEFT (1<<1)
#define LTRIM_RIGHT (1<<2)
#define RTRIM_RIGHT (1<<3)
// Combined flags are parenthesized so they stay intact when used inside a
// larger expression (e.g. "flags & TRIM_INNER").
#define TRIM_INNER (RTRIM_LEFT|LTRIM_RIGHT)
#define TRIM_OUTER (LTRIM_LEFT|RTRIM_RIGHT)
#define TRIM_FULL (TRIM_OUTER|TRIM_INNER)
/**
* Breaks a string in two by token.
@ -523,6 +481,21 @@ native trim(text[]);
*/
native strtolower(string[]);
/**
* Performs a multi-byte safe (UTF-8) conversion of all chars in string to lower case.
*
* @note Although most code points can be converted in-place, there are notable
* exceptions and the final length can vary.
* @note Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x)).
*
* @param string The string to convert.
* @param maxlength Optional size of the buffer. If 0, the length of the original string
* will be used instead.
*
* @return Number of bytes written.
*/
native mb_strtolower(string[], maxlength = 0);
/**
* Converts all chars in string to upper case.
*
@ -531,6 +504,21 @@ native strtolower(string[]);
*/
native strtoupper(string[]);
/**
* Performs a multi-byte safe (UTF-8) conversion of all chars in string to upper case.
*
* @note Although most code points can be converted in-place, there are notable
* exceptions and the final length can vary.
* @note Case mapping is not reversible. That is, toUpper(toLower(x)) != toLower(toUpper(x)).
*
* @param string The string to convert.
* @param maxlength Optional size of the buffer. If 0, the length of the original string
* will be used instead.
*
* @return Number of bytes written.
*/
native mb_strtoupper(string[], maxlength = 0);
/**
* Make a string's first character uppercase.
*
@ -539,6 +527,68 @@ native strtoupper(string[]);
*/
native ucfirst(string[]);
/**
* Performs a multi-byte safe (UTF-8) conversion of a string's first character to upper case.
*
* @note Although most code points can be converted in-place, there are notable
* exceptions and the final length can vary.
*
* @param string The string to convert.
* @param maxlength Optional size of the buffer. If 0, the length of the original string
* will be used instead.
*
* @return Number of bytes written.
*/
native mb_ucfirst(string[], maxlength = 0);
/**
* Performs a multi-byte safe (UTF-8) conversion of all chars in string to title case.
*
* @note Although most code points can be converted in-place, there are notable
* exceptions and the final length can vary.
* @note Any type of punctuation can break up a word, even if this is
* not grammatically valid. This happens because the titlecasing algorithm
* does not and cannot take grammar rules into account.
* @note Examples:
* The running man | The Running Man
* NATO Alliance | Nato Alliance
* You're amazing at building libraries | You'Re Amazing At Building Libraries
*
* @param string The string to convert.
* @param maxlength Optional size of the buffer. If 0, the length of the original string
* will be used instead.
*
* @return Number of bytes written.
*/
native mb_strtotitle(string[], maxlength = 0);
/**
* Checks if the input string conforms to the category specified by the flags.
*
* @note This function can be used to check if the code points in a string are part
* of a category. Valid flags are part of the UTF8C_* list of defines.
* The category for a code point is defined as part of the entry in
* UnicodeData.txt, the data file for the Unicode code point database.
* @note Flags parameter must be a combination of UTF8C_* flags or a single UTF8C_IS* flag.
* In order to maintain backwards compatibility with POSIX functions like `isdigit`
* and `isspace`, compatibility flags have been provided. Note, however, that
* the result is only guaranteed to be correct for code points in the Basic
* Latin range, between U+0000 and U+007F. Combining a compatibility flag with
* a regular category flag will result in undefined behavior.
* @note The function is greedy. This means it will try to match as many code
* points with the matching category flags as possible and return the offset in
* the input in bytes.
*
* @param input The string to check
* @param input_size Size of the string, use 1 to check one character regardless of its size
* @param flags Requested category, see UTF8C_* flags
* @param output_size Number of bytes in the input that conform to the specified
* category flags
* @return True if the whole input of `input_size` conforms to the specified
* category flags, false otherwise
*/
native bool:is_string_category(const input[], input_size, flags, &output_size = 0);
/**
* Returns whether a character is numeric.
*
@ -612,23 +662,6 @@ native bool:is_char_upper(ch);
*/
native bool:is_char_lower(ch);
/**
 * Checks whether a string is made up exclusively of digit characters.
 *
 * @note An empty string is never considered numeric.
 *
 * @param sString   String to test.
 *
 * @return          True if the string is non-empty and every character
 *                  passes isdigit(), false otherwise.
 */
stock bool:is_str_num(const sString[])
{
	new len = 0;

	// Walk to the terminator, bailing out on the first non-digit.
	for (; sString[len] != 0; ++len)
	{
		if (!isdigit(sString[len]))
		{
			return false;
		}
	}

	// Reaching the terminator only counts when at least one digit was seen.
	return len != 0;
}
/**
* Returns the number of bytes a character is using. This is
* for multi-byte characters (UTF-8). For normal ASCII characters,
@ -641,42 +674,6 @@ stock bool:is_str_num(const sString[])
*/
native get_char_bytes(const source[]);
/**
 * Converts a lowercase character to its uppercase equivalent.
 *
 * @note Only available in 1.8.3 and above.
 * @note The conversion clears bit 5 of the character value and is applied
 *       only when is_char_lower() reports the character as lowercase.
 *
 * @param chr       Character to convert.
 *
 * @return          Uppercase character on success,
 *                  no change on failure.
 */
stock char_to_upper(chr)
{
	if (is_char_lower(chr))
	{
		return (chr & ~(1<<5));
	}

	return chr;
}
/**
 * Converts an uppercase character to its lowercase equivalent.
 *
 * @note Only available in 1.8.3 and above.
 * @note The conversion sets bit 5 of the character value and is applied
 *       only when is_char_upper() reports the character as uppercase.
 *
 * @param chr       Character to convert.
 *
 * @return          Lowercase character on success,
 *                  no change on failure.
 */
stock char_to_lower(chr)
{
	if (is_char_upper(chr))
	{
		return (chr | (1<<5));
	}

	return chr;
}
/**
* Concatenates one string onto another.
*
@ -690,19 +687,23 @@ native strcat(dest[], const source[], maxlength);
/**
* Tests whether a string is found inside another string.
*
* @note This supports multi-byte characters (UTF-8) on case insensitive comparison.
*
* @param string String to search in.
* @param sub Substring to find inside the original string.
* @param ignorecase If true, search is case insensitive.
* If false (default), search is case sensitive.
* @param pos
* @param pos Start position to search from.
* @return -1 on failure (no match found). Any other value
* indicates a position in the string where the match starts.
*/
native strfind(const string[], const sub[], ignorecase=0, pos=0);
native strfind(const string[], const sub[], bool:ignorecase = false, pos = 0);
/**
* Compares two strings lexicographically.
*
* @note This supports multi-byte characters (UTF-8) on case insensitive comparison.
*
* @param string1 First string (left).
* @param string2 Second string (right).
* @param ignorecase If true, comparison is case insensitive.
@ -711,12 +712,13 @@ native strfind(const string[], const sub[], ignorecase=0, pos=0);
* 0 if string1 == string2
* 1 if string1 > string2
*/
native strcmp(const string1[], const string2[], ignorecase=0);
native strcmp(const string1[], const string2[], bool:ignorecase = false);
/**
* Compares two string parts lexicographically.
*
* @note Only available in 1.8.3 and above.
* @note This supports multi-byte characters (UTF-8) on case insensitive comparison.
*
* @param string1 First string (left).
* @param string2 Second string (right).
@ -727,17 +729,7 @@ native strcmp(const string1[], const string2[], ignorecase=0);
* 0 if string1 == string2
* 1 if string1 > string2
*/
native strncmp(const string1[], const string2[], num, bool:ignorecase=false);
/**
 * Backwards compatibility stock - use argbreak or argparse.
 *
 * @note This simply forwards every argument to argbreak() and returns
 *       its result.
 *
 * @deprecated this function does not work properly.
 *
 * @param text      Source input string, passed to argbreak().
 * @param Left      Buffer to store the left part, passed to argbreak().
 * @param leftLen   Maximum length of the left buffer.
 * @param Right     Buffer to store the right part, passed to argbreak().
 * @param rightLen  Maximum length of the right buffer.
 *
 * @return          Value returned by argbreak().
 */
#pragma deprecated Use argbreak() instead
stock strbreak(const text[], Left[], leftLen, Right[], rightLen)
{
	return argbreak(text, Left, leftLen, Right, rightLen);
}
native strncmp(const string1[], const string2[], num, bool:ignorecase = false);
/**
* Parses an argument string to find the first argument. You can use this to
@ -773,34 +765,6 @@ stock strbreak(const text[], Left[], leftLen, Right[], rightLen)
*/
native argparse(const text[], pos, argbuffer[], maxlen);
/**
 * Emulates strbreak() using argparse().
 *
 * @note The first argument is extracted with argparse(); whitespace following
 *       it is skipped and the rest of the input is copied to the right buffer.
 *
 * @param text      Source input string.
 * @param left      Buffer to store string left part.
 * @param leftlen   Maximum length of the string part buffer.
 * @param right     Buffer to store string right part.
 * @param rightlen  Maximum length of the string part buffer.
 *
 * @return          -1 if argparse() found nothing; otherwise the index in
 *                  the input string at which the right part starts.
 */
stock argbreak(const text[], left[], leftlen, right[], rightlen)
{
	new cursor = argparse(text, 0, left, leftlen);

	if (cursor != -1)
	{
		// Skip the whitespace separating the first argument from the remainder.
		for (new end = strlen(text); cursor < end && isspace(text[cursor]); ++cursor) { }

		copy(right, rightlen, text[cursor]);
	}

	return cursor;
}
/**
* Returns text in a string up until a certain character sequence is reached.
*
@ -817,195 +781,6 @@ stock argbreak(const text[], left[], leftlen, right[], rightlen)
*/
native split_string(const source[], const split[], part[], partLen);
/**
 * Splits a string in two around a delimiter that may be longer than one
 * character (strbreak with a multi-character delimiter). By Suicid3.
 *
 * @note If the delimiter does not occur in the input, the input is split at
 *       the maximum length of the left buffer instead.
 *
 * @param szInput   Source input string.
 * @param szLeft    Buffer to store left string part.
 * @param pL_Max    Maximum length of the string part buffer.
 * @param szRight   Buffer to store right string part.
 * @param pR_Max    Maximum length of the string part buffer.
 * @param szDelim   A string which specifies a search point to break at.
 *
 * @noreturn
 */
stock split(const szInput[], szLeft[], pL_Max, szRight[], pR_Max, const szDelim[])
{
	new delimPos = contain(szInput, szDelim);
	new rightStart;

	if (delimPos == -1)
	{
		// No delimiter: the right part begins where the left copy stopped.
		rightStart = copy(szLeft, pL_Max, szInput);
	}
	else
	{
		// Right part begins just past the delimiter.
		rightStart = delimPos + strlen(szDelim);

		// Left part is everything before the delimiter, capped by the buffer size.
		copy(szLeft, (pL_Max >= delimPos) ? delimPos : pL_Max, szInput);
	}

	copy(szRight, pR_Max, szInput[rightStart]);
}
/**
 * Strips the directory part from a file path, leaving only the file name.
 *
 * @note Both '/' and '\' are recognized as path separators. If the path has
 *       no separator, the whole input is copied.
 *
 * @param szFilePath    Path to extract the file name from.
 * @param szFile        Buffer to store file name.
 * @param pMax          Maximum length of the string buffer.
 *
 * @noreturn
 */
stock remove_filepath(const szFilePath[], szFile[], pMax)
{
	new idx = strlen(szFilePath) - 1;

	// Scan backwards until a separator is hit or the start of the string is passed.
	while (idx >= 0 && szFilePath[idx] != '/' && szFilePath[idx] != '\')
	{
		idx--;
	}

	// idx is the last separator (or -1), so the name starts right after it.
	copy(szFile, pMax, szFilePath[idx + 1]);
}
/**
 * Replaces every occurrence of a substring, one occurrence at a time.
 *
 * @note Consider using replace_string() instead.
 *
 * @note Each iteration resumes searching after the text that was just
 *       inserted, so a replacement that contains the search string cannot
 *       cause an endless loop.
 *
 * @param string    String to perform search and replacements on.
 * @param len       Maximum length of the string buffer.
 * @param what      String to search for.
 * @param with      String to replace the search string with.
 *
 * @return          Number of replacements on success, otherwise 0.
 */
stock replace_all(string[], len, const what[], const with[])
{
	new offset = contain(string, what);

	if (offset == -1)
	{
		return 0;
	}

	new count = 0;
	new withLen = strlen(with);
	new shrink = strlen(what) - withLen;
	new currentLen = strlen(string);

	while (replace(string[offset], len - offset, what, with) != 0)
	{
		++count;

		// Continue right after the text that was just inserted.
		offset += withLen;

		// Keep the cached string length in sync with the replacement.
		currentLen -= shrink;

		// Nothing left to scan.
		if (offset >= currentLen)
		{
			break;
		}

		// Locate the next occurrence relative to the current offset.
		new next = contain(string[offset], what);

		if (next == -1)
		{
			break;
		}

		offset += next;
	}

	return count;
}
/**
 * Splits a string on a delimiter and stores each piece in its own buffer.
 *
 * @param text              The string to split.
 * @param split             The string to use as a split delimiter.
 * @param buffers           An array of string buffers (2D array).
 * @param maxStrings        Number of string buffers (first dimension size).
 * @param maxStringLength   Maximum length of each string buffer.
 * @param copyRemainder     False (default) discard excess pieces, true to ignore
 *                          delimiters after last piece.
 *
 * @return                  Number of strings retrieved; 0 if maxStrings is
 *                          less than 1 or the delimiter is empty.
 */
stock explode_string(const text[], const split[], buffers[][], maxStrings, maxStringLength, bool:copyRemainder = false)
{
	new offset, consumed, count;

	if (maxStrings < 1 || !split[0])
	{
		return 0;
	}

	for (;;)
	{
		consumed = split_string(text[offset], split, buffers[count], maxStringLength);

		// No further delimiter: what is left is the final piece.
		if (consumed == -1)
		{
			break;
		}

		offset += consumed;
		count++;

		if (count == maxStrings)
		{
			// Out of buffers: optionally put the whole unsplit remainder in the last one.
			if (copyRemainder)
			{
				copy(buffers[count - 1], maxStringLength, text[offset - consumed]);
			}

			return count;
		}
	}

	copy(buffers[count], maxStringLength, text[offset]);

	return count + 1;
}
/**
 * Joins an array of strings into one string, with a "join" string inserted in
 * between each given string. This function complements explode_string().
 *
 * @note Joining stops as soon as a part or a join string had to be truncated
 *       because the output buffer is full.
 *
 * @param strings       An array of strings.
 * @param numStrings    Number of strings in the array.
 * @param join          The join string to insert between each string.
 * @param buffer        Output buffer to write the joined string to.
 * @param maxLength     Maximum length of the output buffer.
 *
 * @return              Number of bytes written to the output buffer.
 */
stock implode_strings(const strings[][], numStrings, const join[], buffer[], maxLength)
{
	new total, length, part_length;
	new join_length = strlen(join);

	for (new i = 0; i < numStrings; i++)
	{
		// Full length of this part, needed to detect a truncated copy below.
		part_length = strlen(strings[i]);

		length = copy(buffer[total], maxLength - total, strings[i]);
		total += length;

		// The part did not fit entirely: the buffer is full.
		if (length < part_length)
		{
			break;
		}

		// No separator after the last part.
		if (i != numStrings - 1)
		{
			length = copy(buffer[total], maxLength - total, join);
			total += length;

			// The separator did not fit entirely: the buffer is full.
			if (length < join_length)
			{
				break;
			}
		}
	}

	return total;
}
// Always keep this at the bottom of this file.
#include <string_stocks>

View File

@ -0,0 +1,156 @@
// vim: set ts=4 sw=4 tw=99 noet:
//
// AMX Mod X, based on AMX Mod by Aleksander Naszko ("OLO").
// Copyright (C) The AMX Mod X Development Team.
//
// This software is licensed under the GNU General Public License, version 3 or higher.
// Additional exceptions apply. For full license details, see LICENSE.txt or visit:
// https://alliedmods.net/amxmodx-license
//
// String Manipulation Constants
//
#if defined _string_const_included
#endinput
#endif
#define _string_const_included
#define charsmax(%1) (sizeof(%1)-1)
/**
* @global Unless otherwise noted, all string functions which take in a
* writable buffer and maximum length should NOT have the null terminator INCLUDED
* in the length. This means that this is valid:
* copy(string, charsmax(string), ...)
*/
/**
* Buffer size used by fmt().
*/
#define MAX_FMT_LENGTH 256
/**
* Below are the trim flags for strtok2
*
* You can specify how the left and right buffers will
* be trimmed by strtok2. LTRIM trims spaces from the
* left side. RTRIM trims from the right side.
*
* The defines TRIM_INNER, TRIM_OUTER and TRIM_FULL are
* shorthands for commonly used flag combinations.
*
* When the initial string is trimmed, using TRIM_INNER
* for all subsequent strtok2 calls will ensure that left
* and right are always trimmed from both sides.
*
* Examples:
* str1[] = " This is * some text "
* strtok2(str1, left, 24, right, 24, '*', TRIM_FULL)
* left will be "This is", right will be "some text"
*
* str2[] = " Here is | an | example "
* trim(str2)
* strtok2(str2, left, 24, right, 24, '|', TRIM_INNER)
* left will be "Here is", right will be "an | example"
* strtok2(right, left, 24, right, 24, '|', TRIM_INNER)
* left will be "an", right will be "example"
*
* str3[] = " One - more "
* strtok2(str3, left, 24, right, 24, '-', TRIM_OUTER)
* left will be "One ", right will be " more"
*
* str4[] = " Final . example "
* strtok2(str4, left, 24, right, 24, '.', LTRIM_LEFT|LTRIM_RIGHT)
* left will be "Final ", right will be "example "
*/
// Individual trim flags: which side (L/R) of which buffer (LEFT/RIGHT) is trimmed.
#define LTRIM_LEFT (1<<0)
#define RTRIM_LEFT (1<<1)
#define LTRIM_RIGHT (1<<2)
#define RTRIM_RIGHT (1<<3)
// Combined flags are parenthesized so they stay intact when used inside a
// larger expression (e.g. "flags & TRIM_INNER").
#define TRIM_INNER (RTRIM_LEFT|LTRIM_RIGHT)
#define TRIM_OUTER (LTRIM_LEFT|RTRIM_RIGHT)
#define TRIM_FULL (TRIM_OUTER|TRIM_INNER)
/**
* Category flags to be used with is_string_category(), to check whether code points in a
* string are part of that category.
*/
#define UTF8C_LETTER_UPPERCASE 0x00000001 // Uppercase letter code points, Lu in the Unicode database.
#define UTF8C_LETTER_LOWERCASE 0x00000002 // Lowercase letter code points, Ll in the Unicode database.
#define UTF8C_LETTER_TITLECASE 0x00000004 // Titlecase letter code points, Lt in the Unicode database.
#define UTF8C_LETTER_MODIFIER 0x00000008 // Modifier letter code points, Lm in the Unicode database.
#define UTF8C_LETTER_OTHER 0x00000010 // Other letter code points, Lo in the Unicode database.
// Combined flags: UTF8C_LETTER covers all letter categories,
// UTF8C_CASE_MAPPED only the letter categories with case mapping.
const UTF8C_LETTER = (UTF8C_LETTER_UPPERCASE | UTF8C_LETTER_LOWERCASE | UTF8C_LETTER_TITLECASE | UTF8C_LETTER_MODIFIER | UTF8C_LETTER_OTHER);
const UTF8C_CASE_MAPPED = (UTF8C_LETTER_UPPERCASE | UTF8C_LETTER_LOWERCASE | UTF8C_LETTER_TITLECASE);
#define UTF8C_MARK_NON_SPACING 0x00000020 // Non-spacing mark code points, Mn in the Unicode database.
#define UTF8C_MARK_SPACING 0x00000040 // Spacing mark code points, Mc in the Unicode database.
#define UTF8C_MARK_ENCLOSING 0x00000080 // Enclosing mark code points, Me in the Unicode database.
// Combined flag for all mark categories.
const UTF8C_MARK = (UTF8C_MARK_NON_SPACING | UTF8C_MARK_SPACING | UTF8C_MARK_ENCLOSING);
#define UTF8C_NUMBER_DECIMAL 0x00000100 // Decimal number code points, Nd in the Unicode database.
#define UTF8C_NUMBER_LETTER 0x00000200 // Letter number code points, Nl in the Unicode database.
#define UTF8C_NUMBER_OTHER 0x00000400 // Other number code points, No in the Unicode database.
// Combined flag for all number categories.
const UTF8C_NUMBER = (UTF8C_NUMBER_DECIMAL | UTF8C_NUMBER_LETTER | UTF8C_NUMBER_OTHER);
#define UTF8C_PUNCTUATION_CONNECTOR 0x00000800 // Connector punctuation category, Pc in the Unicode database.
#define UTF8C_PUNCTUATION_DASH 0x00001000 // Dash punctuation category, Pd in the Unicode database.
#define UTF8C_PUNCTUATION_OPEN 0x00002000 // Open punctuation category, Ps in the Unicode database.
#define UTF8C_PUNCTUATION_CLOSE 0x00004000 // Close punctuation category, Pe in the Unicode database.
#define UTF8C_PUNCTUATION_INITIAL 0x00008000 // Initial punctuation category, Pi in the Unicode database.
#define UTF8C_PUNCTUATION_FINAL 0x00010000 // Final punctuation category, Pf in the Unicode database.
#define UTF8C_PUNCTUATION_OTHER 0x00020000 // Other punctuation category, Po in the Unicode database.
// Combined flag for all punctuation categories.
const UTF8C_PUNCTUATION = (UTF8C_PUNCTUATION_CONNECTOR | UTF8C_PUNCTUATION_DASH | UTF8C_PUNCTUATION_OPEN | \
UTF8C_PUNCTUATION_CLOSE | UTF8C_PUNCTUATION_INITIAL | UTF8C_PUNCTUATION_FINAL | \
UTF8C_PUNCTUATION_OTHER);
#define UTF8C_SYMBOL_MATH 0x00040000 // Math symbol category, Sm in the Unicode database.
#define UTF8C_SYMBOL_CURRENCY 0x00080000 // Currency symbol category, Sc in the Unicode database.
#define UTF8C_SYMBOL_MODIFIER 0x00100000 // Modifier symbol category, Sk in the Unicode database.
#define UTF8C_SYMBOL_OTHER 0x00200000 // Other symbol category, So in the Unicode database.
// Combined flag for all symbol categories.
const UTF8C_SYMBOL = (UTF8C_SYMBOL_MATH | UTF8C_SYMBOL_CURRENCY | UTF8C_SYMBOL_MODIFIER | UTF8C_SYMBOL_OTHER);
#define UTF8C_SEPARATOR_SPACE 0x00400000 // Space separator category, Zs in the Unicode database.
#define UTF8C_SEPARATOR_LINE 0x00800000 // Line separator category, Zl in the Unicode database.
#define UTF8C_SEPARATOR_PARAGRAPH 0x01000000 // Paragraph separator category, Zp in the Unicode database.
// Combined flag for all separator categories.
const UTF8C_SEPARATOR = (UTF8C_SEPARATOR_SPACE | UTF8C_SEPARATOR_LINE | UTF8C_SEPARATOR_PARAGRAPH);
#define UTF8C_CONTROL 0x02000000 // Control category, Cc in the Unicode database.
#define UTF8C_FORMAT 0x04000000 // Format category, Cf in the Unicode database.
#define UTF8C_SURROGATE 0x08000000 // Surrogate category, Cs in the Unicode database.
#define UTF8C_PRIVATE_USE 0x10000000 // Private use category, Co in the Unicode database.
#define UTF8C_UNASSIGNED 0x20000000 // Unassigned category, Cn in the Unicode database.
#define UTF8C_COMPATIBILITY 0x40000000 // Flag used for maintaining backwards compatibility with POSIX
#define UTF8C_IGNORE_GRAPHEME_CLUSTER 0x80000000 // Flag used for checking only the general category of code points at the start of a grapheme cluster.
// Flag used for maintaining backwards compatibility with POSIX function
const UTF8C_ISCNTRL = (UTF8C_COMPATIBILITY | UTF8C_CONTROL);
const UTF8C_ISPRINT = (UTF8C_COMPATIBILITY | UTF8C_LETTER | UTF8C_NUMBER | UTF8C_PUNCTUATION | UTF8C_SYMBOL | UTF8C_SEPARATOR);
const UTF8C_ISSPACE = (UTF8C_COMPATIBILITY | UTF8C_SEPARATOR_SPACE);
const UTF8C_ISBLANK = (UTF8C_COMPATIBILITY | UTF8C_SEPARATOR_SPACE | UTF8C_PRIVATE_USE);
const UTF8C_ISGRAPH = (UTF8C_COMPATIBILITY | UTF8C_LETTER | UTF8C_NUMBER | UTF8C_PUNCTUATION | UTF8C_SYMBOL);
const UTF8C_ISPUNCT = (UTF8C_COMPATIBILITY | UTF8C_PUNCTUATION | UTF8C_SYMBOL);
const UTF8C_ISALNUM = (UTF8C_COMPATIBILITY | UTF8C_LETTER | UTF8C_NUMBER);
const UTF8C_ISALPHA = (UTF8C_COMPATIBILITY | UTF8C_LETTER);
const UTF8C_ISUPPER = (UTF8C_COMPATIBILITY | UTF8C_LETTER_UPPERCASE);
const UTF8C_ISLOWER = (UTF8C_COMPATIBILITY | UTF8C_LETTER_LOWERCASE);
const UTF8C_ISDIGIT = (UTF8C_COMPATIBILITY | UTF8C_NUMBER);
const UTF8C_ISXDIGIT = (UTF8C_COMPATIBILITY | UTF8C_NUMBER | UTF8C_PRIVATE_USE);
// All flags.
const UTF8C_ALL = 0xFFFFFFFF & (~UTF8C_COMPATIBILITY);

View File

@ -0,0 +1,325 @@
// vim: set ts=4 sw=4 tw=99 noet:
//
// AMX Mod X, based on AMX Mod by Aleksander Naszko ("OLO").
// Copyright (C) The AMX Mod X Development Team.
//
// This software is licensed under the GNU General Public License, version 3 or higher.
// Additional exceptions apply. For full license details, see LICENSE.txt or visit:
// https://alliedmods.net/amxmodx-license
//
// String Manipulation Stocks
//
#if defined _string_stocks_included
#endinput
#endif
#define _string_stocks_included
#if !defined _string_included
#include <string>
#endif
/**
* @global Unless otherwise noted, all string functions which take in a
* writable buffer and maximum length should NOT have the null terminator INCLUDED
* in the length. This means that this is valid:
* copy(string, charsmax(string), ...)
*/
/**
 * Checks whether a string is made up exclusively of digit characters.
 *
 * @note An empty string is never considered numeric.
 *
 * @param sString   String to test.
 *
 * @return          True if the string is not empty and every character is
 *                  a digit, false otherwise.
 */
stock bool:is_str_num(const sString[])
{
	new pos;

	for (pos = 0; sString[pos] != 0; ++pos)
	{
		// The first non-digit character settles the answer
		if (!isdigit(sString[pos]))
		{
			return false;
		}
	}

	// pos is the string length here; reject the empty string
	return pos != 0;
}
/**
 * Converts a lowercase character to its uppercase counterpart.
 *
 * @note Only available in 1.8.3 and above.
 * @note Only characters for which is_char_lower() returns true are changed.
 *
 * @param chr       Character to convert.
 *
 * @return          Uppercase character on success,
 *                  the unchanged character otherwise.
 */
stock char_to_upper(chr)
{
	// Clearing bit 5 (0x20) turns a lowercase letter into its uppercase form
	return is_char_lower(chr) ? (chr & ~0x20) : chr;
}
/**
 * Converts an uppercase character to its lowercase counterpart.
 *
 * @note Only available in 1.8.3 and above.
 * @note Only characters for which is_char_upper() returns true are changed.
 *
 * @param chr       Character to convert.
 *
 * @return          Lowercase character on success,
 *                  the unchanged character otherwise.
 */
stock char_to_lower(chr)
{
	// Setting bit 5 (0x20) turns an uppercase letter into its lowercase form
	return is_char_upper(chr) ? (chr | 0x20) : chr;
}
/**
* Backwards compatibility stock - use argbreak() or argparse() instead.
*
* @note This stock simply forwards all of its arguments to argbreak().
*
* @param text Source input string.
* @param Left Buffer to store string left part.
* @param leftLen Maximum length of the left part buffer.
* @param Right Buffer to store string right part.
* @param rightLen Maximum length of the right part buffer.
*
* @return Whatever argbreak() returns for the same arguments.
* @deprecated Use argbreak() instead.
*/
#pragma deprecated Use argbreak() instead
stock strbreak(const text[], Left[], leftLen, Right[], rightLen)
{
return argbreak(text, Left, leftLen, Right, rightLen);
}
/**
 * Emulates strbreak() using argparse().
 *
 * @note The first argument is extracted with argparse(); any whitespace that
 *       follows it is skipped and the rest of the input is copied to the
 *       right buffer.
 *
 * @param text      Source input string.
 * @param left      Buffer to store string left part.
 * @param leftlen   Maximum length of the left part buffer.
 * @param right     Buffer to store string right part.
 * @param rightlen  Maximum length of the right part buffer.
 *
 * @return          -1 if argparse() found nothing to parse; otherwise the
 *                  index into the input string at which the right part
 *                  starts. The index is always relative to the start of the
 *                  input string.
 */
stock argbreak(const text[], left[], leftlen, right[], rightlen)
{
	new index = argparse(text, 0, left, leftlen);

	if (index == -1)
	{
		return -1;
	}

	// Skip the whitespace separating the first argument from the remainder
	for (new length = strlen(text); index < length && isspace(text[index]); ++index) { }

	copy(right, rightlen, text[index]);

	return index;
}
/**
 * Splits a string in two around the first occurrence of a delimiter, which
 * may be more than one character long. By Suicid3.
 *
 * @note If the delimiter is not found, the left buffer receives as much of
 *       the input as fits and the right buffer receives whatever follows.
 *
 * @param szInput   Source input string.
 * @param szLeft    Buffer to store left string part.
 * @param pL_Max    Maximum length of the left part buffer.
 * @param szRight   Buffer to store right string part.
 * @param pR_Max    Maximum length of the right part buffer.
 * @param szDelim   A string which specifies a search point to break at.
 *
 * @noreturn
 */
stock split(const szInput[], szLeft[], pL_Max, szRight[], pR_Max, const szDelim[])
{
	new delimPos = contain(szInput, szDelim);

	if (delimPos == -1)
	{
		// No delimiter: cut the input wherever the left buffer runs out
		new copied = copy(szLeft, pL_Max, szInput);
		copy(szRight, pR_Max, szInput[copied]);
		return;
	}

	// Left part: everything before the delimiter, capped by the buffer size
	copy(szLeft, (pL_Max >= delimPos) ? delimPos : pL_Max, szInput);

	// Right part: everything after the delimiter
	copy(szRight, pR_Max, szInput[delimPos + strlen(szDelim)]);
}
/**
 * Strips the directory part from a file path, leaving only the file name.
 *
 * @note Both '/' and '\' are treated as path separators. When the path has
 *       no separator at all, the whole input is copied.
 *
 * @param szFilePath    Path to extract the file name from.
 * @param szFile        Buffer to store file name.
 * @param pMax          Maximum length of the string buffer.
 *
 * @noreturn
 */
stock remove_filepath(const szFilePath[], szFile[], pMax)
{
	new pos = strlen(szFilePath) - 1;

	// Walk backwards until the last path separator (or the start) is reached
	while (pos >= 0)
	{
		if (szFilePath[pos] == '/' || szFilePath[pos] == '\')
		{
			break;
		}

		--pos;
	}

	// pos is -1 when no separator exists, so pos + 1 is always a valid start
	copy(szFile, pMax, szFilePath[pos + 1]);
}
/**
 * Replaces every occurrence of a search string, one occurrence at a time.
 *
 * @note Consider using replace_string() instead.
 *
 * @note After each replacement the search resumes right behind the inserted
 *       text, so a replacement that contains the search string cannot cause
 *       an infinite loop.
 *
 * @param string    String to perform search and replacements on.
 * @param len       Maximum length of the string buffer.
 * @param what      String to search for.
 * @param with      String to replace the search string with.
 *
 * @return          Number of replacements on success, otherwise 0.
 */
stock replace_all(string[], len, const what[], const with[])
{
	new pos = contain(string, what);

	if (pos == -1)
	{
		return 0;
	}

	new replacementLen = strlen(with);
	new shrink = strlen(what) - replacementLen;
	new currentLen = strlen(string);
	new count = 0;

	for (;;)
	{
		if (replace(string[pos], len - pos, what, with) == 0)
		{
			break;
		}

		count++;

		// Continue right after the text that was just inserted
		pos += replacementLen;

		// Keep the cached string length in sync with the replacement
		currentLen -= shrink;

		// Nothing left to scan
		if (pos >= currentLen)
		{
			break;
		}

		// Locate the next occurrence relative to the current offset
		new next = contain(string[pos], what);

		if (next == -1)
		{
			break;
		}

		pos += next;
	}

	return count;
}
/**
 * Breaks a string into pieces and stores each piece into an array of buffers.
 *
 * @note Nothing is stored and 0 is returned when maxStrings is below 1 or
 *       the delimiter is an empty string.
 *
 * @param text              The string to split.
 * @param split             The string to use as a split delimiter.
 * @param buffers           An array of string buffers (2D array).
 * @param maxStrings        Number of string buffers (first dimension size).
 * @param maxStringLength   Maximum length of each string buffer.
 * @param copyRemainder     False (default) discard excess pieces, true to ignore
 *                          delimiters after last piece.
 *
 * @return                  Number of strings retrieved.
 */
stock explode_string(const text[], const split[], buffers[][], maxStrings, maxStringLength, bool:copyRemainder = false)
{
	if (maxStrings < 1 || split[0] == 0)
	{
		return 0;
	}

	new offset = 0;
	new count = 0;

	for (;;)
	{
		new consumed = split_string(text[offset], split, buffers[count], maxStringLength);

		// No further delimiter: the rest of the text is the final piece
		if (consumed == -1)
		{
			break;
		}

		count++;

		if (count == maxStrings)
		{
			if (copyRemainder)
			{
				// Overwrite the last piece with everything that was left,
				// delimiters included
				copy(buffers[count - 1], maxStringLength, text[offset]);
			}

			return count;
		}

		offset += consumed;
	}

	copy(buffers[count], maxStringLength, text[offset]);

	return count + 1;
}
/**
 * Joins an array of strings into one string, with a "join" string inserted in
 * between each given string. This function complements explode_string().
 *
 * @note Joining stops as soon as a string or a join separator no longer
 *       fits completely into the output buffer.
 *
 * @param strings       An array of strings.
 * @param numStrings    Number of strings in the array.
 * @param join          The join string to insert between each string.
 * @param buffer        Output buffer to write the joined string to.
 * @param maxLength     Maximum length of the output buffer.
 *
 * @return              Number of cells written to the output buffer.
 */
stock implode_strings(const strings[][], numStrings, const join[], buffer[], maxLength)
{
	new total, length;
	new join_length = strlen(join);

	for (new i = 0; i < numStrings; i++)
	{
		length = copy(buffer[total], maxLength - total, strings[i]);
		total += length;

		// Stop once a string had to be truncated: the buffer is full
		if (length < strlen(strings[i]))
		{
			break;
		}

		// No separator after the last string
		if (i != numStrings - 1)
		{
			length = copy(buffer[total], maxLength - total, join);
			total += length;

			if (length < join_length)
			{
				break;
			}
		}
	}

	return total;
}

File diff suppressed because it is too large Load Diff

View File

@ -114,3 +114,27 @@ subject to the following restrictions:
* Altered source versions must be plainly marked as such, and
must not be misrepresented as being the original software.
-----------------------------------------------------------------
Portable C UTF-8 Rewind Library, as used in AMX Mod X Core module
-----------------------------------------------------------------
Copyright (C) 2014-2016 Quinten Lansu
Licensed under The MIT License.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -304,6 +304,8 @@ scripting_files = [
'include/regex.inc',
'include/sockets.inc',
'include/string.inc',
'include/string_const.inc',
'include/string_stocks.inc',
'include/tfcconst.inc',
'include/tfcstats.inc',
'include/tfcx.inc',

26
third_party/utf8rewind/AMBuilder vendored Normal file
View File

@ -0,0 +1,26 @@
# vim: sts=2 ts=8 sw=2 tw=99 et ft=python:
# AMBuild script describing the bundled UTF-8 Rewind sources as a static library target.
import os, platform
# Static library target named 'utf8rewind'.
lib = builder.compiler.StaticLibrary('utf8rewind')
lib.compiler.defines += [
'UTF8REWINDS_EXPORTS',
]
# Add the library's own directory to the include path.
lib.compiler.includes += [
os.path.join(builder.sourcePath, 'third_party', 'utf8rewind'),
]
# Source files that make up the library (presumably resolved relative to this script - confirm).
lib.sources += [
'utf8rewind.c',
'unicodedatabase.c',
'internal/casemapping.c',
'internal/codepoint.c',
'internal/composition.c',
'internal/database.c',
'internal/decomposition.c',
'internal/seeking.c',
'internal/streaming.c',
]
# Register the target; the result is kept in rvalue (presumably read back by the including build script - confirm).
rvalue = builder.Add(lib)

147
third_party/utf8rewind/internal/base.h vendored Normal file
View File

@ -0,0 +1,147 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_BASE_H_
#define _UTF8REWIND_INTERNAL_BASE_H_
/*!
\file
\brief Base header for internal interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
Marks a parameter as intentionally unused. With GCC-compatible compilers
(except when COMPILER_ICC is defined) the parameter gets the "unused"
attribute; elsewhere the macro expands to the bare parameter.
*/
#if defined(__GNUC__) && !defined(COMPILER_ICC)
#define UTF8_UNUSED(_parameter) _parameter __attribute__ ((unused))
#else
#define UTF8_UNUSED(_parameter) _parameter
#endif
/*
Stores UTF8_ERR_<_error> through the `errors` pointer that must be in scope
at the expansion site; a null `errors` pointer is silently skipped.
Note that the expansion is a bare `if` statement, not a single expression.
*/
#define UTF8_SET_ERROR(_error) \
if (errors != 0) { *errors = UTF8_ERR_ ## _error; }
/*
Validates input before transforming, for functions that write UTF-8 (char)
output. Expects `input`, `inputSize`, `target`, `targetSize` and `errors` to
be in scope at the expansion site and returns from the enclosing function on
failure:

- null input: INVALID_DATA, returns _result
- input shorter than one _inputType: writes the replacement character string
to target (when target is not null and has room for it), sets INVALID_DATA
and returns _result plus the length of the replacement character string
- non-null target with a size of zero: NOT_ENOUGH_SPACE, returns _result
- input and target overlapping: OVERLAPPING_PARAMETERS, returns _result
*/
/* Check for parameter overlap using the separating axis theorem */
#define UTF8_VALIDATE_PARAMETERS_CHAR(_inputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
if (target != 0) { \
if (targetSize < 3) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
memcpy(target, REPLACEMENT_CHARACTER_STRING, REPLACEMENT_CHARACTER_STRING_LENGTH); \
} \
UTF8_SET_ERROR(INVALID_DATA); \
return _result + REPLACEMENT_CHARACTER_STRING_LENGTH; \
} \
if (target != 0 && targetSize == 0) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*
Variant of UTF8_VALIDATE_PARAMETERS_CHAR that never writes the replacement
character: when the input is shorter than one _inputType it only sets
INVALID_DATA and returns _result unchanged. The remaining checks (null input,
zero-sized target, overlapping input and target) are the same. Expects
`input`, `inputSize`, `target`, `targetSize` and `errors` to be in scope at
the expansion site.
*/
#define UTF8_VALIDATE_PARAMETERS_CHAR_NOCR(_inputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
if (target != 0 && targetSize == 0) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*
Validates input before transforming, for functions whose output elements are
of type _outputType. Expects `input`, `inputSize`, `target`, `targetSize` and
`errors` to be in scope at the expansion site and returns from the enclosing
function on failure:

- null input: INVALID_DATA, returns _result
- input shorter than one _inputType: stores REPLACEMENT_CHARACTER in *target
(when target is not null and has room for one _outputType), sets
INVALID_DATA and returns _result plus sizeof(_outputType)
- non-null target smaller than one _outputType: NOT_ENOUGH_SPACE, returns _result
- input and target overlapping: OVERLAPPING_PARAMETERS, returns _result
*/
#define UTF8_VALIDATE_PARAMETERS(_inputType, _outputType, _result) \
if (input == 0) { \
UTF8_SET_ERROR(INVALID_DATA); \
return _result; \
} \
else if (inputSize < sizeof(_inputType)) { \
if (target != 0) { \
if (targetSize < sizeof(_outputType)) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
*target = REPLACEMENT_CHARACTER; \
} \
UTF8_SET_ERROR(INVALID_DATA); \
return _result + sizeof(_outputType); \
} \
if (target != 0 && targetSize < sizeof(_outputType)) { \
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
return _result; \
} \
if ((char*)input == (char*)target) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
{ \
char* input_center = (char*)input + (inputSize / 2); \
char* target_center = (char*)target + (targetSize / 2); \
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
if (delta < (inputSize + targetSize) / 2) { \
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
return _result; \
} \
}
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_BASE_H_ */

View File

@ -0,0 +1,663 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "casemapping.h"
#include "base.h"
#include "codepoint.h"
#include "database.h"
#include "streaming.h"
/*
Lowercase mapping for the Basic Latin range U+0041 ('A') to U+007A ('z').
Indexed by (code point - 0x41); the six non-letter code points between 'Z'
and 'a' map to themselves.
*/
static const char basic_latin_lowercase_table[58] = {
/* LATIN CAPITAL LETTER A - LATIN CAPITAL LETTER Z */
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C,
0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
0x79, 0x7A,
0x5B, /* LEFT SQUARE BRACKET */
0x5C, /* REVERSE SOLIDUS */
0x5D, /* RIGHT SQUARE BRACKET */
0x5E, /* CIRCUMFLEX ACCENT */
0x5F, /* LOW LINE */
0x60, /* GRAVE ACCENT */
/* LATIN SMALL LETTER A - LATIN SMALL LETTER Z */
0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C,
0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
0x79, 0x7A
};
/*
Uppercase mapping for the Basic Latin range U+0041 ('A') to U+007A ('z').
Indexed by (code point - 0x41); the six non-letter code points between 'Z'
and 'a' map to themselves.
*/
static const char basic_latin_uppercase_table[58] = {
/* LATIN CAPITAL LETTER A - LATIN CAPITAL LETTER Z */
0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C,
0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
0x59, 0x5A,
0x5B, /* LEFT SQUARE BRACKET */
0x5C, /* REVERSE SOLIDUS */
0x5D, /* RIGHT SQUARE BRACKET */
0x5E, /* CIRCUMFLEX ACCENT */
0x5F, /* LOW LINE */
0x60, /* GRAVE ACCENT */
/* LATIN SMALL LETTER A - LATIN SMALL LETTER Z */
0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C,
0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
0x59, 0x5A
};
/*
	Prepares a case mapping state for a new conversion.

	The state is always zeroed first. When the locale is out of range,
	INVALID_LOCALE is reported through `errors` (if not null), the state is
	left zeroed and 0 is returned. Otherwise the input and output cursors,
	the property tables, the quick check flags and the locale are stored in
	the state and 1 is returned.
*/
uint8_t casemapping_initialize(
	CaseMappingState* state,
	const char* input, size_t inputSize,
	char* target, size_t targetSize,
	const uint32_t* propertyIndex1, const uint32_t* propertyIndex2, const uint32_t* propertyData,
	uint8_t quickCheck, size_t locale,
	int32_t* errors)
{
	/* Reset every field, including the ones that are not assigned below */
	memset(state, 0, sizeof(*state));

	if (locale < UTF8_LOCALE_MAXIMUM)
	{
		/* Conversion settings */
		state->locale = locale;
		state->quickcheck_flags = quickCheck;

		/* Property tables used for the mapping */
		state->property_index1 = propertyIndex1;
		state->property_index2 = propertyIndex2;
		state->property_data = propertyData;

		/* Input and output cursors */
		state->src = input;
		state->src_size = inputSize;
		state->dst = target;
		state->dst_size = targetSize;

		return 1;
	}

	UTF8_SET_ERROR(INVALID_LOCALE);

	return 0;
}
/*
Case maps the next code point (or, for some locale-specific rules, the next
sequence of code points) from state->src and writes the result to state->dst.

state          Case mapping state prepared by casemapping_initialize().
errors         Optional output for an error code; only written when not null.
no_replacement When non-zero, an input byte that decodes to
               REPLACEMENT_CHARACTER is copied to the output unchanged
               instead of being replaced by REPLACEMENT_CHARACTER_STRING.

The mapping direction is derived from state->property_data: it is compared
against LowercaseDataPtr, anything else takes the other branch.

Returns the number of bytes the mapped result occupies. On failure (nothing
could be read, or the output buffer is too small) the error is reported
through `errors`, state->src_size is set to 0 and 0 is returned.
*/
size_t casemapping_execute(CaseMappingState* state, int32_t* errors, int no_replacement)
{
uint8_t qc_casemapped = 0;
uint8_t bytes_needed = 0;
const char* resolved = 0;
StreamState stream;
uint8_t i;
/* Read next code point */
state->last_code_point_size = codepoint_read(state->src, state->src_size, &state->last_code_point);
if (state->last_code_point_size == 0)
{
goto invaliddata;
}
/* Check for invalid characters */
if (state->last_code_point == REPLACEMENT_CHARACTER)
{
/* If option set, we want to avoid invalid byte to be replaced. Forces size to 1 to read the next byte. */
if (no_replacement)
{
state->last_code_point_size = 1;
}
else
{
/* Get code point properties */
state->last_canonical_combining_class = CCC_NOT_REORDERED;
state->last_general_category = UTF8_CATEGORY_SYMBOL_OTHER;
resolved = REPLACEMENT_CHARACTER_STRING;
bytes_needed = REPLACEMENT_CHARACTER_STRING_LENGTH;
goto writeresolved;
}
}
if (state->locale == UTF8_LOCALE_TURKISH_AND_AZERI_LATIN)
{
/*
Code point General Category does not need to be modified, because
all mappings result in the same General Category
*/
if (state->property_data == LowercaseDataPtr)
{
if (state->last_code_point == CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
{
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
resolved = "i";
bytes_needed = 1;
}
else if (
state->last_code_point == CP_LATIN_CAPITAL_LETTER_I)
{
if (state->src_size == 0)
{
/* Early-out for easy case */
state->last_code_point = CP_LATIN_SMALL_LETTER_DOTLESS_I;
resolved = "\xC4\xB1";
bytes_needed = 2;
}
else
{
uint8_t found = 0;
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Erase COMBINING DOT ABOVE from sequence */
for (i = stream.current - 1; i > 0; --i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
stream.canonical_combining_class[i] = CCC_INVALID;
found++;
}
}
/* Stabilize sequence and write to output */
if (!stream.stable ||
found > 0)
{
stream_reorder(&stream);
stream.current -= found;
}
stream.codepoint[0] = (found > 0) ? CP_LATIN_SMALL_LETTER_I : CP_LATIN_SMALL_LETTER_DOTLESS_I;
goto writestream;
}
}
}
else
{
if (state->last_code_point == CP_LATIN_SMALL_LETTER_I)
{
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
resolved = "\xC4\xB0";
bytes_needed = 2;
}
else if (
state->last_code_point == CP_LATIN_SMALL_LETTER_DOTLESS_I)
{
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I;
resolved = "I";
bytes_needed = 1;
}
}
/* Check if mapping succeeded */
if (resolved != 0)
{
/* Code point properties */
state->last_general_category = UTF8_CATEGORY_LETTER;
goto writeresolved;
}
}
else if (
state->locale == UTF8_LOCALE_LITHUANIAN)
{
if (state->property_data == LowercaseDataPtr)
{
unicode_t cp_additional_accent = 0;
uint8_t write_soft_dot = 1;
switch (state->last_code_point)
{
case CP_LATIN_CAPITAL_LETTER_I:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
break;
case CP_LATIN_CAPITAL_LETTER_J:
state->last_code_point = CP_LATIN_SMALL_LETTER_J;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK:
state->last_code_point = CP_LATIN_SMALL_LETTER_I_WITH_OGONEK;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_GRAVE_ACCENT;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_ACUTE_ACCENT;
break;
case CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE:
state->last_code_point = CP_LATIN_SMALL_LETTER_I;
cp_additional_accent = CP_COMBINING_TILDE_ACCENT;
break;
default:
goto writeregular;
}
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Assign the lowercase code point to the start of the stream */
stream.codepoint[0] = state->last_code_point;
/* Check if COMBINING DOT ABOVE is not yet present */
for (i = stream.current - 1; i > 0; --i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
write_soft_dot = 0;
break;
}
}
/* Stabilize the sequence */
if (!stream.stable)
{
stream_reorder(&stream);
stream.stable = 1;
}
/* Write COMBINING DOT ABOVE */
if (write_soft_dot &&
stream.current < STREAM_BUFFER_MAX)
{
/* Ensure the COMBINING DOT ABOVE comes before other accents with the same CCC */
if (stream.canonical_combining_class[stream.current - 1] == CCC_ABOVE)
{
unicode_t cp_swap = stream.codepoint[stream.current - 1];
stream.codepoint[stream.current - 1] = CP_COMBINING_DOT_ABOVE;
stream.codepoint[stream.current] = cp_swap;
}
else
{
stream.codepoint[stream.current] = CP_COMBINING_DOT_ABOVE;
}
stream.canonical_combining_class[stream.current] = CCC_ABOVE;
/* Check if sequence has become unstable */
stream.stable = stream.canonical_combining_class[stream.current - 1] <= CCC_ABOVE;
stream.current++;
}
/* Write additional accent */
if (cp_additional_accent != 0 &&
stream.current < STREAM_BUFFER_MAX)
{
/* Additional accents are always of the upper variety */
stream.codepoint[stream.current] = cp_additional_accent;
stream.canonical_combining_class[stream.current] = CCC_ABOVE;
/* Check if sequence has become unstable */
if (stream.stable &&
stream.canonical_combining_class[stream.current] < stream.canonical_combining_class[stream.current - 1])
{
stream.stable = 0;
}
stream.current++;
}
/* Stabilize the sequence */
if (!stream.stable)
{
stream_reorder(&stream);
}
}
else
{
uint8_t erase_count = 0;
switch (state->last_code_point)
{
case CP_LATIN_SMALL_LETTER_I:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I;
break;
case CP_LATIN_SMALL_LETTER_J:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_J;
break;
case CP_LATIN_SMALL_LETTER_I_WITH_OGONEK:
state->last_code_point = CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK;
break;
default:
goto writeregular;
}
/* Initialize stream and read the next sequence */
if (!stream_initialize(&stream, state->src, state->src_size) ||
!stream_read(&stream, QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr))
{
goto writeregular;
}
/* Assign the uppercase code point to the start of the stream */
stream.codepoint[0] = state->last_code_point;
/* Remove COMBINING DOT ABOVE from sequence */
for (i = 1; i < stream.current; ++i)
{
if (stream.codepoint[i] == CP_COMBINING_DOT_ABOVE)
{
stream.canonical_combining_class[i] = CCC_INVALID;
erase_count++;
}
}
/* Stabilize the sequence */
if (!stream.stable ||
erase_count > 0)
{
stream_reorder(&stream);
stream.current -= erase_count;
}
}
goto writestream;
}
/* Default path: map a single code point without locale-specific rules */
writeregular:
/* Get code point properties */
state->last_canonical_combining_class = PROPERTY_GET_CCC(state->last_code_point);
state->last_general_category = PROPERTY_GET_GC(state->last_code_point);
/* Move source cursor */
if (state->src_size >= state->last_code_point_size)
{
state->src += state->last_code_point_size;
state->src_size -= state->last_code_point_size;
}
else
{
state->src_size = 0;
}
/* Write to output */
if (state->last_code_point_size == 1)
{
/* Write Basic Latin to output buffer*/
if (state->dst != 0)
{
if (state->dst_size < 1)
{
goto outofspace;
}
/*
Uppercase letters are U+0041 ('A') to U+005A ('Z')
Lowercase letters are U+0061 ('a') to U+007A ('z')
*/
if (state->last_code_point >= 0x41 &&
state->last_code_point <= 0x7A)
{
if (state->property_data == LowercaseDataPtr)
{
*state->dst = basic_latin_lowercase_table[state->last_code_point - 0x41];
}
else
{
*state->dst = basic_latin_uppercase_table[state->last_code_point - 0x41];
}
}
else
{
/* All other code points in Basic Latin are unaffected by case mapping */
if (no_replacement && state->last_code_point == REPLACEMENT_CHARACTER)
{
/* The source cursor has already moved past the byte, so step back to copy the original byte */
*state->dst = (char)*(state->src - state->last_code_point_size);
}
else
{
*state->dst = (char)state->last_code_point;
}
}
state->dst++;
state->dst_size--;
}
bytes_needed = 1;
}
else
{
if (state->property_data == LowercaseDataPtr &&
state->last_code_point == CP_GREEK_CAPITAL_LETTER_SIGMA)
{
/*
If the final letter of a word (defined as "a collection of code
points with the General Category 'Letter'") is a GREEK CAPITAL
LETTER SIGMA and more than one code point was processed, the
lowercase version is U+03C2 GREEK SMALL LETTER FINAL SIGMA
instead of U+03C3 GREEK SMALL LETTER SIGMA.
*/
/* At least one code point should have been read */
uint8_t should_convert = state->total_bytes_needed > 0;
if (state->src_size > 0)
{
unicode_t peeked = 0;
const char* peeked_src = state->src;
size_t peeked_src_size = state->src_size;
while (1)
{
uint8_t peeked_read = 0;
/* Peek next code point */
if ((peeked_read = codepoint_read(peeked_src, peeked_src_size, &peeked)) == 0 ||
peeked_src_size < peeked_read)
{
should_convert = 1;
break;
}
/* Convert if the "word" has ended */
if (PROPERTY_GET_CCC(peeked) == CCC_NOT_REORDERED)
{
should_convert = (PROPERTY_GET_GC(peeked) & UTF8_CATEGORY_LETTER) == 0;
break;
}
peeked_src += peeked_read;
peeked_src_size -= peeked_read;
}
}
/* Write the converted code point to the output buffer */
bytes_needed = 2;
if (state->dst != 0)
{
if (state->dst_size < bytes_needed)
{
goto outofspace;
}
memcpy(state->dst, should_convert ? "\xCF\x82" : "\xCF\x83", bytes_needed);
state->dst += bytes_needed;
state->dst_size -= bytes_needed;
}
return bytes_needed;
}
/* Check if the code point is case mapped */
qc_casemapped = PROPERTY_GET_CM(state->last_code_point);
if ((qc_casemapped & state->quickcheck_flags) != 0)
{
/* Attempt to resolve the case mapping */
resolved = database_querydecomposition(state->last_code_point, state->property_index1, state->property_index2, state->property_data, &bytes_needed);
if (resolved != 0)
{
/* Code point properties */
state->last_general_category = UTF8_CATEGORY_LETTER;
/* The source cursor was already advanced above, so skip that step */
goto writeresolvedonly;
}
}
/* Write code point unchanged to output */
bytes_needed = codepoint_write(state->last_code_point, &state->dst, &state->dst_size);
if (bytes_needed == 0)
{
goto outofspace;
}
}
return bytes_needed;
/* Advance the source cursor, then write the already resolved byte string */
writeresolved:
/* Move source cursor */
if (state->src_size >= state->last_code_point_size)
{
state->src += state->last_code_point_size;
state->src_size -= state->last_code_point_size;
}
else
{
state->src_size = 0;
}
writeresolvedonly:
/* Write resolved string to output */
if (state->dst != 0)
{
if (state->dst_size < bytes_needed)
{
goto outofspace;
}
memcpy(state->dst, resolved, bytes_needed);
state->dst += bytes_needed;
state->dst_size -= bytes_needed;
}
return bytes_needed;
/* Write the sequence collected in `stream` and continue after the input it consumed */
writestream:
/* Get code point properties */
state->last_code_point = stream.codepoint[stream.current - 1];
state->last_canonical_combining_class = stream.canonical_combining_class[stream.current - 1];
state->last_general_category = PROPERTY_GET_GC(stream.codepoint[0]);
/* Move source cursor */
state->src = stream.src;
state->src_size = stream.src_size;
/* Write result to the output buffer */
if (!stream_write(&stream, &state->dst, &state->dst_size, &bytes_needed))
{
goto outofspace;
}
return bytes_needed;
/* Error exits: report the error and mark the input as exhausted */
invaliddata:
UTF8_SET_ERROR(INVALID_DATA);
state->src_size = 0;
return 0;
outofspace:
UTF8_SET_ERROR(NOT_ENOUGH_SPACE);
state->src_size = 0;
return 0;
}

View File

@ -0,0 +1,67 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_CASEMAPPING_H_
#define _UTF8REWIND_INTERNAL_CASEMAPPING_H_
/*!
\file
\brief Case mapping interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/* State shared between casemapping_initialize() and casemapping_execute(). */
typedef struct {
const char* src; /* Read cursor into the input, advanced as code points are consumed */
char* dst; /* Write cursor into the output; writes are skipped where the code checks it for null */
size_t src_size; /* Bytes of input left to read; set to 0 when an error occurs */
size_t dst_size; /* Bytes of output space left */
size_t total_bytes_needed; /* Read by the Greek final sigma rule; presumably accumulated by the caller - confirm */
unicode_t last_code_point; /* Code point most recently read or produced */
size_t locale; /* Locale identifier selecting locale-specific rules */
const uint32_t* property_index1; /* First index table passed to the case mapping lookup */
const uint32_t* property_index2; /* Second index table passed to the case mapping lookup */
const uint32_t* property_data; /* Case mapping data; also compared against LowercaseDataPtr to pick the direction */
uint32_t last_general_category; /* General category recorded for the last code point */
uint8_t last_code_point_size; /* Size in bytes of the last code point read from the input */
uint8_t last_canonical_combining_class; /* Canonical combining class recorded for the last code point */
uint8_t quickcheck_flags; /* Mask tested against a code point's case mapping property */
} CaseMappingState;
/*
Zeroes `state` and fills it with the given input and output buffers,
property tables, quick check flags and locale.

Returns 1 on success. Returns 0 and reports INVALID_LOCALE through `errors`
(when not null) if `locale` is not below UTF8_LOCALE_MAXIMUM.
*/
uint8_t casemapping_initialize(
CaseMappingState* state,
const char* input, size_t inputSize,
char* target, size_t targetSize,
const uint32_t* propertyIndex1, const uint32_t* propertyIndex2, const uint32_t* propertyData,
uint8_t quickCheck, size_t locale,
int32_t* errors);
/*
Case maps the next code point (or sequence) from the state's input and
returns the number of bytes the result occupies. Returns 0 and reports the
error through `errors` (when not null) on invalid data or insufficient
output space. A non-zero `no_replacement` copies undecodable input bytes
to the output unchanged instead of replacing them.
*/
size_t casemapping_execute(CaseMappingState* state, int32_t* errors, int no_replacement);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_CASEMAPPING_H_ */

View File

@ -0,0 +1,272 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "codepoint.h"
/*
Sequence length lookup, indexed by the first byte of an encoded sequence.
1      single byte (0x00 - 0x7F)
0      continuation byte, cannot start a sequence (0x80 - 0xBF)
2 - 6  lead byte of a multi-byte sequence (0xC0 - 0xFD)
7      bytes that are never valid (0xFE - 0xFF)
The table only classifies the lead byte; codepoint_read() treats 0 and 7 as
"not a sequence starter" and rejects overlong or out-of-range results itself.
*/
const uint8_t codepoint_decoded_length[256] = {
/* Basic Latin */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x07 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x08 - 0x0F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x17 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x18 - 0x1F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x27 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x28 - 0x2F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x37 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x38 - 0x3F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x47 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x48 - 0x4F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x57 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x58 - 0x5F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x67 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x68 - 0x6F */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x77 */
1, 1, 1, 1, 1, 1, 1, 1, /* 0x78 - 0x7F */
/* Malformed continuation byte */
0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x87 */
0, 0, 0, 0, 0, 0, 0, 0, /* 0x88 - 0x8F */
0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x97 */
0, 0, 0, 0, 0, 0, 0, 0, /* 0x98 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */
0, 0, 0, 0, 0, 0, 0, 0, /* 0xA8 - 0xAF */
0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xB7 */
0, 0, 0, 0, 0, 0, 0, 0, /* 0xB8 - 0xBF */
/* Two bytes */
2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xC7 */
2, 2, 2, 2, 2, 2, 2, 2, /* 0xC8 - 0xCF */
2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xD7 */
2, 2, 2, 2, 2, 2, 2, 2, /* 0xD8 - 0xDF */
/* Three bytes */
3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xE7 */
3, 3, 3, 3, 3, 3, 3, 3, /* 0xE8 - 0xEF */
/* Four bytes */
4, 4, 4, 4, 4, 4, 4, 4, /* 0xF0 - 0xF7 */
/* Five bytes */
5, 5, 5, 5, /* 0xF8 - 0xFB */
/* Six bytes */
6, 6, /* 0xFC - 0xFD */
/* Invalid */
7, 7 /* 0xFE - 0xFF */
};
/*
	Encode a single code point as UTF-8.

	encoded     Code point to write; values above MAX_LEGAL_UNICODE are
	            written as REPLACEMENT_CHARACTER instead.
	target      Address of the write cursor. When the cursor itself is 0
	            nothing is written and only the required length is returned.
	targetSize  Bytes left behind the cursor.

	Returns the number of bytes the code point occupies, or 0 when a cursor
	was supplied but the remaining space is too small. On a successful write
	the cursor is advanced and targetSize is reduced by that amount.
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize)
{
	/* Marker bits of the first byte, indexed by sequence length */
	static const uint8_t LeadMarker[5] = {
		0x00, 0x00, 0xC0, 0xE0, 0xF0
	};

	uint8_t encoded_length;
	uint8_t position;
	char* out;

	/* Out-of-range values are substituted before measuring */

	if (encoded > MAX_LEGAL_UNICODE)
	{
		encoded = REPLACEMENT_CHARACTER;
		encoded_length = REPLACEMENT_CHARACTER_STRING_LENGTH;
	}
	else if (encoded <= MAX_BASIC_LATIN)
	{
		encoded_length = 1;
	}
	else if (encoded <= 0x7FF)
	{
		encoded_length = 2;
	}
	else if (encoded <= MAX_BASIC_MULTILINGUAL_PLANE)
	{
		encoded_length = 3;
	}
	else
	{
		encoded_length = 4;
	}

	/* Measuring only */

	if (*target == 0)
	{
		return encoded_length;
	}

	/* Refuse partial writes */

	if (*targetSize < encoded_length)
	{
		return 0;
	}

	out = *target;

	/* First byte carries the marker and the topmost payload bits */

	out[0] = (char)(LeadMarker[encoded_length] | (encoded >> (6 * (encoded_length - 1))));

	/* Every following byte carries six payload bits, most significant first */

	for (position = 1; position < encoded_length; ++position)
	{
		out[position] = (char)(0x80 | ((encoded >> (6 * (encoded_length - 1 - position))) & 0x3F));
	}

	*target += encoded_length;
	*targetSize -= encoded_length;

	return encoded_length;
}
/*
	Decode the code point at the start of a UTF-8 encoded buffer.

	input      Buffer to read from.
	inputSize  Number of readable bytes in the buffer.
	decoded    Receives the code point. REPLACEMENT_CHARACTER is stored for
	           stray continuation bytes, invalid lead bytes, truncated
	           sequences, overlong encodings, surrogates and values above
	           MAX_LEGAL_UNICODE.

	Returns the number of bytes consumed, or 0 when input is 0 or inputSize
	is 0 (decoded is left untouched in that case). The return value never
	exceeds inputSize and no byte beyond input[inputSize - 1] is read.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded)
{
	const uint8_t* src = (const uint8_t*)input;

	if (input == 0 ||
		inputSize == 0)
	{
		/* Invalid data */

		return 0;
	}

	if (*src <= MAX_BASIC_LATIN)
	{
		/* Basic Latin */

		*decoded = (unicode_t)*src;

		return 1;
	}
	else
	{
		/* Multi-byte sequence */

		static const uint8_t SequenceMask[7] = {
			0x00, 0x7F, 0x1F, 0x0F,
			0x07, 0x03, 0x01
		};
		static const unicode_t SequenceMinimum[7] = {
			0x0000, 0x0000, 0x0080, 0x0800,
			0x10000, MAX_LEGAL_UNICODE, MAX_LEGAL_UNICODE
		};

		/*
			Bytes available after the lead byte. The lead byte has already
			been consumed, so it must not be counted here; otherwise the
			"not enough data" checks below trigger one byte too late and a
			truncated sequence reads past the end of the buffer.
		*/

		size_t src_size = inputSize - 1;
		uint8_t src_index;

		/* Length of sequence is determined by first byte */

		uint8_t decoded_length = codepoint_decoded_length[*src];
		if (decoded_length < 1 ||
			decoded_length > 6)
		{
			/* Not a multi-byte sequence starter */

			*decoded = REPLACEMENT_CHARACTER;
			decoded_length = 1;
		}
		else if (decoded_length > 4)
		{
			/* Always an overlong sequence */

			*decoded = REPLACEMENT_CHARACTER;

			/* All bytes in the sequence must be processed */

			for (src_index = 1; src_index < decoded_length; ++src_index)
			{
				src++;

				/* Check if next byte is valid */

				if (src_size == 0 ||               /* Not enough data */
					(*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
				{
					return src_index;
				}

				src_size--;
			}
		}
		else
		{
			/* Use mask to strip value from first byte */

			*decoded = (unicode_t)(*src & SequenceMask[decoded_length]);

			/* All bytes in the sequence must be processed */

			for (src_index = 1; src_index < decoded_length; ++src_index)
			{
				src++;

				/* Check if next byte is valid */

				if (src_size == 0 ||               /* Not enough data */
					(*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
				{
					*decoded = REPLACEMENT_CHARACTER;

					return src_index;
				}

				src_size--;

				/* Add value of continuation byte to codepoint */

				*decoded = (*decoded << 6) | (*src & 0x3F);
			}

			/* Check for overlong sequences and surrogate pairs */

			if (*decoded < SequenceMinimum[decoded_length] ||
				*decoded > MAX_LEGAL_UNICODE ||
				(*decoded >= SURROGATE_HIGH_START && *decoded <= SURROGATE_LOW_END))
			{
				*decoded = REPLACEMENT_CHARACTER;
			}
		}

		return decoded_length;
	}
}

View File

@ -0,0 +1,291 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_CODEPOINT_H_
#define _UTF8REWIND_INTERNAL_CODEPOINT_H_
/*!
\file
\brief Codepoint interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*!
\addtogroup internal Internal functions and definitions
\{
*/
/*!
\def MAX_BASIC_LATIN
\brief The last codepoint part of Basic Latin (U+0000 - U+007F).
*/
#define MAX_BASIC_LATIN 0x007F
/*!
\def MAX_LATIN_1
\brief The last codepoint part of Latin-1 Supplement (U+0080 - U+00FF).
*/
#define MAX_LATIN_1 0x00FF
/*!
\def MAX_BASIC_MULTILINGUAL_PLANE
\brief The last legal codepoint in the Basic Multilingual Plane (BMP).
*/
#define MAX_BASIC_MULTILINGUAL_PLANE 0xFFFF
/*!
\def MAX_LEGAL_UNICODE
\brief The last legal codepoint in Unicode.
*/
#define MAX_LEGAL_UNICODE 0x10FFFF
/*!
\def REPLACEMENT_CHARACTER
\brief The codepoint used to replace illegal codepoints.
*/
#define REPLACEMENT_CHARACTER 0xFFFD
/*!
\def REPLACEMENT_CHARACTER_STRING
\brief The replacement character as a UTF-8 encoded string.
*/
#define REPLACEMENT_CHARACTER_STRING "\xEF\xBF\xBD"
/*!
\def REPLACEMENT_CHARACTER_STRING_LENGTH
\brief Length of the UTF-8 encoded string of the replacement character.
*/
#define REPLACEMENT_CHARACTER_STRING_LENGTH 3
/*!
\def SURROGATE_HIGH_START
\brief The minimum codepoint for the high member of a surrogate pair.
*/
#define SURROGATE_HIGH_START 0xD800
/*!
\def SURROGATE_HIGH_END
\brief The maximum codepoint for the high member of a surrogate pair.
*/
#define SURROGATE_HIGH_END 0xDBFF
/*!
\def SURROGATE_LOW_START
\brief The minimum codepoint for the low member of a surrogate pair.
*/
#define SURROGATE_LOW_START 0xDC00
/*!
\def SURROGATE_LOW_END
\brief The maximum codepoint for the low member of a surrogate pair.
*/
#define SURROGATE_LOW_END 0xDFFF
/*!
\def HANGUL_JAMO_FIRST
\brief The first codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_FIRST 0x1100
/*!
\def HANGUL_JAMO_LAST
\brief The last codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_LAST 0x11FF
/*!
\def HANGUL_L_FIRST
\brief The first codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_FIRST 0x1100
/*!
\def HANGUL_L_LAST
\brief The last codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_LAST 0x1112
/*!
\def HANGUL_L_COUNT
\brief The number of codepoints in the Hangul Jamo L section.
*/
#define HANGUL_L_COUNT 19
/*!
\def HANGUL_V_FIRST
\brief The first codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_FIRST 0x1161
/*!
\def HANGUL_V_LAST
\brief The last codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_LAST 0x1175
/*!
\def HANGUL_V_COUNT
\brief The number of codepoints in the Hangul Jamo V section.
*/
#define HANGUL_V_COUNT 21
/*!
\def HANGUL_T_FIRST
\brief The first codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_FIRST 0x11A7
/*!
\def HANGUL_T_LAST
\brief The last codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_LAST 0x11C2
/*!
\def HANGUL_T_COUNT
\brief The number of codepoints in the Hangul Jamo T section.
*/
#define HANGUL_T_COUNT 28
/*!
\def HANGUL_N_COUNT
\brief Number of codepoints part of the Hangul Jamo V and T sections.
*/
#define HANGUL_N_COUNT 588 /* VCount * TCount */
/*!
\def HANGUL_S_FIRST
\brief The first codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_FIRST 0xAC00
/*!
\def HANGUL_S_LAST
\brief The last codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_LAST 0xD7A3
/*!
\def HANGUL_S_COUNT
\brief The number of codepoints in the Hangul Syllables block.
*/
#define HANGUL_S_COUNT 11172 /* LCount * NCount */
/*
Named code points. Each macro is the Unicode scalar value of the character
its name spells out.
NOTE(review): presumably the characters that need special handling during
locale-sensitive case mapping - the users are not visible in this header.
*/
#define CP_LATIN_CAPITAL_LETTER_I 0x0049
#define CP_LATIN_CAPITAL_LETTER_J 0x004A
#define CP_LATIN_SMALL_LETTER_I 0x0069
#define CP_LATIN_SMALL_LETTER_J 0x006A
#define CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC
#define CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD
#define CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128
#define CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E
#define CP_LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F
#define CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130
#define CP_LATIN_SMALL_LETTER_DOTLESS_I 0x0131
#define CP_COMBINING_GRAVE_ACCENT 0x0300
#define CP_COMBINING_ACUTE_ACCENT 0x0301
#define CP_COMBINING_TILDE_ACCENT 0x0303
#define CP_COMBINING_DOT_ABOVE 0x0307
#define CP_COMBINING_GREEK_YPOGEGRAMMENI 0x0345
#define CP_COMBINING_GRAPHEME_JOINER 0x034F
#define CP_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
/*
Canonical combining class (CCC) values.
CCC_NOT_REORDERED (0) is the class the composition and decomposition code
tests for to recognise a starter; every other value belongs to a codepoint
that combines with a preceding starter.
*/
#define CCC_NOT_REORDERED 0
#define CCC_OVERLAY 1
#define CCC_NUKTA 7
#define CCC_KANA_VOICING 8
#define CCC_VIRAMA 9
#define CCC_FIXED_POSITION_START 10
#define CCC_FIXED_POSITION_END 199
#define CCC_ATTACHED_BELOW_LEFT 200
#define CCC_ATTACHED_BELOW 202
#define CCC_ATTACHED_BOTTOM_RIGHT 204
#define CCC_ATTACHED_LEFT 208
#define CCC_ATTACHED_RIGHT 210
#define CCC_ATTACHED_TOP_LEFT 212
#define CCC_ATTACHED_ABOVE 214
#define CCC_ATTACHED_ABOVE_RIGHT 216
#define CCC_BELOW_LEFT 218
#define CCC_BELOW 220
#define CCC_BELOW_RIGHT 222
#define CCC_LEFT 224
#define CCC_RIGHT 226
#define CCC_ABOVE_LEFT 228
#define CCC_ABOVE 230
#define CCC_ABOVE_RIGHT 232
#define CCC_DOUBLE_BELOW 233
#define CCC_DOUBLE_ABOVE 234
#define CCC_IOTA_SUBSCRIPT 240
#define CCC_INVALID 255
/*!
\brief Get the number of bytes used for encoding a code point.
\param[in] byte Encoded byte
\return Number of bytes in the sequence started by the byte: 0 for a stray continuation byte (0x80 - 0xBF) and 7 for the invalid bytes 0xFE and 0xFF.
*/
extern const uint8_t codepoint_decoded_length[256];
/*!
\brief Write Unicode code point to UTF-8 encoded string.
Target buffer and size is modified by encoded size.
\param[in] encoded Unicode code point
\param[in,out] target Target buffer
\param[in,out] targetSize Size of output buffer in bytes
\return Bytes needed for encoding or 0 on error.
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize);
/*!
\brief Read Unicode code point from UTF-8 encoded string.
\param[in] input Input buffer
\param[in] inputSize Size of input buffer in bytes
\param[out] decoded Unicode codepoint
\return Bytes read from string or 0 on error.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded);
/*!
\}
*/
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_CODEPOINT_H_ */

View File

@ -0,0 +1,336 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "composition.h"
#include "codepoint.h"
#include "database.h"
/*
	Prepare a composition state.

	The state is always cleared. When either stream is missing the function
	stops there and returns 0. Otherwise both streams are attached, the
	output stream is cleared and the quick check tables are selected:
	NFKC when compatibility equals 1, NFC for every other value.

	Returns 1 on success, 0 when a stream pointer is 0.
*/
uint8_t compose_initialize(ComposeState* state, StreamState* input, StreamState* output, uint8_t compatibility)
{
	const uint8_t use_nfkc = (compatibility == 1);

	memset(state, 0, sizeof(ComposeState));

	/* Both streams are mandatory */

	if (input == 0)
	{
		return 0;
	}

	if (output == 0)
	{
		return 0;
	}

	/* Attach streams, the output always starts empty */

	state->input = input;
	state->output = output;
	memset(output, 0, sizeof(StreamState));

	/* Select the quick check tables for the requested form */

	state->qc_index = use_nfkc ? QuickCheckNFKCIndexPtr : QuickCheckNFCIndexPtr;
	state->qc_data = use_nfkc ? QuickCheckNFKCDataPtr : QuickCheckNFCDataPtr;

	return 1;
}
/*
	Copy the next codepoint of the input stream into slot `index` of the
	output stream, together with its quick check value and canonical
	combining class.

	When every buffered input codepoint has been consumed, the input stream
	is refilled with stream_read() first.

	Returns 1 after copying a codepoint (the input cursor and the output
	length are both incremented), 0 when the refill reports no more data.
*/
uint8_t compose_readcodepoint(ComposeState* state, uint8_t index)
{
	StreamState* source = state->input;
	StreamState* sink = state->output;

	/* Refill only once the buffered sequence is used up */

	if (source->index == source->current)
	{
		if (!stream_read(source, state->qc_index, state->qc_data))
		{
			/* End of data */

			return 0;
		}
	}

	/* Transfer codepoint and its properties */

	sink->codepoint[index] = source->codepoint[source->index];
	sink->quick_check[index] = source->quick_check[source->index];
	sink->canonical_combining_class[index] = source->canonical_combining_class[source->index];

	source->index++;
	sink->current++;

	return 1;
}
/*
Compose the next run of codepoints from the input stream into the output
stream.
The output length is reset, the first codepoint is read, and then pairs of
(current, next) codepoints are combined where allowed: Hangul jamo and
syllables arithmetically, everything else through
database_querycomposition(). A codepoint that was merged into its
predecessor is overwritten with 0 and the resulting holes are compacted
afterwards.
Returns 0 when the state has no input left (state->input is 0) or when no
codepoint could be read, 1 otherwise. Once a pass ends with at most one
codepoint in the output, state->input is set to 0 so that the next call
returns 0.
*/
uint8_t compose_execute(ComposeState* state)
{
uint8_t output_index;
uint8_t cursor_current;
uint8_t cursor_next;
/* Check if input is available */
if (state->input == 0)
{
return 0;
}
/* Reset output */
state->output->current = 0;
/* Read first codepoint */
if (!compose_readcodepoint(state, 0))
{
return 0;
}
for (output_index = 0; output_index < state->output->current; ++output_index)
{
/* Ensure current codepoint is a starter */
cursor_current = output_index;
while (state->output->canonical_combining_class[cursor_current] != CCC_NOT_REORDERED)
{
cursor_current++;
if (cursor_current == state->output->current &&
!compose_readcodepoint(state, cursor_current))
{
/* Only non-starters left */
return 1;
}
}
/* Get next codepoint */
cursor_next = cursor_current + 1;
while (
cursor_next < state->output->current ||
compose_readcodepoint(state, cursor_next))
{
/*
Two codepoints can be composed if the current codepoint is a starter
and the next codepoint isn't blocked by a previous codepoint.
*/
if (state->output->canonical_combining_class[cursor_next] > state->output->canonical_combining_class[cursor_next - 1] || /* Can be composed based on CCC */
/* Quick check value can override composition block by previous codepoint */
(state->output->quick_check[cursor_next] != QuickCheckResult_Yes && state->output->canonical_combining_class[cursor_next - 1] == CCC_NOT_REORDERED))
{
unicode_t composed = 0;
/*
Hangul composition
Algorithm adapted from Unicode Technical Report #15:
http://www.unicode.org/reports/tr15/tr15-18.html#Hangul
*/
if (state->output->codepoint[cursor_current] >= HANGUL_L_FIRST &&
state->output->codepoint[cursor_current] <= HANGUL_L_LAST)
{
/* Check for Hangul LV pair */
if (state->output->codepoint[cursor_next] >= HANGUL_V_FIRST &&
state->output->codepoint[cursor_next] <= HANGUL_V_LAST)
{
unicode_t l_index = state->output->codepoint[cursor_current] - HANGUL_L_FIRST;
unicode_t v_index = state->output->codepoint[cursor_next] - HANGUL_V_FIRST;
composed = HANGUL_S_FIRST + (((l_index * HANGUL_V_COUNT) + v_index) * HANGUL_T_COUNT);
}
}
else if (
state->output->codepoint[cursor_current] >= HANGUL_S_FIRST &&
state->output->codepoint[cursor_current] <= HANGUL_S_LAST)
{
/*
NOTE(review): this branch accepts any syllable in the block, including
one that already carries a trailing consonant, and it accepts
HANGUL_T_FIRST itself (t_index 0, which yields the unchanged syllable).
The Hangul composition rules in UAX #15 appear to require an LV
syllable and a non-zero t_index - confirm whether earlier stages rule
these inputs out.
*/
/* Check for Hangul LV and T pair */
if (state->output->codepoint[cursor_next] >= HANGUL_T_FIRST &&
state->output->codepoint[cursor_next] <= HANGUL_T_LAST)
{
unicode_t t_index = state->output->codepoint[cursor_next] - HANGUL_T_FIRST;
composed = state->output->codepoint[cursor_current] + t_index;
}
}
else
{
/* Attempt to compose codepoints using the database */
composed = database_querycomposition(
state->output->codepoint[cursor_current],
state->output->codepoint[cursor_next]);
}
/* Check if composition succeeded */
if (composed != 0)
{
/*
When we successfully compose two codepoints, the second must be removed
from the sequence. The way this is accomplished is by marking the cell
empty with a NUL codepoint.
Decomposed:
codepoint U+0044 U+0307 U+0031
index 0 1 2
Composed:
codepoint U+1E0A U+0000 U+0031
index 0 1 2
If the second codepoint was at the end of the sequence, the output
sequence is shortened by one.
*/
/* Add composition to output */
state->output->codepoint[cursor_current] = composed;
state->output->quick_check[cursor_current] = PROPERTY_GET(state->qc_index, state->qc_data, composed);
state->output->canonical_combining_class[cursor_current] = PROPERTY_GET_CCC(composed);
/* Clear next codepoint from output */
state->output->codepoint[cursor_next] = 0;
state->output->quick_check[cursor_next] = QuickCheckResult_Yes;
state->output->canonical_combining_class[cursor_next] = CCC_NOT_REORDERED;
if (cursor_next == state->output->current - 1)
{
/* Next codepoint was at end of output */
state->output->current--;
}
/* Reset cursor to current output index */
/* cursor_next is incremented at the bottom of the loop, so scanning resumes at output_index + 1 */
cursor_current = output_index;
cursor_next = output_index;
}
}
else if (
state->output->canonical_combining_class[cursor_next] == CCC_NOT_REORDERED)
{
/* Attempt to compose starters, but do not read from the next sequence */
break;
}
/* Evaluate next codepoint */
cursor_next++;
}
/* Fill up "holes" left by composing codepoints not at the end of the sequence */
if (state->output->current > 1)
{
uint8_t write_index = 0;
uint8_t read_index = 1;
/*
We want to move valid codepoints to the left as much as possible in order to fill up
holes left by the composition process.
Note that the process does not clear unused codepoints at the end, this is a small
optimization in order to avoid unnecessary clears. The length member is adjusted to
the new size.
Before reordering:
codepoint A B 0 0 0 D
index 0 1 2 3 4 5
length 6
After reordering:
codepoint A B D 0 0 D
index 0 1 2 3 4 5
length 3
*/
/* Evaluate all codepoints in output sequence */
while (write_index < state->output->current)
{
/* Check if read cursor is on an empty cell */
if (read_index < state->output->current &&
state->output->codepoint[read_index] == 0)
{
/* Skip all empty cells */
while (
read_index < state->output->current &&
state->output->codepoint[read_index] == 0)
{
read_index++;
}
if (read_index == state->output->current)
{
/* Reached end of data */
break;
}
/* Copy cell at read cursor to write cursor */
state->output->codepoint[write_index] = state->output->codepoint[read_index];
state->output->quick_check[write_index] = state->output->quick_check[read_index];
state->output->canonical_combining_class[write_index] = state->output->canonical_combining_class[read_index];
}
/* Move cursors */
write_index++;
read_index++;
}
/* Adjust length of output sequence */
state->output->current = write_index;
}
else
{
/* Evaluated all sequences in output */
/* Clearing the input makes the next call return 0 */
state->input = 0;
break;
}
}
return 1;
}

View File

@ -0,0 +1,54 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_COMPOSITION_H_
#define _UTF8REWIND_INTERNAL_COMPOSITION_H_
/*!
\file
\brief Composition interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "streaming.h"
/*
State of a composition run.
Filled in by compose_initialize() and advanced by compose_execute().
*/
typedef struct {
StreamState* input; /* Stream codepoints are read from; set to 0 by compose_execute() once it is done */
StreamState* output; /* Stream the composed codepoints are written to */
const size_t* qc_index; /* Index table of the selected quick check property (NFC or NFKC) */
const uint8_t* qc_data; /* Data table of the selected quick check property (NFC or NFKC) */
} ComposeState;
/* Clears the state, attaches the streams and selects NFKC (compatibility == 1) or NFC tables; returns 0 if a stream is 0, otherwise 1. */
uint8_t compose_initialize(ComposeState* state, StreamState* input, StreamState* output, uint8_t compatibility);
/* Copies the next input codepoint into output slot `index`; returns 0 when no more data can be read, otherwise 1. */
uint8_t compose_readcodepoint(ComposeState* state, uint8_t index);
/* Composes the next run of codepoints into the output stream; returns 0 when there is nothing left to process, otherwise 1. */
uint8_t compose_execute(ComposeState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_COMPOSITION_H_ */

View File

@ -0,0 +1,113 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "database.h"
#include "../unicodedatabase.h"
#include "codepoint.h"
/*
Bit layout used by database_querydecomposition() to split a codepoint into
three lookup stages:
bits 12 and up  select the entry in the first index array
bits 5 - 11     select the entry in the second index array
bits 0 - 4      select the entry in the data array
*/
#define DECOMPOSE_INDEX1_SHIFT (12)
#define DECOMPOSE_INDEX2_SHIFT (5)
/* NOTE(review): DECOMPOSE_INDEX1_MASK is not referenced by the functions in this file */
static const unicode_t DECOMPOSE_INDEX1_MASK = MAX_LEGAL_UNICODE;
/* Keeps the low 12 bits of a codepoint */
static const unicode_t DECOMPOSE_INDEX2_MASK = (1 << DECOMPOSE_INDEX1_SHIFT) - 1;
/* Keeps the low 5 bits of a codepoint */
static const unicode_t DECOMPOSE_DATA_MASK = (1 << DECOMPOSE_INDEX2_SHIFT) - 1;
/*
	Look up the replacement string stored for a codepoint in a three-stage
	table (two index arrays and a data array).

	codepoint    Codepoint to look up.
	index1Array  First stage, indexed by the codepoint bits above
	             DECOMPOSE_INDEX1_SHIFT.
	index2Array  Second stage, indexed by the first stage result plus the
	             middle bits of the codepoint.
	dataArray    Final stage; each entry packs the string length in the top
	             8 bits and an offset into CompressedStringData in the lower
	             24 bits.
	length       Receives the length of the string in bytes, 0 if none.

	Returns a pointer into CompressedStringData, or 0 when the final index
	or the data entry is 0.
*/
const char* database_querydecomposition(unicode_t codepoint, const uint32_t* index1Array, const uint32_t* index2Array, const uint32_t* dataArray, uint8_t* length)
{
	const uint32_t stage1 = index1Array[codepoint >> DECOMPOSE_INDEX1_SHIFT];
	const uint32_t stage2 = index2Array[stage1 + ((codepoint & DECOMPOSE_INDEX2_MASK) >> DECOMPOSE_INDEX2_SHIFT)];
	const uint32_t slot = stage2 + (codepoint & DECOMPOSE_DATA_MASK);
	uint32_t packed = 0;

	/* Slot 0 means "no entry" and is never dereferenced */

	if (slot != 0)
	{
		packed = dataArray[slot];
	}

	if (packed == 0)
	{
		*length = 0;

		return 0;
	}

	/* Top byte is the length, the remainder is the string offset */

	*length = (uint8_t)(packed >> 24);

	return CompressedStringData + (packed & 0x00FFFFFF);
}
/*
	Find the codepoint that the pair (left, right) composes to.

	The pair is packed into a 64-bit key (left in the upper half, right in
	the lower half) and searched for in UnicodeCompositionRecordPtr, which
	is probed as a table sorted by key: a bisection narrows the range until
	at most 33 records remain, which are then scanned linearly.

	Returns the composed codepoint, or 0 when the pair has no record.
*/
unicode_t database_querycomposition(unicode_t left, unicode_t right)
{
	const uint64_t wanted = ((uint64_t)left << 32) + (uint64_t)right;
	size_t low = 0;
	size_t high = UnicodeCompositionRecordCount - 1;
	size_t probe;

	/* Keys outside the table range cannot match */

	if (wanted < UnicodeCompositionRecordPtr[low].key)
	{
		return 0;
	}

	if (wanted > UnicodeCompositionRecordPtr[high].key)
	{
		return 0;
	}

	/* Bisect, testing both ends and the middle on every round */

	do
	{
		size_t middle = low + ((high - low) / 2);

		if (wanted == UnicodeCompositionRecordPtr[low].key)
		{
			return UnicodeCompositionRecordPtr[low].value;
		}

		if (wanted == UnicodeCompositionRecordPtr[high].key)
		{
			return UnicodeCompositionRecordPtr[high].value;
		}

		if (wanted == UnicodeCompositionRecordPtr[middle].key)
		{
			return UnicodeCompositionRecordPtr[middle].value;
		}

		if (wanted > UnicodeCompositionRecordPtr[middle].key)
		{
			low = middle;
		}
		else
		{
			high = middle;
		}
	}
	while (high - low > 32);

	/* Remaining window is small enough for a linear scan */

	for (probe = low; probe <= high; ++probe)
	{
		if (wanted == UnicodeCompositionRecordPtr[probe].key)
		{
			return UnicodeCompositionRecordPtr[probe].value;
		}
	}

	return 0;
}

View File

@ -0,0 +1,91 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_DATABASE_H_
#define _UTF8REWIND_INTERNAL_DATABASE_H_
/*!
\file
\brief Database interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "../unicodedatabase.h"
/* Bit flags; the values are distinct powers of two so they can be combined in one byte. */
enum QuickCheckCaseMapped
{
QuickCheckCaseMapped_Uppercase = 0x01,
QuickCheckCaseMapped_Lowercase = 0x02,
QuickCheckCaseMapped_Titlecase = 0x04,
QuickCheckCaseMapped_Casefolded = 0x08,
};
/* Quick check outcome stored per codepoint (Yes = 0, Maybe = 1, No = 2). */
enum QuickCheckResult
{
QuickCheckResult_Yes,
QuickCheckResult_Maybe,
QuickCheckResult_No,
};
/* Codepoints are grouped in blocks of 32 (1 << 5) for property lookups. */
#define PROPERTY_INDEX_SHIFT (5)
/* Keeps the position of a codepoint inside its block (low 5 bits). */
static const unicode_t PROPERTY_DATA_MASK = (1 << PROPERTY_INDEX_SHIFT) - 1;
/*
Two-stage property lookup: the index array gives the start of the block for
the codepoint, the low bits select the entry inside the data array.
The _cp argument is evaluated twice, so it must not have side effects.
*/
#define PROPERTY_GET(_indexArray, _dataArray, _cp) \
(_dataArray)[ \
(_indexArray)[(_cp) >> PROPERTY_INDEX_SHIFT] + \
((_cp) & PROPERTY_DATA_MASK)]
/* Convenience wrappers binding PROPERTY_GET to one specific pair of tables. */
#define PROPERTY_GET_GC(_cp) \
PROPERTY_GET(GeneralCategoryIndexPtr, GeneralCategoryDataPtr, _cp)
#define PROPERTY_GET_CCC(_cp) \
PROPERTY_GET(CanonicalCombiningClassIndexPtr, CanonicalCombiningClassDataPtr, _cp)
#define PROPERTY_GET_CM(_cp) \
PROPERTY_GET(QuickCheckCaseMappedIndexPtr, QuickCheckCaseMappedDataPtr, _cp)
#define PROPERTY_GET_NFC(_cp) \
PROPERTY_GET(QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr, _cp)
#define PROPERTY_GET_NFD(_cp) \
PROPERTY_GET(QuickCheckNFDIndexPtr, QuickCheckNFDDataPtr, _cp)
#define PROPERTY_GET_NFKC(_cp) \
PROPERTY_GET(QuickCheckNFKCIndexPtr, QuickCheckNFKCDataPtr, _cp)
#define PROPERTY_GET_NFKD(_cp) \
PROPERTY_GET(QuickCheckNFKDIndexPtr, QuickCheckNFKDDataPtr, _cp)
/* Returns the encoded replacement string for a codepoint and stores its byte length in *length; returns 0 with *length == 0 when there is none. */
const char* database_querydecomposition(unicode_t codepoint, const uint32_t* index1Array, const uint32_t* index2Array, const uint32_t* dataArray, uint8_t* length);
/* Returns the codepoint the pair (left, right) composes to, or 0 when the pair has no composition record. */
unicode_t database_querycomposition(unicode_t left, unicode_t right);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_DATABASE_H_ */

View File

@ -0,0 +1,339 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "decomposition.h"
#include "codepoint.h"
#include "database.h"
/*
	Prepare a decomposition state.

	The state is always cleared. When either stream is missing the function
	stops there and returns 0. Otherwise both streams are attached, the
	output stream is cleared and the lookup tables are selected: NFKD when
	compatibility equals 1, NFD for every other value.

	Returns 1 on success, 0 when a stream pointer is 0.
*/
uint8_t decompose_initialize(
	DecomposeState* state,
	StreamState* input, StreamState* output,
	uint8_t compatibility)
{
	const uint8_t use_nfkd = (compatibility == 1);

	memset(state, 0, sizeof(DecomposeState));

	/* Both streams are mandatory */

	if (input == 0)
	{
		return 0;
	}

	if (output == 0)
	{
		return 0;
	}

	/* Attach streams, the output always starts empty */

	state->input = input;
	state->output = output;
	memset(output, 0, sizeof(StreamState));

	/* Decomposition tables */

	state->property_index1 = use_nfkd ? NFKDIndex1Ptr : NFDIndex1Ptr;
	state->property_index2 = use_nfkd ? NFKDIndex2Ptr : NFDIndex2Ptr;
	state->property_data = use_nfkd ? NFKDDataPtr : NFDDataPtr;

	/* Quick check tables */

	state->qc_index = use_nfkd ? QuickCheckNFKDIndexPtr : QuickCheckNFDIndexPtr;
	state->qc_data = use_nfkd ? QuickCheckNFKDDataPtr : QuickCheckNFDDataPtr;

	return 1;
}
/*
Decompose the next sequence of codepoints into the output stream.
Codepoints left in the cache by a previous call are emitted first, up to
the next starter. After that the remaining buffered input codepoints are
processed: Basic Latin is copied as is, Hangul syllables are split
arithmetically and everything else goes through the quick check and, if
needed, database_querydecomposition(). Output that belongs to a following
sequence is parked in the cache for the next call.
output->stable is cleared when the canonical combining classes written to
the output are not in non-decreasing order.
Returns the number of codepoints written to the output stream, or 0 when
the state has no input (state->input is 0) or the input stream has no more
data; in the latter case state->input is set to 0.
*/
uint8_t decompose_execute(DecomposeState* state)
{
unicode_t* src_codepoint;
unicode_t* dst_codepoint;
uint8_t* dst_canonical_combining_class;
uint8_t* dst_quick_check;
/* 1 while results go straight to the output, 0 once they are diverted to the cache */
uint8_t uncached = 1;
/* Check if input is valid */
if (state->input == 0)
{
return 0;
}
/* Set up output */
state->output->current = 0;
state->output->index = 0;
state->output->stable = 1;
dst_codepoint = state->output->codepoint;
dst_canonical_combining_class = state->output->canonical_combining_class;
dst_quick_check = state->output->quick_check;
/* Check cache for stored sequences */
if (state->cache_current < state->cache_filled)
{
/* Read from cache */
while (state->cache_current < state->cache_filled)
{
if (state->output->current > 0 &&
state->cache_canonical_combining_class[state->cache_current] == CCC_NOT_REORDERED)
{
/* Sequence ends on next starter or end of data */
break;
}
*dst_codepoint++ = state->cache_codepoint[state->cache_current];
*dst_canonical_combining_class++ = state->cache_canonical_combining_class[state->cache_current];
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
state->cache_current++;
}
/* Check if cache has been emptied */
if (state->cache_current == state->cache_filled)
{
state->cache_current = 0;
state->cache_filled = 0;
}
/* Check for additional input */
if (state->input->index == state->input->current)
{
/* Don't compare canonical combining classes, output will always be stable */
return state->output->current;
}
}
/* Read next sequence from input */
if (state->input->index == state->input->current &&
!stream_read(state->input, state->qc_index, state->qc_data))
{
/* End of data */
state->input = 0;
return 0;
}
/* Read from source */
src_codepoint = state->input->codepoint + state->input->index;
while (state->input->index < state->input->current)
{
if (*src_codepoint <= MAX_BASIC_LATIN)
{
/* Basic Latin codepoints are already decomposed */
if (uncached)
{
*dst_codepoint++ = *src_codepoint;
*dst_canonical_combining_class++ = CCC_NOT_REORDERED;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
state->cache_codepoint[state->cache_filled] = *src_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
}
else if (
*src_codepoint >= HANGUL_S_FIRST &&
*src_codepoint <= HANGUL_S_LAST)
{
/*
Hangul decomposition
Algorithm adapted from Unicode Technical Report #15:
http://www.unicode.org/reports/tr15/tr15-18.html#Hangul
*/
unicode_t s_index = *src_codepoint - HANGUL_S_FIRST;
if (uncached)
{
*dst_codepoint++ = HANGUL_L_FIRST + (s_index / HANGUL_N_COUNT);
*dst_canonical_combining_class++ = CCC_NOT_REORDERED;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
state->cache_codepoint[state->cache_filled] = HANGUL_L_FIRST + (s_index / HANGUL_N_COUNT);
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
/* Store the remaining jamo, and every codepoint processed after them, in the cache */
uncached = 0;
state->cache_codepoint[state->cache_filled] = HANGUL_V_FIRST + (s_index % HANGUL_N_COUNT) / HANGUL_T_COUNT;
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
/* A remainder of 0 means the syllable has no trailing consonant */
if ((s_index % HANGUL_T_COUNT) != 0)
{
state->cache_codepoint[state->cache_filled] = HANGUL_T_FIRST + (s_index % HANGUL_T_COUNT);
state->cache_canonical_combining_class[state->cache_filled] = CCC_NOT_REORDERED;
state->cache_filled++;
}
}
else
{
/* Use quick check to skip stable codepoints */
unicode_t decoded_codepoint = *src_codepoint;
uint8_t decoded_quick_check = PROPERTY_GET(state->qc_index, state->qc_data, decoded_codepoint);
uint8_t decoded_canonical_combining_class;
uint8_t decoded_size;
if (decoded_quick_check != QuickCheckResult_Yes)
{
/* Check database for decomposition */
uint8_t src_size;
const char* src = database_querydecomposition(
decoded_codepoint,
state->property_index1, state->property_index2, state->property_data,
&src_size);
/* The decomposition is a UTF-8 string; decode and emit it one codepoint at a time */
while (src_size > 0)
{
/* Decode current codepoint */
decoded_size = codepoint_read(src, src_size, &decoded_codepoint);
if (decoded_size == 0)
{
break;
}
decoded_canonical_combining_class = PROPERTY_GET_CCC(decoded_codepoint);
/* Check for end of sequence */
if (uncached &&
state->output->current > 0 &&
decoded_canonical_combining_class == CCC_NOT_REORDERED)
{
uncached = 0;
}
if (uncached)
{
/* Write codepoint to output */
*dst_codepoint++ = decoded_codepoint;
*dst_canonical_combining_class++ = decoded_canonical_combining_class;
*dst_quick_check++ = QuickCheckResult_Yes;
state->output->current++;
}
else
{
/* Store in cache */
state->cache_codepoint[state->cache_filled] = decoded_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = decoded_canonical_combining_class;
state->cache_filled++;
}
src += decoded_size;
src_size -= decoded_size;
}
}
else
{
decoded_canonical_combining_class = PROPERTY_GET_CCC(decoded_codepoint);
if (uncached)
{
/* Write codepoint to output */
*dst_codepoint++ = decoded_codepoint;
*dst_canonical_combining_class++ = decoded_canonical_combining_class;
*dst_quick_check++ = decoded_quick_check;
state->output->current++;
}
else
{
/* Store in cache */
state->cache_codepoint[state->cache_filled] = decoded_codepoint;
state->cache_canonical_combining_class[state->cache_filled] = decoded_canonical_combining_class;
state->cache_filled++;
}
}
}
src_codepoint++;
state->input->index++;
}
if (state->output->current > 1)
{
/* Check if output is stable by comparing canonical combining classes */
uint8_t i;
for (i = 1; i < state->output->current; ++i)
{
if (state->output->canonical_combining_class[i] < state->output->canonical_combining_class[i - 1])
{
state->output->stable = 0;
break;
}
}
}
return state->output->current;
}

View File

@ -0,0 +1,59 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
#define _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
/*!
\file
\brief Decomposition interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
#include "streaming.h"
/*
Working state for one decomposition run (see decompose_execute).
*/
typedef struct {
/* Stream being consumed; decompose_execute advances input->index for every codepoint it processes */
StreamState* input;
/* Stream receiving results; output->current counts the codepoints written and output->stable is cleared when their combining classes are out of order */
StreamState* output;
/* Index/data pair handed to PROPERTY_GET to obtain a codepoint's quick-check value */
const size_t* qc_index;
const uint8_t* qc_data;
/* Lookup tables handed to database_querydecomposition() when the quick check is not "Yes" */
const uint32_t* property_index1;
const uint32_t* property_index2;
const uint32_t* property_data;
/* Decomposed codepoints that were not written directly to the output, with their canonical combining classes */
unicode_t cache_codepoint[STREAM_BUFFER_MAX];
uint8_t cache_canonical_combining_class[STREAM_BUFFER_MAX];
/* NOTE(review): presumably the read cursor into the cache; its use is not visible in this header - confirm in decomposition.c */
uint8_t cache_current;
/* Number of entries stored in the cache arrays; incremented after each store */
uint8_t cache_filled;
} DecomposeState;
/*
Prepares `state` for decomposing `input` into `output`.
NOTE(review): the definition is not part of this header; `compatibility`
presumably selects compatibility (NFKD) over canonical (NFD) tables and the
return value presumably reports success - confirm in decomposition.c.
*/
uint8_t decompose_initialize(DecomposeState* state, StreamState* input, StreamState* output, uint8_t compatibility);
/*
Runs a decomposition pass over state->input.
Returns state->output->current, the number of codepoints placed in the
output stream.
*/
uint8_t decompose_execute(DecomposeState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_DECOMPOSITION_H_ */

View File

@ -0,0 +1,187 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "seeking.h"
#include "codepoint.h"
const char* seeking_forward(const char* input, const char* inputEnd, size_t inputSize, off_t offset)
{
if (inputEnd <= input || /* Swapped parameters */
offset <= 0 || /* Invalid offset */
inputSize == 0) /* Nothing to do */
{
return input;
}
else if (
offset >= (off_t)inputSize) /* Out of bounds */
{
return inputEnd;
}
do
{
/* Get decoded length of next sequence */
uint8_t codepoint_length = codepoint_decoded_length[(uint8_t)*input];
if (codepoint_length > 1 &&
codepoint_length < 7)
{
/* Check all bytes of multi-byte sequence */
uint8_t i;
for (i = 0; i < codepoint_length; ++i)
{
/* Next byte of sequence */
input++;
if (input == inputEnd || /* End of data */
codepoint_decoded_length[(uint8_t)*input] != 0) /* Not a continuation byte */
{
break;
}
}
}
else
{
/* Skip to next sequence */
input++;
}
}
while (input < inputEnd &&
--offset > 0);
return input;
}
/*
Seek backwards through UTF-8 encoded text.
inputStart  First byte of the text.
input       Position to rewind from.
inputSize   Size of the text in bytes; only used for the out-of-bounds shortcut.
offset      Number of steps to take; must be negative.
Returns `input` unchanged when inputStart >= input or offset >= 0, and
`inputStart` when -offset >= inputSize. Otherwise returns the position
reached after stepping backwards.
NOTE(review): -offset is evaluated as a signed off_t, so the most negative
off_t value would overflow - confirm callers never pass it.
*/
const char* seeking_rewind(const char* inputStart, const char* input, size_t inputSize, off_t offset)
{
/* marker scans backwards for the byte that starts a sequence */
const char* marker;
/* marker_valid is the last byte covered by the sequence found at marker */
const char* marker_valid;
if (inputStart >= input || /* Swapped parameters */
offset >= 0) /* Invalid offset */
{
return input;
}
else if (
-offset >= (off_t)inputSize) /* Out of bounds */
{
return inputStart;
}
/* Set up the marker */
marker = input - 1;
marker_valid = marker;
do
{
/* Move the cursor */
input--;
/* Move the marker until we encounter a valid sequence */
/* Only scans when the cursor has caught up with marker_valid */
while (marker_valid == input)
{
uint8_t codepoint_length = codepoint_decoded_length[(uint8_t)*marker];
if (codepoint_length == 1 || /* Basic Latin */
codepoint_length == 7) /* Illegal byte */
{
/* Single-byte unit: it covers only itself */
marker_valid = marker;
break;
}
else if (
codepoint_length > 1)
{
/* Lead byte of a multi-byte sequence */
if (marker == inputStart &&
/* Not overlong */
marker_valid - inputStart == codepoint_length - 1)
{
/* Last sequence */
return marker;
}
else
{
/* Multi-byte sequence */
/* Covers the bytes from marker up to its announced length */
marker_valid = marker + codepoint_length - 1;
break;
}
}
else if (
marker <= inputStart)
{
/* Continuation bytes only */
marker_valid = marker;
break;
}
else
{
/* Move marker to next byte */
/* Continuation byte (table value 0): keep looking further back for the lead byte */
marker--;
}
}
/* Read the next part of a sequence */
/* The cursor lies inside the span covered by the sequence at marker */
if (input <= marker_valid)
{
if (marker == inputStart)
{
/* Last sequence */
return marker;
}
else
{
/* Move the cursor to the start of the sequence */
input = marker;
/* Reset the marker on the next byte */
marker--;
marker_valid = marker;
}
}
}
/* Each pass uses up one unit of the (negative) offset */
while (input >= inputStart &&
++offset < 0);
return input;
}

View File

@ -0,0 +1,44 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_SEEKING_H_
#define _UTF8REWIND_INTERNAL_SEEKING_H_
/*!
\file
\brief Seeking interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
Skips up to `offset` (positive) sequences forward from `input`, stopping at
`inputEnd`. Returns `input` unchanged for an empty or reversed range, a
non-positive offset or a zero `inputSize`; returns `inputEnd` when
offset >= inputSize.
*/
const char* seeking_forward(const char* input, const char* inputEnd, size_t inputSize, off_t offset);
/*
Steps backwards from `input` by `offset` (negative) towards `inputStart`.
Returns `input` unchanged when inputStart >= input or offset >= 0; returns
`inputStart` when -offset >= inputSize.
*/
const char* seeking_rewind(const char* inputStart, const char* input, size_t inputSize, off_t offset);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_SEEKING_H_ */

View File

@ -0,0 +1,236 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "streaming.h"
#include "codepoint.h"
#include "database.h"
/*
	Reset a stream and attach it to an input buffer.

	state      Stream to initialize; always cleared to zero first.
	input      Start of the input text.
	inputSize  Size of the input text in bytes.

	Returns 1 when the stream was attached, or 0 when `input` is null or
	`inputSize` is zero (the stream is then left fully zeroed).
*/
uint8_t stream_initialize(StreamState* state, const char* input, size_t inputSize)
{
	const uint8_t has_input = (input != 0 && inputSize != 0) ? 1 : 0;

	memset(state, 0, sizeof(*state));

	if (has_input)
	{
		state->src = input;
		state->src_size = inputSize;

		/* A fresh stream starts out as stable */
		state->stable = 1;
	}

	return has_input;
}
/*
Reads the next sequence of codepoints from the source into the stream buffer.
state          Stream to read into; keeps its cursor between calls.
propertyIndex  Index table handed to PROPERTY_GET for the quick-check value.
propertyData   Data table handed to PROPERTY_GET for the quick-check value.
Returns 0 when no input remains or a property table is null, and 1 after a
sequence has been buffered. On return, state->current counts the codepoints
of the sequence and state->filled additionally counts any codepoint read
past its end.
*/
uint8_t stream_read(StreamState* state, const size_t* propertyIndex, const uint8_t* propertyData)
{
/* Ensure input is available */
if (state->src_size == 0 ||
propertyIndex == 0 ||
propertyData == 0)
{
return 0;
}
/* Reset sequence after the first pass */
if (state->filled > 0)
{
/* Check for end of data */
/* Nothing was peeked beyond the sequence and the source holds no bytes past the last read codepoint */
if (state->filled == state->current &&
state->src_size <= state->last_length)
{
state->src_size = 0;
state->index = 0;
state->current = 0;
state->filled = 0;
return 0;
}
/* Copy last peeked codepoint to new sequence */
state->codepoint[0] = state->codepoint[state->filled - 1];
state->canonical_combining_class[0] = state->canonical_combining_class[state->filled - 1];
state->quick_check[0] = state->quick_check[state->filled - 1];
/* New sequence always starts as stable */
state->stable = 1;
/* Reset buffer members */
state->index = 0;
state->current = 1;
state->filled = 1;
}
/* Read codepoints */
while (state->filled < STREAM_SAFE_MAX)
{
/* Move the input cursor after peeking */
/* src still points at the previously read codepoint; step over its last_length bytes before reading again */
if (state->last_length > 0)
{
if (state->src_size <= state->last_length)
{
/* The previous codepoint was the last one in the source */
state->src += state->src_size;
state->src_size = 0;
break;
}
state->src += state->last_length;
state->src_size -= state->last_length;
}
/* Peek the next codepoint */
state->last_length = codepoint_read(state->src, state->src_size, &state->codepoint[state->filled]);
state->quick_check[state->filled] = PROPERTY_GET(propertyIndex, propertyData, state->codepoint[state->filled]);
state->canonical_combining_class[state->filled] = PROPERTY_GET_CCC(state->codepoint[state->filled]);
state->filled++;
/* The first codepoint of a sequence is accepted unconditionally; here state->current indexes the codepoint just read */
if (state->current > 0)
{
/* Sequences end on the next starter and can consist of only non-starters */
/* The starter stays buffered as the peeked codepoint: filled counts it, current does not */
if (state->canonical_combining_class[state->current] == 0)
{
break;
}
/* Check if sequence is unstable by comparing canonical combining classes */
if (state->stable &&
state->canonical_combining_class[state->current] < state->canonical_combining_class[state->current - 1])
{
state->stable = 0;
}
}
state->current++;
}
/* The buffer reached the stream-safe limit; the joiner goes into the slot after the last read codepoint */
if (state->filled == STREAM_SAFE_MAX)
{
/* Insert COMBINING GRAPHEME JOINER into output */
state->codepoint[state->filled] = CP_COMBINING_GRAPHEME_JOINER;
state->quick_check[state->filled] = QuickCheckResult_Yes;
state->canonical_combining_class[state->filled] = CCC_NOT_REORDERED;
state->filled++;
}
return 1;
}
/*
	Encode the current sequence of a stream as UTF-8.

	state         Stream whose first `current` codepoints are written.
	output        Output cursor, passed on to codepoint_write().
	outputSize    Remaining output space, passed on to codepoint_write().
	bytesWritten  Receives the number of bytes produced.

	Returns 1 on success and 0 as soon as codepoint_write() reports 0 for a
	codepoint (not enough space).

	NOTE(review): `*bytesWritten` is set to 0 only when the sequence is
	empty; otherwise the encoded sizes are added to whatever value it already
	holds, so callers are expected to initialize it.
*/
uint8_t stream_write(StreamState* state, char** output, size_t* outputSize, uint8_t* bytesWritten)
{
	uint8_t slot = 0;

	/* Nothing to write */
	if (state->current == 0)
	{
		*bytesWritten = 0;

		return 1;
	}

	while (slot < state->current)
	{
		const uint8_t produced = codepoint_write(state->codepoint[slot], output, outputSize);

		/* Not enough space */
		if (produced == 0)
		{
			return 0;
		}

		*bytesWritten += produced;
		slot++;
	}

	return 1;
}
/*
	Sort the current sequence of a stream by canonical combining class.

	Adjacent entries are exchanged only when the later one has a strictly
	smaller class, so entries with equal classes keep their relative order.
	The codepoint, quick-check and combining-class arrays are kept in step.

	Returns 0 when the sequence is empty and 1 otherwise.
*/
uint8_t stream_reorder(StreamState* state)
{
	uint8_t swapped;
	uint8_t pos;

	/* Nothing to do */
	if (state->current == 0)
	{
		return 0;
	}

	/* Keep making passes until the entire sequence is stable */
	for (;;)
	{
		swapped = 0;

		for (pos = 1; pos < state->current; pos++)
		{
			const uint8_t prev = (uint8_t)(pos - 1);
			unicode_t held_cp;
			uint8_t held_qc;
			uint8_t held_ccc;

			/* Already ordered smallest to largest */
			if (state->canonical_combining_class[pos] >= state->canonical_combining_class[prev])
			{
				continue;
			}

			held_cp = state->codepoint[pos];
			held_qc = state->quick_check[pos];
			held_ccc = state->canonical_combining_class[pos];

			state->codepoint[pos] = state->codepoint[prev];
			state->quick_check[pos] = state->quick_check[prev];
			state->canonical_combining_class[pos] = state->canonical_combining_class[prev];

			state->codepoint[prev] = held_cp;
			state->quick_check[prev] = held_qc;
			state->canonical_combining_class[prev] = held_ccc;

			swapped = 1;
		}

		if (swapped == 0)
		{
			break;
		}
	}

	return 1;
}

View File

@ -0,0 +1,84 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_STREAMING_H_
#define _UTF8REWIND_INTERNAL_STREAMING_H_
/*!
\file
\brief Streaming interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
UAX15-D4. Stream-Safe Text Process
This is the process of producing a Unicode string in Stream-Safe Text Format by processing that string
from start to finish, inserting U+034F COMBINING GRAPHEME JOINER (CGJ) within long sequences of
non-starters. The exact position of the inserted CGJs are determined according to the following algorithm,
which describes the generation of an output string from an input string:
* If the input string is empty, return an empty output string.
* Set nonStarterCount to zero.
* For each code point C in the input string:
* Produce the NFKD decomposition S.
* If nonStarterCount plus the number of initial non-starters in S is greater than 30, append a CGJ to
the output string and set the nonStarterCount to zero.
* Append C to the output string.
* If there are no starters in S, increment nonStarterCount by the number of code points in S; otherwise,
set nonStarterCount to the number of trailing non-starters in S (which may be zero).
* Return the output string.
*/
/* Number of buffered codepoints at which stream_read() stops reading and appends a COMBINING GRAPHEME JOINER */
#define STREAM_SAFE_MAX 30
/* Capacity of the per-stream arrays; larger than STREAM_SAFE_MAX so the appended joiner still fits */
#define STREAM_BUFFER_MAX 32
/*
Buffer holding one sequence of codepoints read from UTF-8 input, together
with the read cursor into that input.
*/
typedef struct {
/* Read position in the input; stream_read() advances it past codepoints it has already read */
const char* src;
/* Number of input bytes remaining at src; 0 once the input is exhausted */
size_t src_size;
/* Reset to 0 by stream_read(); advanced by decompose_execute() as it consumes codepoints */
uint8_t index;
/* Number of codepoints in the current sequence; stream_write() and stream_reorder() operate on entries [0, current) */
uint8_t current;
/* Number of populated buffer entries, including a codepoint read past the end of the sequence */
uint8_t filled;
/* 1 until a canonical combining class smaller than its predecessor's is seen */
uint8_t stable;
/* Value returned by codepoint_read() for the most recently read codepoint */
uint8_t last_length;
/* Parallel arrays: entry i of each describes the same codepoint */
unicode_t codepoint[STREAM_BUFFER_MAX];
uint8_t quick_check[STREAM_BUFFER_MAX];
uint8_t canonical_combining_class[STREAM_BUFFER_MAX];
} StreamState;
/* Zeroes `state` and attaches it to `input`; returns 0 when `input` is null or `inputSize` is 0, otherwise 1 */
uint8_t stream_initialize(StreamState* state, const char* input, size_t inputSize);
/* Buffers the next sequence of codepoints; returns 0 when no input remains or a property table is null, otherwise 1 */
uint8_t stream_read(StreamState* state, const size_t* propertyIndex, const uint8_t* propertyData);
/* Encodes the current sequence as UTF-8 and adds the encoded sizes to *bytesWritten; returns 0 when codepoint_write() reports no space, otherwise 1 */
uint8_t stream_write(StreamState* state, char** output, size_t* outputSize, uint8_t* bytesWritten);
/* Sorts the current sequence by canonical combining class, smallest to largest; returns 0 when the sequence is empty, otherwise 1 */
uint8_t stream_reorder(StreamState* state);
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_STREAMING_H_ */

11739
third_party/utf8rewind/unicodedatabase.c vendored Normal file

File diff suppressed because it is too large Load Diff

119
third_party/utf8rewind/unicodedatabase.h vendored Normal file
View File

@ -0,0 +1,119 @@
/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_UNICODEDATABASE_H_
#define _UTF8REWIND_UNICODEDATABASE_H_
/*!
\file
\brief Unicode property database.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*
Record type of the NFD/NFKD and uppercase/lowercase/titlecase record tables
declared below.
NOTE(review): field meanings are inferred from their names; the tables and
lookup code are not visible here - confirm in unicodedatabase.c.
*/
typedef struct {
/* Presumably the codepoint this record belongs to */
unicode_t codepoint;
/* Presumably a packed length and offset of the replacement data - confirm the bit layout */
uint32_t length_and_offset;
} DecompositionRecord;
/*
Record type of the composition table declared below.
NOTE(review): field meanings are inferred from their names - confirm in
unicodedatabase.c.
*/
typedef struct {
/* Presumably identifies the pair of codepoints being composed - confirm the packing */
uint64_t key;
/* Presumably the composed codepoint */
unicode_t value;
} CompositionRecord;
/*
Pointers to the generated Unicode tables.
NOTE(review): the definitions are in unicodedatabase.c, which is not visible
here; the group descriptions below are inferred from the identifiers and
pointer types - confirm before relying on them.
*/
/* Per-codepoint property tables, each exposed as an index/data pointer pair */
extern const size_t* GeneralCategoryIndexPtr;
extern const uint32_t* GeneralCategoryDataPtr;
extern const size_t* CanonicalCombiningClassIndexPtr;
extern const uint8_t* CanonicalCombiningClassDataPtr;
/* Quick-check tables; these pairs have the pointer types stream_read() takes as propertyIndex/propertyData */
extern const size_t* QuickCheckCaseMappedIndexPtr;
extern const uint8_t* QuickCheckCaseMappedDataPtr;
extern const size_t* QuickCheckNFCIndexPtr;
extern const uint8_t* QuickCheckNFCDataPtr;
extern const size_t* QuickCheckNFDIndexPtr;
extern const uint8_t* QuickCheckNFDDataPtr;
extern const size_t* QuickCheckNFKCIndexPtr;
extern const uint8_t* QuickCheckNFKCDataPtr;
extern const size_t* QuickCheckNFKDIndexPtr;
extern const uint8_t* QuickCheckNFKDDataPtr;
/* Record tables, each exposed as a count plus a pointer to the first record */
extern const size_t UnicodeNFDRecordCount;
extern const DecompositionRecord* UnicodeNFDRecordPtr;
extern const size_t UnicodeNFKDRecordCount;
extern const DecompositionRecord* UnicodeNFKDRecordPtr;
extern const size_t UnicodeUppercaseRecordCount;
extern const DecompositionRecord* UnicodeUppercaseRecordPtr;
extern const size_t UnicodeLowercaseRecordCount;
extern const DecompositionRecord* UnicodeLowercaseRecordPtr;
extern const size_t UnicodeTitlecaseRecordCount;
extern const DecompositionRecord* UnicodeTitlecaseRecordPtr;
extern const size_t UnicodeCompositionRecordCount;
extern const CompositionRecord* UnicodeCompositionRecordPtr;
/* Index1/Index2/Data triples; same pointer types as DecomposeState's property_index1/property_index2/property_data */
extern const uint32_t* NFDIndex1Ptr;
extern const uint32_t* NFDIndex2Ptr;
extern const uint32_t* NFDDataPtr;
extern const uint32_t* NFKDIndex1Ptr;
extern const uint32_t* NFKDIndex2Ptr;
extern const uint32_t* NFKDDataPtr;
extern const uint32_t* UppercaseIndex1Ptr;
extern const uint32_t* UppercaseIndex2Ptr;
extern const uint32_t* UppercaseDataPtr;
extern const uint32_t* LowercaseIndex1Ptr;
extern const uint32_t* LowercaseIndex2Ptr;
extern const uint32_t* LowercaseDataPtr;
extern const uint32_t* TitlecaseIndex1Ptr;
extern const uint32_t* TitlecaseIndex2Ptr;
extern const uint32_t* TitlecaseDataPtr;
extern const uint32_t* CaseFoldingIndex1Ptr;
extern const uint32_t* CaseFoldingIndex2Ptr;
extern const uint32_t* CaseFoldingDataPtr;
/* Raw byte blobs with their lengths; presumably the storage the tables above point into */
extern const char* CompressedStringData;
extern const size_t CompressedStringDataLength;
extern const char* DecompositionData;
extern const size_t DecompositionDataLength;
/*! \endcond */
#endif /* _UTF8REWIND_UNICODEDATABASE_H_ */

1429
third_party/utf8rewind/utf8rewind.c vendored Normal file

File diff suppressed because it is too large Load Diff

1870
third_party/utf8rewind/utf8rewind.h vendored Normal file

File diff suppressed because it is too large Load Diff