Vincent Herbet ab854ec035 Improve UTF-8 support in some natives (bug 6475) (#407)
* Compile as static library, update AMBuildScript and link to core

* Update VS project files to include the library

* Add UTF-8 Rewind library (v1.5.1) to third_party directory

* Update ACKNOWLEDGEMENTS.txt

* Move AMXX buffer in its own function

* Move constants from string.inc to string_const.inc and update project files

* Move stocks from string.inc to string_stocks.inc and update project files

* Improve UTF-8 support in containi() and update documentation

* Improve UTF-8 support in strcmp() and update documentation

* Improve UTF-8 support in strfind() and update documentation

It is worth noting that this native did not work properly with ignorecase set. It was so broken that no one reported the issue.
This also adds a safety check so that the "pos" parameter cannot go below 0.

* Improve UTF-8 support in strncmp() and update documentation

* Improve UTF-8 support in equali() and update documentation

* Add an option to some UTF-8 Rewind functions for avoiding invalid data to be replaced

By default, it replaces any invalid byte or sequence of bytes with 0xFFFD (3 bytes). This can be problematic when the input buffer (from a plugin) is left unchanged and some natives need to calculate a position from the converted string: with such a replacement, the position is shifted because the final string is longer.

This compiles the library as C++, because I added some silly parameter with a default value, which is not supported by C.

* Improve UTF-8 support in replace_string/ex() and update documentation

* Add is_string_category() and update documentation

* Update a little testsuite plugin (and fix linux compilation)

* Add mb_strtolower/upper() and update documentation

* Add mb_ucfirst() and update documentation

* Add mb_strtotitle() and update documentation

* Improve UTF-8 support in get_players() and find_player() with name/case insensitive flags set

* Fix KliPPy's complaint
2017-08-05 10:32:16 +02:00

291 lines
8.7 KiB
C

/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _UTF8REWIND_INTERNAL_CODEPOINT_H_
#define _UTF8REWIND_INTERNAL_CODEPOINT_H_
/*!
\file
\brief Codepoint interface.
\cond INTERNAL
*/
#include "utf8rewind.h"
/*!
\addtogroup internal Internal functions and definitions
\{
*/
/*!
\def MAX_BASIC_LATIN
\brief The last codepoint part of Basic Latin (U+0000 - U+007F).
*/
#define MAX_BASIC_LATIN 0x007F
/*!
\def MAX_LATIN_1
\brief The last codepoint part of Latin-1 Supplement (U+0080 - U+00FF).
*/
#define MAX_LATIN_1 0x00FF
/*!
\def MAX_BASIC_MULTILINGUAL_PLANE
\brief The last legal codepoint in the Basic Multilingual Plane (BMP),
U+FFFF.
*/
#define MAX_BASIC_MULTILINGUAL_PLANE 0xFFFF
/*!
\def MAX_LEGAL_UNICODE
\brief The last legal codepoint in Unicode, U+10FFFF.
*/
#define MAX_LEGAL_UNICODE 0x10FFFF
/*!
\def REPLACEMENT_CHARACTER
\brief The codepoint (U+FFFD) used to replace illegal codepoints.
*/
#define REPLACEMENT_CHARACTER 0xFFFD
/*!
\def REPLACEMENT_CHARACTER_STRING
\brief The replacement character (U+FFFD) as a UTF-8 encoded string.
*/
#define REPLACEMENT_CHARACTER_STRING "\xEF\xBF\xBD"
/*!
\def REPLACEMENT_CHARACTER_STRING_LENGTH
\brief Length in bytes of the UTF-8 encoded string of the replacement
character, not counting the terminating NUL.
*/
#define REPLACEMENT_CHARACTER_STRING_LENGTH 3
/*!
\def SURROGATE_HIGH_START
\brief The minimum codepoint for the high member of a surrogate pair
(high range is U+D800 - U+DBFF).
*/
#define SURROGATE_HIGH_START 0xD800
/*!
\def SURROGATE_HIGH_END
\brief The maximum codepoint for the high member of a surrogate pair
(high range is U+D800 - U+DBFF).
*/
#define SURROGATE_HIGH_END 0xDBFF
/*!
\def SURROGATE_LOW_START
\brief The minimum codepoint for the low member of a surrogate pair
(low range is U+DC00 - U+DFFF).
*/
#define SURROGATE_LOW_START 0xDC00
/*!
\def SURROGATE_LOW_END
\brief The maximum codepoint for the low member of a surrogate pair
(low range is U+DC00 - U+DFFF).
*/
#define SURROGATE_LOW_END 0xDFFF
/*!
\def HANGUL_JAMO_FIRST
\brief The first codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_FIRST 0x1100
/*!
\def HANGUL_JAMO_LAST
\brief The last codepoint part of the Hangul Jamo block.
*/
#define HANGUL_JAMO_LAST 0x11FF
/*!
\def HANGUL_L_FIRST
\brief The first codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_FIRST 0x1100
/*!
\def HANGUL_L_LAST
\brief The last codepoint part of the Hangul Jamo L section used for
normalization.
*/
#define HANGUL_L_LAST 0x1112
/*!
\def HANGUL_L_COUNT
\brief The number of codepoints in the Hangul Jamo L section
(HANGUL_L_FIRST through HANGUL_L_LAST, inclusive).
*/
#define HANGUL_L_COUNT 19
/*!
\def HANGUL_V_FIRST
\brief The first codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_FIRST 0x1161
/*!
\def HANGUL_V_LAST
\brief The last codepoint part of the Hangul Jamo V section used for
normalization.
*/
#define HANGUL_V_LAST 0x1175
/*!
\def HANGUL_V_COUNT
\brief The number of codepoints in the Hangul Jamo V section
(HANGUL_V_FIRST through HANGUL_V_LAST, inclusive).
*/
#define HANGUL_V_COUNT 21
/*!
\def HANGUL_T_FIRST
\brief The first codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_FIRST 0x11A7
/*!
\def HANGUL_T_LAST
\brief The last codepoint part of the Hangul Jamo T section used for
normalization.
*/
#define HANGUL_T_LAST 0x11C2
/*!
\def HANGUL_T_COUNT
\brief The number of codepoints in the Hangul Jamo T section
(HANGUL_T_FIRST through HANGUL_T_LAST, inclusive).
*/
#define HANGUL_T_COUNT 28
/*!
\def HANGUL_N_COUNT
\brief The number of V and T combinations (VCount * TCount), i.e. the
number of Hangul syllables per codepoint of the L section.

Derived from the section counts so the values cannot drift apart; the
expansion is parenthesized so it is safe inside larger expressions.
*/
#define HANGUL_N_COUNT (HANGUL_V_COUNT * HANGUL_T_COUNT)
/*!
\def HANGUL_S_FIRST
\brief The first codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_FIRST 0xAC00
/*!
\def HANGUL_S_LAST
\brief The last codepoint in the Hangul Syllables block.
*/
#define HANGUL_S_LAST 0xD7A3
/*!
\def HANGUL_S_COUNT
\brief The number of codepoints in the Hangul Syllables block
(LCount * NCount), HANGUL_S_FIRST through HANGUL_S_LAST inclusive.
*/
#define HANGUL_S_COUNT (HANGUL_L_COUNT * HANGUL_N_COUNT)
/*
Named constants for individual Unicode code points. Each macro name
spells out the character and its value is that character's code point.
NOTE(review): presumably these are the characters that need special
handling in case mapping (dotted/dotless i, combining marks, final
sigma) - confirm against the code that uses them.
*/
/* Latin letters I and J, plain and with diacritics. */
#define CP_LATIN_CAPITAL_LETTER_I 0x0049
#define CP_LATIN_CAPITAL_LETTER_J 0x004A
#define CP_LATIN_SMALL_LETTER_I 0x0069
#define CP_LATIN_SMALL_LETTER_J 0x006A
#define CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC
#define CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD
#define CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128
#define CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E
#define CP_LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F
#define CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130
#define CP_LATIN_SMALL_LETTER_DOTLESS_I 0x0131
/* Combining marks. */
#define CP_COMBINING_GRAVE_ACCENT 0x0300
#define CP_COMBINING_ACUTE_ACCENT 0x0301
#define CP_COMBINING_TILDE_ACCENT 0x0303
#define CP_COMBINING_DOT_ABOVE 0x0307
#define CP_COMBINING_GREEK_YPOGEGRAMMENI 0x0345
#define CP_COMBINING_GRAPHEME_JOINER 0x034F
/* Greek. */
#define CP_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
#define CCC_NOT_REORDERED 0
#define CCC_OVERLAY 1
#define CCC_NUKTA 7
#define CCC_KANA_VOICING 8
#define CCC_VIRAMA 9
#define CCC_FIXED_POSITION_START 10
#define CCC_FIXED_POSITION_END 199
#define CCC_ATTACHED_BELOW_LEFT 200
#define CCC_ATTACHED_BELOW 202
#define CCC_ATTACHED_BOTTOM_RIGHT 204
#define CCC_ATTACHED_LEFT 208
#define CCC_ATTACHED_RIGHT 210
#define CCC_ATTACHED_TOP_LEFT 212
#define CCC_ATTACHED_ABOVE 214
#define CCC_ATTACHED_ABOVE_RIGHT 216
#define CCC_BELOW_LEFT 218
#define CCC_BELOW 220
#define CCC_BELOW_RIGHT 222
#define CCC_LEFT 224
#define CCC_RIGHT 226
#define CCC_ABOVE_LEFT 228
#define CCC_ABOVE 230
#define CCC_ABOVE_RIGHT 232
#define CCC_DOUBLE_BELOW 233
#define CCC_DOUBLE_ABOVE 234
#define CCC_IOTA_SUBSCRIPT 240
#define CCC_INVALID 255
/*!
\brief Lookup table giving the number of bytes used for encoding a code
point.

The table has one entry per possible byte value (256 entries) and is
indexed by an encoded byte. Each entry holds the number of bytes needed
for decoding, or 0 if the input byte is illegal. The table is only
declared here; its contents are defined outside this header.

NOTE(review): presumably the index is the first byte of an encoded
sequence - confirm against the table definition.
*/
extern const uint8_t codepoint_decoded_length[256];
/*!
\brief Write a Unicode code point to a UTF-8 encoded string.

Both the target buffer pointer and the target size are modified by the
encoded size.

\param[in] encoded Unicode code point to write
\param[in,out] target Target buffer
\param[in,out] targetSize Size of output buffer in bytes
\return Bytes needed for encoding or 0 on error.
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize);
/*!
\brief Read a single Unicode code point from a UTF-8 encoded string.
\param[in] input Input buffer
\param[in] inputSize Size of input buffer in bytes
\param[out] decoded Receives the decoded Unicode codepoint
\return Bytes read from string or 0 on error.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded);
/*!
\}
*/
/*! \endcond */
#endif /* _UTF8REWIND_INTERNAL_CODEPOINT_H_ */