Vincent Herbet ab854ec035 Improve UTF-8 support in some natives (bug 6475) (#407)
* Compile as static library, update AMBuildScript and link to core

* Update VS project files to include the library

* Add UTF-8 Rewind library (v1.5.1) to third_party directory

* Update ACKNOWLEDGEMENTS.txt

* Move AMXX buffer in its own function

* Move constants from string.inc to string_const.inc and update project files

* Move stocks from string.inc to string_stocks.inc and update project files

* Improve UTF-8 support in containi() and update documentation

* Improve UTF-8 support in strcmp() and update documentation

* Improve UTF-8 support in strfind() and update documentation

It is worth noting that this native did not work properly when ignorecase was set; it was so broken that no one reported the issue.
This also adds a safety check so that the "pos" parameter cannot go below 0.

* Improve UTF-8 support in strncmp() and update documentation

* Improve UTF-8 support in equali() and update documentation

* Add an option to some UTF-8 Rewind functions for avoiding invalid data to be replaced

By default it replaces any invalid byte or sequence of bytes with 0xFFFD (3 bytes). This can be problematic when the input buffer is not changed (from a plugin) and some natives need to calculate a position from the converted string. With such a replacement, the position is displaced due to the final string length being larger.

This compiles the library as C++, because I added a parameter with a default value, which is not supported by C.

* Improve UTF-8 support in replace_string/ex() and update documentation

* Add is_string_category() and update documentation

* Update a little testsuite plugin (and fix linux compilation)

* Add mb_strtolower/upper() and update documentation

* Add mb_ucfirst() and update documentation

* Add mb_strtotitle() and update documentation

* Improve UTF-8 support in get_players() and find_player() with name/case insensitive flags set

* Fix KliPPy's complaint
2017-08-05 10:32:16 +02:00

272 lines
6.3 KiB
C

/*
Copyright (C) 2014-2016 Quinten Lansu
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
*/
#include "codepoint.h"
/*
    Sequence length lookup, indexed by the first byte of a sequence.

    1..6  number of bytes announced by the lead byte
    0     the byte is a continuation byte and cannot start a sequence
    7     the byte is invalid

    Each row below covers sixteen consecutive byte values.
*/
const uint8_t codepoint_decoded_length[256] = {
    /* 0x00 - 0x7F: Basic Latin, one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */

    /* 0x80 - 0xBF: malformed, continuation byte in lead position */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */

    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */

    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xEF */

    /* 0xF0 - 0xF7: four bytes, 0xF8 - 0xFB: five bytes,
       0xFC - 0xFD: six bytes, 0xFE - 0xFF: invalid */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 7  /* 0xF0 - 0xFF */
};
/*
    Encode a single code point as UTF-8 and append it to an output buffer.

    encoded     Code point to encode. Values above MAX_LEGAL_UNICODE are
                written as REPLACEMENT_CHARACTER instead.
    target      Address of the write cursor. When the cursor itself is
                null nothing is written and only the length is reported;
                otherwise the cursor is advanced past the written bytes.
    targetSize  Address of the remaining capacity in bytes; decreased by
                the number of bytes written.

    Returns the number of bytes the code point occupies, or 0 when the
    cursor is non-null and the remaining capacity is too small.
*/
uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize)
{
    /* Marker bits of the first byte, indexed by sequence length */
    static const uint8_t LeadMarker[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

    uint8_t length;
    uint8_t at;
    unicode_t bits;
    char* out;

    /* Work out how many bytes the code point occupies */
    if (encoded > MAX_LEGAL_UNICODE)
    {
        /* Out of range, store the replacement character instead */
        encoded = REPLACEMENT_CHARACTER;
        length = REPLACEMENT_CHARACTER_STRING_LENGTH;
    }
    else if (encoded > MAX_BASIC_MULTILINGUAL_PLANE)
    {
        length = 4;
    }
    else if (encoded > 0x7FF)
    {
        length = 3;
    }
    else if (encoded > MAX_BASIC_LATIN)
    {
        length = 2;
    }
    else
    {
        length = 1;
    }

    /* Without an output cursor only the length is reported */
    if (*target == 0)
    {
        return length;
    }

    /* Refuse to write a partial sequence */
    if (*targetSize < length)
    {
        return 0;
    }

    out = *target;

    if (length == 1)
    {
        out[0] = (char)encoded;
    }
    else if (length >= 2 && length <= 4)
    {
        /* Fill the continuation bytes from last to first, six bits at
           a time; whatever remains belongs in the lead byte */
        bits = encoded;
        for (at = (uint8_t)(length - 1); at > 0; --at)
        {
            out[at] = (char)(0x80 | (bits & 0x3F));
            bits >>= 6;
        }
        out[0] = (char)(LeadMarker[length] | bits);
    }

    *target = out + length;
    *targetSize -= length;

    return length;
}
/*
    Decode a single code point from the start of a UTF-8 encoded buffer.

    input      Bytes to decode. Only the first inputSize bytes are read,
               so the buffer does not have to be NUL-terminated.
    inputSize  Number of readable bytes at input.
    decoded    Receives the decoded code point, or REPLACEMENT_CHARACTER
               when the bytes do not form a valid sequence (bad lead
               byte, missing or invalid continuation byte, overlong
               encoding, surrogate, or value above MAX_LEGAL_UNICODE).

    Returns the number of input bytes consumed, which never exceeds
    inputSize. Returns 0, leaving decoded untouched, when input is null
    or inputSize is 0.
*/
uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded)
{
    const uint8_t* src = (const uint8_t*)input;

    if (input == 0 ||
        inputSize == 0)
    {
        /* Invalid data */
        return 0;
    }

    if (*src <= MAX_BASIC_LATIN)
    {
        /* Basic Latin */
        *decoded = (unicode_t)*src;
        return 1;
    }
    else
    {
        /* Multi-byte sequence */
        static const uint8_t SequenceMask[7] = {
            0x00, 0x7F, 0x1F, 0x0F,
            0x07, 0x03, 0x01
        };
        static const unicode_t SequenceMinimum[7] = {
            0x0000, 0x0000, 0x0080, 0x0800,
            0x10000, MAX_LEGAL_UNICODE, MAX_LEGAL_UNICODE
        };

        /* Bytes still available after the lead byte. The lead byte has
           already been read, so it must not be counted here; otherwise
           the checks below would permit a read one byte past the end
           of a truncated sequence. */
        size_t src_size = inputSize - 1;
        uint8_t src_index;

        /* Length of sequence is determined by first byte */
        uint8_t decoded_length = codepoint_decoded_length[*src];

        if (decoded_length < 1 ||
            decoded_length > 6)
        {
            /* Not a multi-byte sequence starter */
            *decoded = REPLACEMENT_CHARACTER;
            decoded_length = 1;
        }
        else if (decoded_length > 4)
        {
            /* Always an overlong sequence */
            *decoded = REPLACEMENT_CHARACTER;

            /* All bytes in the sequence must be processed */
            for (src_index = 1; src_index < decoded_length; ++src_index)
            {
                src++;

                /* Check if next byte is valid; the size test comes
                   first so that src is only dereferenced in bounds */
                if (src_size == 0 ||               /* Not enough data */
                    (*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
                {
                    return src_index;
                }

                src_size--;
            }
        }
        else
        {
            /* Use mask to strip value from first byte */
            *decoded = (unicode_t)(*src & SequenceMask[decoded_length]);

            /* All bytes in the sequence must be processed */
            for (src_index = 1; src_index < decoded_length; ++src_index)
            {
                src++;

                /* Check if next byte is valid; the size test comes
                   first so that src is only dereferenced in bounds */
                if (src_size == 0 ||               /* Not enough data */
                    (*src < 0x80 || *src > 0xBF))  /* Not a continuation byte */
                {
                    *decoded = REPLACEMENT_CHARACTER;
                    return src_index;
                }

                src_size--;

                /* Add value of continuation byte to codepoint */
                *decoded = (*decoded << 6) | (*src & 0x3F);
            }

            /* Check for overlong sequences and surrogate pairs */
            if (*decoded < SequenceMinimum[decoded_length] ||
                *decoded > MAX_LEGAL_UNICODE ||
                (*decoded >= SURROGATE_HIGH_START && *decoded <= SURROGATE_LOW_END))
            {
                *decoded = REPLACEMENT_CHARACTER;
            }
        }

        return decoded_length;
    }
}