issue #4660: Refactors the UTF-8 API to make it safer and easier to use.

This commit is contained in:
James Chen 2014-05-08 11:10:54 +08:00
parent 42ffd25fca
commit 20a8808b78
2 changed files with 321 additions and 434 deletions

View File

@ -1,127 +1,36 @@
/*
* This file uses some implementations of gutf8.c in glib.
*
* gutf8.c - Operations on UTF-8 strings.
*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (c) 2013-2014 Chukong Technologies Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/****************************************************************************
Copyright (c) 2014 cocos2d-x.org
Copyright (c) 2014 Chukong Technologies Inc.
http://www.cocos2d-x.org
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
****************************************************************************/
#include "ccUTF8.h"
#include "2d/platform/CCCommon.h"
#include "base/CCConsole.h"
#include "ConvertUTF.h"
NS_CC_BEGIN
/**
 * Counts the 16-bit units that precede the terminating zero of a string.
 *
 * @param str  zero-terminated array of 16-bit units; may be nullptr.
 * @return the number of units before the first 0, or 0 when @p str is nullptr.
 */
int cc_wcslen(const unsigned short* str)
{
    // Guard against a null pointer instead of dereferencing it.
    if (str == nullptr)
        return 0;

    int count = 0;
    while (str[count] != 0)
        ++count;
    return count;
}
/* Code from GLIB gutf8.c starts here. */
/*
 * UTF8_COMPUTE(Char, Mask, Len)
 *
 * Classifies the lead byte Char of a UTF-8 sequence:
 *   - Len  receives the sequence length in bytes (1..6), or -1 when Char
 *          matches none of the lead-byte patterns tested below;
 *   - Mask receives the bit mask that keeps the payload bits of the lead byte
 *          (it is left untouched in the Len == -1 case).
 *
 * NOTE(review): the macro expands to a bare if/else-if chain (no
 * do { } while (0) wrapper) and Char is not parenthesized in the first
 * comparison, so it should only be invoked as a stand-alone statement with a
 * simple expression for Char.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Len = 1; \
Mask = 0x7f; \
} \
else if ((Char & 0xe0) == 0xc0) \
{ \
Len = 2; \
Mask = 0x1f; \
} \
else if ((Char & 0xf0) == 0xe0) \
{ \
Len = 3; \
Mask = 0x0f; \
} \
else if ((Char & 0xf8) == 0xf0) \
{ \
Len = 4; \
Mask = 0x07; \
} \
else if ((Char & 0xfc) == 0xf8) \
{ \
Len = 5; \
Mask = 0x03; \
} \
else if ((Char & 0xfe) == 0xfc) \
{ \
Len = 6; \
Mask = 0x01; \
} \
else \
Len = -1;
/*
 * UTF8_LENGTH(Char)
 *
 * Evaluates to the number of bytes (1..6) selected by the value of the code
 * point Char: below 0x80 -> 1, below 0x800 -> 2, below 0x10000 -> 3,
 * below 0x200000 -> 4, below 0x4000000 -> 5, anything else -> 6.
 * Char is evaluated more than once, so it must be free of side effects.
 */
#define UTF8_LENGTH(Char) \
((Char) < 0x80 ? 1 : \
((Char) < 0x800 ? 2 : \
((Char) < 0x10000 ? 3 : \
((Char) < 0x200000 ? 4 : \
((Char) < 0x4000000 ? 5 : 6)))))
/*
 * UTF8_GET(Result, Chars, Count, Mask, Len)
 *
 * Decodes one UTF-8 sequence of Len bytes starting at Chars into Result.
 * The first byte contributes the bits selected by Mask; every following byte
 * must have the form 10xxxxxx and contributes its low six bits.
 * If a following byte does not have that form, Result is set to -1 and the
 * loop stops. Count is used as the loop index and is modified.
 *
 * NOTE(review): expands to two statements (an assignment followed by a for
 * loop), so it must not be used as the unbraced body of an if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
(Result) = (Chars)[0] & (Mask); \
for ((Count) = 1; (Count) < (Len); ++(Count)) \
{ \
if (((Chars)[(Count)] & 0xc0) != 0x80) \
{ \
(Result) = -1; \
break; \
} \
(Result) <<= 6; \
(Result) |= ((Chars)[(Count)] & 0x3f); \
}
/*
 * UNICODE_VALID(Char)
 *
 * True when Char passes all four checks below:
 *   - it is below 0x110000;
 *   - it is not in 0xD800..0xDFFF (the masked comparison against 0xD800);
 *   - it is not in 0xFDD0..0xFDEF;
 *   - its low 16 bits are neither 0xFFFE nor 0xFFFF.
 */
#define UNICODE_VALID(Char) \
((Char) < 0x110000 && \
(((Char) & 0xFFFFF800) != 0xD800) && \
((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
((Char) & 0xFFFE) != 0xFFFE)
/*
 * Skip table indexed by the value of a byte (0..255): the entry is the number
 * of bytes cc_utf8_next_char() advances when it sees that byte.
 *   0x00..0xBF -> 1   (six rows of 32 entries)
 *   0xC0..0xDF -> 2
 *   0xE0..0xEF -> 3
 *   0xF0..0xF7 -> 4
 *   0xF8..0xFB -> 5
 *   0xFC..0xFD -> 6
 *   0xFE..0xFF -> 1
 */
static const char utf8_skip_data[256] = {
/* 0x00..0x1F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0x20..0x3F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0x40..0x5F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0x60..0x7F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0x80..0x9F */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0xA0..0xBF */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
/* 0xC0..0xDF */
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2,
/* 0xE0..0xFF */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
5, 5, 5, 6, 6, 1, 1
};
// Read-only alias for the skip table above; used by cc_utf8_next_char().
static const char *const g_utf8_skip = utf8_skip_data;
// Yields p advanced by the skip-table entry of the byte p points at, as a char*.
// p is evaluated twice, so it must be free of side effects.
#define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
namespace StringUtils {
/*
* @str: the string to search through.
@ -129,14 +38,14 @@ static const char *const g_utf8_skip = utf8_skip_data;
*
* Return value: the index of the last character that is not c.
* */
unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c)
/*
 * Scans `str` from its last element towards its first and reports the index
 * of the first element encountered that differs from `c`.
 *
 * When no such element exists (every element equals `c`, or `str` is empty)
 * the result is -1 converted to unsigned int.
 */
unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c)
{
    // `pos` is always one past the element being examined, which lets the
    // backwards walk use an unsigned counter without wrapping.
    for (size_t pos = str.size(); pos > 0; --pos)
    {
        if (str[pos - 1] != c)
            return static_cast<unsigned int>(pos - 1);
    }
    return static_cast<unsigned int>(-1);
}
@ -148,13 +57,13 @@ unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigne
*
* Return value: the trimmed string.
* */
// NOTE(review): this span is taken from a diff view without +/- markers. Each
// pointer-based line (cc_utf8_trim_from, `str->`) is immediately followed by
// its reference-based replacement (trimUTF16VectorFromIndex, `str.`).
//
// Erases every element of `str` from position `index` through the end.
static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
static void trimUTF16VectorFromIndex(std::vector<char16_t>& str, int index)
{
int size = static_cast<int>(str->size());
int size = static_cast<int>(str.size());
// A negative index or one at/after the end leaves the vector untouched.
if (index >= size || index < 0)
return;
str->erase(str->begin() + index, str->begin() + size);
str.erase(str.begin() + index, str.begin() + size);
}
/*
@ -164,14 +73,14 @@ static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
*
* Return value: whether the character is a whitespace character.
* */
bool isspace_unicode(unsigned short ch)
/*
 * Tells whether `ch` is one of the whitespace code units recognised here:
 * the ranges 0x0009..0x000D and 0x2000..0x200A, plus the individual values
 * listed in the switch below.
 */
bool isUnicodeSpace(char16_t ch)
{
    // Contiguous ranges first.
    if (ch >= 0x0009 && ch <= 0x000D)
        return true;
    if (ch >= 0x2000 && ch <= 0x200A)
        return true;

    // Remaining stand-alone values.
    switch (ch)
    {
        case 0x0020:
        case 0x0085:
        case 0x00A0:
        case 0x1680:
        case 0x2028:
        case 0x2029:
        case 0x202F:
        case 0x205F:
        case 0x3000:
            return true;
        default:
            return false;
    }
}
bool iscjk_unicode(unsigned short ch)
bool isCJKUnicode(char16_t ch)
{
return (ch >= 0x4E00 && ch <= 0x9FBF) // CJK Unified Ideographs
|| (ch >= 0x2E80 && ch <= 0x2FDF) // CJK Radicals Supplement & Kangxi Radicals
@ -183,131 +92,165 @@ bool iscjk_unicode(unsigned short ch)
|| (ch >= 0x31C0 && ch <= 0x4DFF); // Other exiensions
}
// NOTE(review): this span is taken from a diff view without +/- markers. Each
// pointer-based line (cc_utf8_trim_ws, `str->`, isspace_unicode,
// cc_utf8_trim_from) is immediately followed by its reference-based
// replacement (trimUTF16Vector, `str.`, isUnicodeSpace,
// trimUTF16VectorFromIndex).
//
// Removes the run of whitespace units at the end of `str`; units before that
// run are left as they are.
void cc_utf8_trim_ws(std::vector<unsigned short>* str)
void trimUTF16Vector(std::vector<char16_t>& str)
{
int len = static_cast<int>(str->size());
int len = static_cast<int>(str.size());
// Nothing to trim in an empty vector.
if ( len <= 0 )
return;
int last_index = len - 1;
// Only start trimming if the last character is whitespace..
if (isspace_unicode((*str)[last_index]))
if (isUnicodeSpace(str[last_index]))
{
// Walk backwards; last_index ends up at the first unit of the trailing
// whitespace run.
for (int i = last_index - 1; i >= 0; --i)
{
if (isspace_unicode((*str)[i]))
if (isUnicodeSpace(str[i]))
last_index = i;
else
break;
}
// Drop everything from the start of the whitespace run to the end.
cc_utf8_trim_from(str, last_index);
trimUTF16VectorFromIndex(str, last_index);
}
}
/*
* cc_utf8_strlen:
* @p: pointer to the start of a UTF-8 encoded string.
* @max: the maximum number of bytes to examine. If @max
* is less than 0, then the string is assumed to be
* null-terminated. If @max is 0, @p will not be examined and
* may be %nullptr.
*
* Returns the length of the string in characters.
*
* Return value: the length of the string in characters
**/
// NOTE(review): this span is taken from a diff view without +/- markers. The
// removed body of cc_utf8_strlen(p, max) (lines using `p`, `max`, `start`,
// `len`) is interleaved with the added body of UTF8ToUTF16(utf8, outUtf16)
// (lines using `utf8`, `utf16`, `outUtf16`, `ret`).
//
// UTF8ToUTF16: converts `utf8` into `outUtf16` through ConvertUTF8toUTF16 in
// strict mode and returns true on success. An empty input clears `outUtf16`
// and returns true. On failure `outUtf16` is not assigned.
long
cc_utf8_strlen (const char * p, int max)
bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16)
{
long len = 0;
const char *start = p;
if (!(p != nullptr || max == 0))
if (utf8.empty())
{
return 0;
outUtf16.clear();
return true;
}
if (max < 0)
bool ret = false;
// Two bytes per UTF-16 unit; one unit per input byte plus one extra unit.
const size_t utf16Bytes = (utf8.length()+1) << 1;
// NOTE(review): the malloc result is handed to memset without a null check.
char16_t* utf16 = (char16_t*)malloc(utf16Bytes);
memset(utf16, 0, utf16Bytes);
UTF16* utf16Start = (UTF16*)utf16;
// The end pointer stops one unit before the end of the buffer, so the last
// zeroed unit is presumably left in place as the terminator.
UTF16* utf16End = ((UTF16*)utf16) + (utf8.length());
const UTF8* utf8Start = (const UTF8*)utf8.data();
const UTF8* utf8End = ((const UTF8*)utf8.data()) + utf8.length();
if (conversionOK == ConvertUTF8toUTF16((const UTF8 **) &utf8Start, utf8End, &utf16Start, utf16End, strictConversion))
{
while (*p)
{
p = cc_utf8_next_char (p);
++len;
}
// Assigning from a char16_t* copies up to the first zero unit.
// NOTE(review): input containing an embedded U+0000 would be cut short here.
outUtf16 = utf16;
ret = true;
}
else
{
if (max == 0 || !*p)
return 0;
p = cc_utf8_next_char (p);
while (p - start < max && *p)
{
++len;
p = cc_utf8_next_char (p);
}
/* only do the last len increment if we got a complete
* char (don't count partial chars)
*/
if (p - start == max)
++len;
}
return len;
// The temporary buffer is released on both the success and the failure path.
free(utf16);
return ret;
}
/*
* g_utf8_get_char:
* @p: a pointer to Unicode character encoded as UTF-8
*
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
* If @p does not point to a valid UTF-8 encoded character, results are
* undefined. If you are not sure that the bytes are complete
* valid Unicode characters, you should use g_utf8_get_char_validated()
* instead.
*
* Return value: the resulting character
**/
// NOTE(review): this span is taken from a diff view without +/- markers. The
// removed static helper cc_utf8_get_char(p) (the lines using `p`, `mask`,
// `len`, `result`, UTF8_COMPUTE and UTF8_GET) is interleaved with the added
// function UTF16ToUTF8(utf16, outUtf8).
//
// UTF16ToUTF8: converts `utf16` into `outUtf8` through ConvertUTF16toUTF8 in
// strict mode and returns true on success. An empty input clears `outUtf8`
// and returns true. On failure `outUtf8` is not assigned.
static unsigned int
cc_utf8_get_char (const char * p)
bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8)
{
int i, mask = 0, len;
unsigned int result;
unsigned char c = (unsigned char) *p;
UTF8_COMPUTE (c, mask, len);
if (len == -1)
return (unsigned int) - 1;
UTF8_GET (result, p, i, mask, len);
return result;
if (utf16.empty())
{
outUtf8.clear();
return true;
}
bool ret = false;
// Four bytes per UTF-16 unit plus one extra byte.
const size_t utf8Bytes = (utf16.length() << 2) + 1;
// NOTE(review): the malloc result is handed to memset without a null check.
char* utf8 = (char*)malloc(utf8Bytes);
memset(utf8, 0, utf8Bytes);
UTF8 *utf8Start = (UTF8*)utf8;
// The end pointer excludes the last zeroed byte, which is presumably left in
// place as the terminator.
UTF8 *utf8End = ((UTF8*)utf8) + (utf8Bytes -1);
const UTF16* utf16Start = (const UTF16*)utf16.data();
const UTF16* utf16End = ((const UTF16*)utf16.data()) + utf16.length();
if (conversionOK == ConvertUTF16toUTF8(&utf16Start, utf16End, &utf8Start, utf8End, strictConversion))
{
// Assigning from a char* copies up to the first zero byte.
// NOTE(review): input containing an embedded U+0000 would be cut short here.
outUtf8 = utf8;
ret = true;
}
// The temporary buffer is released on both the success and the failure path.
free(utf8);
return ret;
}
// NOTE(review): this span is taken from a diff view without +/- markers. The
// removed implementation of cc_utf8_to_utf16 (lines using `str_old`,
// `rUtf16Size`, and the `unsigned short*` buffer) is interleaved with the
// added function getUTF16VectorFromUTF16String(str).
//
// getUTF16VectorFromUTF16String: copies every unit of `str` into a new vector.
// NOTE(review): the added body does the same work as
// getChar16VectorFromUTF16String below (without the reserve() call), and no
// declaration for it is visible in the header section shown - confirm whether
// it is still needed.
unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = nullptr */)
std::vector<char16_t> getUTF16VectorFromUTF16String(const std::u16string& str)
{
long len = cc_utf8_strlen(str_old, length);
if (rUtf16Size != nullptr) {
*rUtf16Size = static_cast<int>(len);
}
unsigned short* str_new = new unsigned short[len + 1];
str_new[len] = 0;
for (int i = 0; i < len; ++i)
std::vector<char16_t> str_new;
size_t len = str.length();
for (size_t i = 0; i < len; ++i)
{
str_new[i] = cc_utf8_get_char(str_old);
str_old = cc_utf8_next_char(str_old);
str_new.push_back(str[i]);
}
return str_new;
}
/*
 * Returns a vector holding the same char16_t units as `utf16`, in the same
 * order. An empty string yields an empty vector.
 */
std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16)
{
    std::vector<char16_t> units;
    // Allocate once, then append the whole string in a single call.
    units.reserve(utf16.length());
    units.insert(units.end(), utf16.begin(), utf16.end());
    return units;
}
long getCharacterCountInUTF8String(const std::string& utf8)
{
return getUTF8StringLength((const UTF8*)utf8.c_str());
}
} //namespace StringUtils {
/**
 * Counts the 16-bit units that precede the terminating zero of a string.
 *
 * @param str  zero-terminated array of 16-bit units; may be nullptr.
 * @return the number of units before the first 0, or 0 when @p str is nullptr.
 */
int cc_wcslen(const unsigned short* str)
{
    // Guard against a null pointer instead of dereferencing it.
    if (str == nullptr)
        return 0;

    int count = 0;
    while (str[count] != 0)
        ++count;
    return count;
}
/**
 * Deprecated wrapper around StringUtils::trimUTF16Vector().
 *
 * Removes the trailing whitespace units of @p str in place.
 *
 * @param str  vector to trim; a null pointer is ignored.
 */
void cc_utf8_trim_ws(std::vector<unsigned short>* str)
{
    if (str == nullptr)
        return;

    // std::vector<unsigned short> and std::vector<char16_t> are unrelated
    // types, so accessing one through a reinterpret_cast pointer to the other
    // is undefined behaviour. Trim a char16_t copy instead.
    std::vector<char16_t> utf16(str->begin(), str->end());
    StringUtils::trimUTF16Vector(utf16);

    // trimUTF16Vector only removes elements from the end, so shrinking the
    // caller's vector to the trimmed length gives the same contents.
    str->resize(utf16.size());
}
/*
 * Deprecated wrapper: forwards `ch` to StringUtils::isUnicodeSpace() and
 * returns its answer.
 */
bool isspace_unicode(unsigned short ch)
{
    const char16_t unit = ch;
    return StringUtils::isUnicodeSpace(unit);
}
/*
 * Deprecated wrapper: forwards `ch` to StringUtils::isCJKUnicode() and
 * returns its answer.
 */
bool iscjk_unicode(unsigned short ch)
{
    const char16_t unit = ch;
    return StringUtils::isCJKUnicode(unit);
}
/**
 * Deprecated wrapper around StringUtils::getCharacterCountInUTF8String().
 *
 * @param p    zero-terminated UTF-8 string; may be nullptr.
 * @param max  ignored by this implementation; the whole string is examined.
 * @return the value reported by getCharacterCountInUTF8String(), or 0 when
 *         @p p is nullptr.
 */
long cc_utf8_strlen (const char * p, int max)
{
    CC_UNUSED_PARAM(max);
    // Constructing the std::string argument from a null pointer is undefined
    // behaviour, so answer for that case directly.
    if (p == nullptr)
        return 0;
    return StringUtils::getCharacterCountInUTF8String(p);
}
/*
 * Deprecated wrapper: copies `str` into a char16_t vector and returns what
 * StringUtils::getIndexOfLastNotChar16() reports for that copy and `c`.
 */
unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c)
{
    // Element-wise conversion unsigned short -> char16_t via the range constructor.
    const std::vector<char16_t> char16Vector(str.begin(), str.end());
    return StringUtils::getIndexOfLastNotChar16(char16Vector, c);
}
std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str)
{
int len = cc_wcslen(str);
@ -320,209 +263,56 @@ std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* st
return str_new;
}
/**
* cc_unichar_to_utf8:
* @c: a ISO10646 character code
* @outbuf: output buffer, must have at least 6 bytes of space.
* If %nullptr, the length will be computed and returned
* and nothing will be written to @outbuf.
*
* Converts a single character to UTF-8.
*
* Return value: number of bytes written
**/
// NOTE(review): this span is taken from a diff view without +/- markers. The
// removed function cc_unichar_to_utf8(c, outbuf) (lines using `c`, `first`,
// `len`, `i`, `outbuf`) is interleaved with the added implementation of
// cc_utf8_to_utf16 (lines using `str_old`, `outUtf16`, `succeed`, `ret`).
// Only the added lines were changed in this edit.
//
// cc_utf8_to_utf16: converts the zero-terminated UTF-8 string `str_old` with
// StringUtils::UTF8ToUTF16 and returns a zero-terminated array allocated with
// new[] (release it with delete[]). Returns nullptr when `str_old` is nullptr
// or the conversion fails. When `rUtf16Size` is not nullptr it receives the
// number of 16-bit units in the result (0 on failure).
// NOTE(review): `length` is not used by the added lines; the whole string is
// always converted.
int
cc_unichar_to_utf8 (unsigned int c,
char *outbuf)
unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1*/, int* rUtf16Size/* = nullptr*/)
{
int len = 0;
int first;
int i;
if (str_old == nullptr)
return nullptr;
if (c < 0x80)
unsigned short* ret = nullptr;
std::u16string outUtf16;
bool succeed = StringUtils::UTF8ToUTF16(str_old, outUtf16);
// Report the result length through the optional out-parameter, as the
// previous implementation did. outUtf16 stays empty when the conversion fails.
if (rUtf16Size != nullptr)
*rUtf16Size = static_cast<int>(outUtf16.length());
if (succeed)
{
first = 0;
len = 1;
}
else if (c < 0x800)
{
first = 0xc0;
len = 2;
}
else if (c < 0x10000)
{
first = 0xe0;
len = 3;
}
else if (c < 0x200000)
{
first = 0xf0;
len = 4;
}
else if (c < 0x4000000)
{
first = 0xf8;
len = 5;
}
else
{
first = 0xfc;
len = 6;
ret = new unsigned short[outUtf16.length() + 1];
ret[outUtf16.length()] = 0;
// memcpy takes a byte count; length() is a count of 16-bit units, so scale
// it by the element size. Without this only half of the data was copied.
memcpy(ret, outUtf16.data(), outUtf16.length() * sizeof(unsigned short));
}
if (outbuf)
{
for (i = len - 1; i > 0; --i)
{
outbuf[i] = (c & 0x3f) | 0x80;
c >>= 6;
}
outbuf[0] = c | first;
}
return len;
return ret;
}
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
/**
* cc_utf16_to_utf8:
* @str: a UTF-16 encoded string
* @len: the maximum length of @str to use. If @len < 0, then
* the string is terminated with a 0 character.
* @items_read: location to store number of words read, or %nullptr.
* If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
* returned in case @str contains a trailing partial
* character. If an error occurs then the index of the
* invalid input is stored here.
* @items_written: location to store number of bytes written, or %nullptr.
* The value stored here does not include the trailing
* 0 byte.
* @error: location to store the error occuring, or %nullptr to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
*
* Convert a string from UTF-16 to UTF-8. The result will be
* terminated with a 0 byte.
*
* Return value: a pointer to a newly allocated UTF-8 string.
* This value must be freed with delete[]. If an
* error occurs, %nullptr will be returned and
* @error set.
**/
// NOTE(review): this span is taken from a diff view without +/- markers. The
// removed glib-derived implementation (lines using `in`, `out`, `result`,
// `n_bytes`, `high_surrogate`, the next1/next2/err_out labels) is interleaved
// with the added implementation (lines using `utf16`, `utf16Len`, `outUtf8`,
// `succeed`, `ret`).
//
// Added implementation: copies `utf16Len` units of `str` into a
// std::u16string, converts it with StringUtils::UTF16ToUTF8 and, on success,
// returns a zero-terminated copy allocated with new char[] (release it with
// delete[]). Returns nullptr when `str` is null or the conversion fails.
// NOTE(review): the added lines never write `items_read` or `items_written`;
// only the removed lines do. Callers that read them after the refactor would
// see whatever value they passed in - confirm and restore if still needed.
char *
cc_utf16_to_utf8 (const unsigned short *str,
int len,
long *items_read,
long *items_written)
char * cc_utf16_to_utf8 (const unsigned short *str,
int len,
long *items_read,
long *items_written)
{
/* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
* are marked.
*/
const unsigned short *in;
char *out;
char *result = nullptr;
int n_bytes;
unsigned int high_surrogate;
if (str == nullptr)
return nullptr;
if (str == 0) return nullptr;
n_bytes = 0;
in = str;
high_surrogate = 0;
while ((len < 0 || in - str < len) && *in)
std::u16string utf16;
// A negative len means "zero-terminated": measure the string with cc_wcslen.
// NOTE(review): with len >= 0 the added loop copies exactly len units, while
// the removed while-loop above also stopped at the first zero unit.
int utf16Len = len < 0 ? cc_wcslen(str) : len;
for (int i = 0; i < utf16Len; ++i)
{
unsigned short c = *in;
unsigned int wc;
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
{
if (high_surrogate)
{
wc = SURROGATE_VALUE (high_surrogate, c);
high_surrogate = 0;
}
else
{
CCLOGERROR("Invalid sequence in conversion input");
goto err_out;
}
}
else
{
if (high_surrogate)
{
CCLOGERROR("Invalid sequence in conversion input");
goto err_out;
}
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
{
high_surrogate = c;
goto next1;
}
else
wc = c;
}
/********** DIFFERENT for UTF8/UCS4 **********/
n_bytes += UTF8_LENGTH (wc);
next1:
in++;
utf16.push_back(str[i]);
}
if (high_surrogate && !items_read)
{
CCLOGERROR("Partial character sequence at end of input");
goto err_out;
}
char* ret = nullptr;
std::string outUtf8;
bool succeed = StringUtils::UTF16ToUTF8(utf16, outUtf8);
/* At this point, everything is valid, and we just need to convert
*/
/********** DIFFERENT for UTF8/UCS4 **********/
result = new char[n_bytes + 1];
high_surrogate = 0;
out = result;
in = str;
while (out < result + n_bytes)
if (succeed)
{
unsigned short c = *in;
unsigned int wc;
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
{
wc = SURROGATE_VALUE (high_surrogate, c);
high_surrogate = 0;
}
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
{
high_surrogate = c;
goto next2;
}
else
wc = c;
/********** DIFFERENT for UTF8/UCS4 **********/
out += cc_unichar_to_utf8 (wc, out);
next2:
in++;
// One extra byte for the terminator; the payload is copied byte-for-byte.
ret = new char[outUtf8.length() + 1];
ret[outUtf8.length()] = '\0';
memcpy(ret, outUtf8.data(), outUtf8.length());
}
/********** DIFFERENT for UTF8/UCS4 **********/
*out = '\0';
if (items_written)
/********** DIFFERENT for UTF8/UCS4 **********/
*items_written = out - result;
err_out:
if (items_read)
*items_read = in - str;
return result;
return ret;
}
NS_CC_END

View File

@ -1,35 +1,131 @@
/*
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc.
* Copyright (c) 2013-2014 Chukong Technologies Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
/****************************************************************************
Copyright (c) 2014 cocos2d-x.org
Copyright (c) 2014 Chukong Technologies Inc.
http://www.cocos2d-x.org
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
****************************************************************************/
#ifndef __cocos2dx__ccUTF8__
#define __cocos2dx__ccUTF8__
#include "base/CCPlatformMacros.h"
#include <vector>
#include <string>
NS_CC_BEGIN
namespace StringUtils {
/**
* @brief Converts a UTF-8 string to a UTF-16 string.
* @param utf8 The UTF-8 string to be converted.
* @param outUtf16 The output UTF-16 string.
* @return true if the conversion succeeded, otherwise false.
* @note Please check the return value before using \p outUtf16
* e.g.
* @code
* std::u16string utf16;
* bool ret = StringUtils::UTF8ToUTF16("你好hello", utf16);
* if (ret) {
* do_some_thing_with_utf16(utf16);
* }
* @endcode
*/
CC_DLL bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16);
/**
* @brief Converts a UTF-16 string to a UTF-8 string.
* @param utf16 The UTF-16 string to be converted.
* @param outUtf8 The output UTF-8 string.
* @return true if the conversion succeeded, otherwise false.
* @note Please check the return value before using \p outUtf8
* e.g.
* @code
* std::string utf8;
* bool ret = StringUtils::UTF16ToUTF8(u"\u4f60\u597d", utf8);
* if (ret) {
* do_some_thing_with_utf8(utf8);
* }
* @endcode
*/
CC_DLL bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8);
/**
* @brief Trims the unicode spaces at the end of a char16_t vector.
*
* @param str the vector to trim; it is modified in place.
*/
CC_DLL void trimUTF16Vector(std::vector<char16_t>& str);
/**
* @brief Whether the character is a whitespace character.
*
* @param ch the unicode character
* @returns whether the character is a white space character.
*
* @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
*
*/
CC_DLL bool isUnicodeSpace(char16_t ch);
/**
* @brief Whether the character is a Chinese/Japanese/Korean character.
*
* @param ch the unicode character
* @returns whether the character is a Chinese/Japanese/Korean character.
*
* @see http://www.searchtb.com/2012/04/chinese_encode.html
* @see http://tieba.baidu.com/p/748765987
*
*/
CC_DLL bool isCJKUnicode(char16_t ch);
/**
* @brief Returns the length of the string in characters.
*
* @param utf8 a UTF-8 encoded string.
* @returns the length of the string in characters
*/
CC_DLL long getCharacterCountInUTF8String(const std::string& utf8);
/**
* @brief Gets the index of the last character that is not equal to the character given.
*
* @param str the string to be searched.
* @param c the character to be searched for.
*
* @returns the index of the last character that is not \p c.
*
*/
CC_DLL unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c);
/**
* @brief Gets a char16_t vector from a given UTF-16 string.
*
* @param utf16 the UTF-16 string to copy from.
* @returns a vector of char16_t built from \p utf16.
*/
CC_DLL std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16);
} // namespace StringUtils {
CC_DLL int cc_wcslen(const unsigned short* str);
CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);
CC_DEPRECATED_ATTRIBUTE void cc_utf8_trim_ws(std::vector<unsigned short>* str);
/**
* Whether the character is a whitespace character.
@ -39,7 +135,7 @@ CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);
*
* @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
* */
CC_DLL bool isspace_unicode(unsigned short ch);
CC_DEPRECATED_ATTRIBUTE bool isspace_unicode(unsigned short ch);
/**
* Whether the character is a Chinese/Japanese/Korean character.
@ -50,7 +146,7 @@ CC_DLL bool isspace_unicode(unsigned short ch);
* @see http://www.searchtb.com/2012/04/chinese_encode.html
* @see http://tieba.baidu.com/p/748765987
* */
CC_DLL bool iscjk_unicode(unsigned short ch);
CC_DEPRECATED_ATTRIBUTE bool iscjk_unicode(unsigned short ch);
/**
* Returns the length of the string in characters.
@ -62,7 +158,7 @@ CC_DLL bool iscjk_unicode(unsigned short ch);
*
* @returns the length of the string in characters
**/
CC_DLL long
CC_DEPRECATED_ATTRIBUTE long
cc_utf8_strlen (const char * p, int max);
/**
@ -73,9 +169,9 @@ cc_utf8_strlen (const char * p, int max);
*
* @returns the index of the last character that is not \p c.
* */
CC_DLL unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c);
CC_DEPRECATED_ATTRIBUTE unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c);
CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str);
CC_DEPRECATED_ATTRIBUTE std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str);
/**
* Creates a UTF-16 string from a UTF-8 encoded C string.
@ -84,7 +180,7 @@ CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned sh
*
* @returns the newly created UTF-16 string.
* */
CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr);
CC_DEPRECATED_ATTRIBUTE unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr);
/**
* Convert a string from UTF-16 to UTF-8. The result will be null terminated.
@ -103,12 +199,13 @@ CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, in
* @returns a pointer to a newly allocated UTF-8 string. This value must be
* freed with delete[]. If an error occurs, %nullptr will be returned.
**/
CC_DLL char *
CC_DEPRECATED_ATTRIBUTE char *
cc_utf16_to_utf8 (const unsigned short *str,
int len,
long *items_read,
long *items_written);
NS_CC_END
#endif /* defined(__cocos2dx__ccUTF8__) */