issue #4660: Refactors utf8 api, make it safer and easier to use.

This commit is contained in:
James Chen 2014-05-08 11:10:54 +08:00
parent 42ffd25fca
commit 20a8808b78
2 changed files with 321 additions and 434 deletions

View File

@ -1,127 +1,36 @@
/* /****************************************************************************
* This file uses some implementations of gutf8.c in glib. Copyright (c) 2014 cocos2d-x.org
* Copyright (c) 2014 Chukong Technologies Inc.
* gutf8.c - Operations on UTF-8 strings.
* http://www.cocos2d-x.org
* Copyright (C) 1999 Tom Tromey
* Copyright (C) 2000 Red Hat, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy
* Copyright (c) 2013-2014 Chukong Technologies Inc. of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* This library is free software; you can redistribute it and/or to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* modify it under the terms of the GNU Lesser General Public copies of the Software, and to permit persons to whom the Software is
* License as published by the Free Software Foundation; either furnished to do so, subject to the following conditions:
* version 2 of the License, or (at your option) any later version.
* The above copyright notice and this permission notice shall be included in
* This library is distributed in the hope that it will be useful, all copies or substantial portions of the Software.
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* Lesser General Public License for more details. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* You should have received a copy of the GNU Lesser General Public AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* License along with this library; if not, write to the LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* Boston, MA 02111-1307, USA. THE SOFTWARE.
*/ ****************************************************************************/
#include "ccUTF8.h" #include "ccUTF8.h"
#include "2d/platform/CCCommon.h" #include "2d/platform/CCCommon.h"
#include "base/CCConsole.h" #include "base/CCConsole.h"
#include "ConvertUTF.h"
NS_CC_BEGIN NS_CC_BEGIN
int cc_wcslen(const unsigned short* str) namespace StringUtils {
{
int i=0;
while(*str++) i++;
return i;
}
/* Code from GLIB gutf8.c starts here. */
/*
 * UTF8_COMPUTE:
 * @Char: the first (lead) byte of a UTF-8 sequence.
 * @Mask: receives the bit mask that selects the payload bits of @Char.
 * @Len:  receives the total length of the sequence in bytes (1 to 6).
 *
 * Classifies @Char by its high bits. When @Char matches none of the
 * lead-byte patterns, @Len is set to -1 and @Mask is left unmodified.
 *
 * NOTE: this expands to a bare if/else chain rather than a single
 * statement, and @Char is evaluated several times, so pass a
 * side-effect-free expression and do not use it as the body of an
 * unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
if (Char < 128) \
{ \
Len = 1; \
Mask = 0x7f; \
} \
else if ((Char & 0xe0) == 0xc0) \
{ \
Len = 2; \
Mask = 0x1f; \
} \
else if ((Char & 0xf0) == 0xe0) \
{ \
Len = 3; \
Mask = 0x0f; \
} \
else if ((Char & 0xf8) == 0xf0) \
{ \
Len = 4; \
Mask = 0x07; \
} \
else if ((Char & 0xfc) == 0xf8) \
{ \
Len = 5; \
Mask = 0x03; \
} \
else if ((Char & 0xfe) == 0xfc) \
{ \
Len = 6; \
Mask = 0x01; \
} \
else \
Len = -1;
/*
 * UTF8_LENGTH:
 * @Char: a character code (not an encoded byte).
 *
 * Evaluates to the number of bytes (1 to 6) selected by the magnitude of
 * @Char: below 0x80 -> 1, below 0x800 -> 2, below 0x10000 -> 3,
 * below 0x200000 -> 4, below 0x4000000 -> 5, otherwise 6.
 * @Char may be evaluated up to five times.
 */
#define UTF8_LENGTH(Char) \
((Char) < 0x80 ? 1 : \
((Char) < 0x800 ? 2 : \
((Char) < 0x10000 ? 3 : \
((Char) < 0x200000 ? 4 : \
((Char) < 0x4000000 ? 5 : 6)))))
/*
 * UTF8_GET:
 * @Result: receives the decoded value, or -1 on a malformed sequence.
 * @Chars:  the bytes of the sequence, starting at the lead byte.
 * @Count:  scratch loop index; holds the number of bytes consumed on exit.
 * @Mask:   payload mask for the lead byte (as produced by UTF8_COMPUTE).
 * @Len:    total sequence length in bytes (as produced by UTF8_COMPUTE).
 *
 * Starts from the masked lead byte and appends the low six bits of each
 * following byte. Every following byte must have the form 10xxxxxx
 * ((byte & 0xc0) == 0x80); the first one that does not sets @Result to -1
 * and stops the loop.
 *
 * NOTE: expands to two statements (an assignment and a for loop), so it
 * must not be used as the body of an unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
(Result) = (Chars)[0] & (Mask); \
for ((Count) = 1; (Count) < (Len); ++(Count)) \
{ \
if (((Chars)[(Count)] & 0xc0) != 0x80) \
{ \
(Result) = -1; \
break; \
} \
(Result) <<= 6; \
(Result) |= ((Chars)[(Count)] & 0x3f); \
}
/*
 * UNICODE_VALID:
 * @Char: a character code.
 *
 * Evaluates to true when all of the following hold:
 *  - @Char is below 0x110000;
 *  - @Char is not in the surrogate range 0xD800..0xDFFF;
 *  - @Char is not in the range 0xFDD0..0xFDEF;
 *  - the low 16 bits of @Char are neither 0xFFFE nor 0xFFFF.
 * @Char is evaluated several times.
 */
#define UNICODE_VALID(Char) \
((Char) < 0x110000 && \
(((Char) & 0xFFFFF800) != 0xD800) && \
((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
((Char) & 0xFFFE) != 0xFFFE)
/*
 * Lookup table indexed by the value of a byte; the entry is the number of
 * bytes to advance when that byte is the first byte of a character:
 *   0x00..0xBF -> 1
 *   0xC0..0xDF -> 2
 *   0xE0..0xEF -> 3
 *   0xF0..0xF7 -> 4
 *   0xF8..0xFB -> 5
 *   0xFC..0xFD -> 6
 *   0xFE..0xFF -> 1
 */
static const char utf8_skip_data[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
5, 5, 5, 6, 6, 1, 1
};
/* Pointer alias for the table above; used by cc_utf8_next_char. */
static const char *const g_utf8_skip = utf8_skip_data;
/*
 * cc_utf8_next_char:
 * @p: pointer to the first byte of a character.
 *
 * Evaluates to a char* advanced past that character, using the skip table
 * entry for the byte at @p. No bounds or validity check is performed, and
 * @p is evaluated twice.
 */
#define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
/* /*
* @str: the string to search through. * @str: the string to search through.
@ -129,14 +38,14 @@ static const char *const g_utf8_skip = utf8_skip_data;
* *
* Return value: the index of the last character that is not c. * Return value: the index of the last character that is not c.
* */ * */
unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c) unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c)
{ {
int len = static_cast<int>(str.size()); int len = static_cast<int>(str.size());
int i = len - 1; int i = len - 1;
for (; i >= 0; --i) for (; i >= 0; --i)
if (str[i] != c) return i; if (str[i] != c) return i;
return i; return i;
} }
@ -148,13 +57,13 @@ unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigne
* *
* Return value: the trimmed string. * Return value: the trimmed string.
* */ * */
static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index) static void trimUTF16VectorFromIndex(std::vector<char16_t>& str, int index)
{ {
int size = static_cast<int>(str->size()); int size = static_cast<int>(str.size());
if (index >= size || index < 0) if (index >= size || index < 0)
return; return;
str->erase(str->begin() + index, str->begin() + size); str.erase(str.begin() + index, str.begin() + size);
} }
/* /*
@ -164,14 +73,14 @@ static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
* *
* Return value: whether the character is a whitespace character. * Return value: whether the character is a whitespace character.
* */ * */
bool isspace_unicode(unsigned short ch) bool isUnicodeSpace(char16_t ch)
{ {
return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 || ch == 0x1680 return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 || ch == 0x1680
|| (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 || ch == 0x202F || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 || ch == 0x202F
|| ch == 0x205F || ch == 0x3000; || ch == 0x205F || ch == 0x3000;
} }
bool iscjk_unicode(unsigned short ch) bool isCJKUnicode(char16_t ch)
{ {
return (ch >= 0x4E00 && ch <= 0x9FBF) // CJK Unified Ideographs return (ch >= 0x4E00 && ch <= 0x9FBF) // CJK Unified Ideographs
|| (ch >= 0x2E80 && ch <= 0x2FDF) // CJK Radicals Supplement & Kangxi Radicals || (ch >= 0x2E80 && ch <= 0x2FDF) // CJK Radicals Supplement & Kangxi Radicals
@ -183,131 +92,165 @@ bool iscjk_unicode(unsigned short ch)
|| (ch >= 0x31C0 && ch <= 0x4DFF); // Other exiensions || (ch >= 0x31C0 && ch <= 0x4DFF); // Other exiensions
} }
void cc_utf8_trim_ws(std::vector<unsigned short>* str) void trimUTF16Vector(std::vector<char16_t>& str)
{ {
int len = static_cast<int>(str->size()); int len = static_cast<int>(str.size());
if ( len <= 0 ) if ( len <= 0 )
return; return;
int last_index = len - 1; int last_index = len - 1;
// Only start trimming if the last character is whitespace.. // Only start trimming if the last character is whitespace..
if (isspace_unicode((*str)[last_index])) if (isUnicodeSpace(str[last_index]))
{ {
for (int i = last_index - 1; i >= 0; --i) for (int i = last_index - 1; i >= 0; --i)
{ {
if (isspace_unicode((*str)[i])) if (isUnicodeSpace(str[i]))
last_index = i; last_index = i;
else else
break; break;
} }
cc_utf8_trim_from(str, last_index); trimUTF16VectorFromIndex(str, last_index);
} }
} }
/* bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16)
* cc_utf8_strlen:
* @p: pointer to the start of a UTF-8 encoded string.
* @max: the maximum number of bytes to examine. If @max
* is less than 0, then the string is assumed to be
* null-terminated. If @max is 0, @p will not be examined and
* may be %nullptr.
*
* Returns the length of the string in characters.
*
* Return value: the length of the string in characters
**/
long
cc_utf8_strlen (const char * p, int max)
{ {
long len = 0; if (utf8.empty())
const char *start = p;
if (!(p != nullptr || max == 0))
{ {
return 0; outUtf16.clear();
return true;
} }
if (max < 0) bool ret = false;
const size_t utf16Bytes = (utf8.length()+1) << 1;
char16_t* utf16 = (char16_t*)malloc(utf16Bytes);
memset(utf16, 0, utf16Bytes);
UTF16* utf16Start = (UTF16*)utf16;
UTF16* utf16End = ((UTF16*)utf16) + (utf8.length());
const UTF8* utf8Start = (const UTF8*)utf8.data();
const UTF8* utf8End = ((const UTF8*)utf8.data()) + utf8.length();
if (conversionOK == ConvertUTF8toUTF16((const UTF8 **) &utf8Start, utf8End, &utf16Start, utf16End, strictConversion))
{ {
while (*p) outUtf16 = utf16;
{ ret = true;
p = cc_utf8_next_char (p);
++len;
}
} }
else
{ free(utf16);
if (max == 0 || !*p)
return 0; return ret;
p = cc_utf8_next_char (p);
while (p - start < max && *p)
{
++len;
p = cc_utf8_next_char (p);
}
/* only do the last len increment if we got a complete
* char (don't count partial chars)
*/
if (p - start == max)
++len;
}
return len;
} }
/* bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8)
* g_utf8_get_char:
* @p: a pointer to Unicode character encoded as UTF-8
*
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
* If @p does not point to a valid UTF-8 encoded character, results are
* undefined. If you are not sure that the bytes are complete
* valid Unicode characters, you should use g_utf8_get_char_validated()
* instead.
*
* Return value: the resulting character
**/
static unsigned int
cc_utf8_get_char (const char * p)
{ {
int i, mask = 0, len; if (utf16.empty())
unsigned int result; {
unsigned char c = (unsigned char) *p; outUtf8.clear();
return true;
UTF8_COMPUTE (c, mask, len); }
if (len == -1)
return (unsigned int) - 1; bool ret = false;
UTF8_GET (result, p, i, mask, len); const size_t utf8Bytes = (utf16.length() << 2) + 1;
char* utf8 = (char*)malloc(utf8Bytes);
return result; memset(utf8, 0, utf8Bytes);
UTF8 *utf8Start = (UTF8*)utf8;
UTF8 *utf8End = ((UTF8*)utf8) + (utf8Bytes -1);
const UTF16* utf16Start = (const UTF16*)utf16.data();
const UTF16* utf16End = ((const UTF16*)utf16.data()) + utf16.length();
if (conversionOK == ConvertUTF16toUTF8(&utf16Start, utf16End, &utf8Start, utf8End, strictConversion))
{
outUtf8 = utf8;
ret = true;
}
free(utf8);
return ret;
} }
std::vector<char16_t> getUTF16VectorFromUTF16String(const std::u16string& str)
unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = nullptr */)
{ {
long len = cc_utf8_strlen(str_old, length); std::vector<char16_t> str_new;
if (rUtf16Size != nullptr) {
*rUtf16Size = static_cast<int>(len); size_t len = str.length();
} for (size_t i = 0; i < len; ++i)
unsigned short* str_new = new unsigned short[len + 1];
str_new[len] = 0;
for (int i = 0; i < len; ++i)
{ {
str_new[i] = cc_utf8_get_char(str_old); str_new.push_back(str[i]);
str_old = cc_utf8_next_char(str_old);
} }
return str_new; return str_new;
} }
/*
 * Copies every code unit of @utf16, in order, into a new
 * std::vector<char16_t> and returns it. An empty string yields an empty
 * vector.
 */
std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16)
{
    return std::vector<char16_t>(utf16.begin(), utf16.end());
}
long getCharacterCountInUTF8String(const std::string& utf8)
{
return getUTF8StringLength((const UTF8*)utf8.c_str());
}
} //namespace StringUtils {
/*
 * Returns the number of 16-bit units in @str before the terminating 0 unit.
 *
 * @str: a 0-terminated array of 16-bit units, or nullptr.
 *
 * A null @str is treated as an empty string and yields 0 instead of being
 * dereferenced.
 */
int cc_wcslen(const unsigned short* str)
{
    if (str == nullptr)
        return 0;

    int count = 0;
    while (str[count] != 0)
        ++count;
    return count;
}
/*
 * Removes trailing Unicode whitespace from @str in place by delegating to
 * StringUtils::trimUTF16Vector.
 *
 * @str: the vector to trim; a null pointer or an empty vector is a no-op.
 */
void cc_utf8_trim_ws(std::vector<unsigned short>* str)
{
    if (str == nullptr || str->empty())
        return;

    // std::vector<unsigned short> and std::vector<char16_t> are unrelated
    // types, so accessing one through a reinterpret_cast'ed pointer to the
    // other is undefined behaviour. Trim a genuine char16_t copy instead.
    std::vector<char16_t> units(str->begin(), str->end());
    StringUtils::trimUTF16Vector(units);

    // trimUTF16Vector only erases elements from the end, so shrinking the
    // original to the trimmed length yields the same contents.
    str->resize(units.size());
}
/*
 * Legacy entry point: forwards @ch to StringUtils::isUnicodeSpace and
 * returns its result.
 */
bool isspace_unicode(unsigned short ch)
{
    const char16_t codeUnit = ch;
    return StringUtils::isUnicodeSpace(codeUnit);
}
/*
 * Legacy entry point: forwards @ch to StringUtils::isCJKUnicode and
 * returns its result.
 */
bool iscjk_unicode(unsigned short ch)
{
    const char16_t codeUnit = ch;
    return StringUtils::isCJKUnicode(codeUnit);
}
/*
 * Returns the number of characters in the UTF-8 string @p, as counted by
 * StringUtils::getCharacterCountInUTF8String.
 *
 * @p:   a null-terminated UTF-8 string, or nullptr.
 * @max: if 0, no bytes are examined and 0 is returned. Any other value is
 *       currently ignored and the whole null-terminated string is counted.
 *
 * Returns 0 when @p is nullptr or @max is 0.
 */
long cc_utf8_strlen (const char * p, int max)
{
    // Building a std::string from a null pointer is undefined behaviour, so
    // handle it here. max == 0 means "examine nothing", so it also yields 0.
    if (p == nullptr || max == 0)
        return 0;

    return StringUtils::getCharacterCountInUTF8String(p);
}
/*
 * Legacy entry point: converts @str element by element into a
 * std::vector<char16_t> and returns the result of
 * StringUtils::getIndexOfLastNotChar16 for that vector and @c.
 */
unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c)
{
    const std::vector<char16_t> units(str.begin(), str.end());
    return StringUtils::getIndexOfLastNotChar16(units, c);
}
std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str) std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str)
{ {
int len = cc_wcslen(str); int len = cc_wcslen(str);
@ -320,209 +263,56 @@ std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* st
return str_new; return str_new;
} }
/** unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1*/, int* rUtf16Size/* = nullptr*/)
* cc_unichar_to_utf8:
* @c: a ISO10646 character code
* @outbuf: output buffer, must have at least 6 bytes of space.
* If %nullptr, the length will be computed and returned
* and nothing will be written to @outbuf.
*
* Converts a single character to UTF-8.
*
* Return value: number of bytes written
**/
int
cc_unichar_to_utf8 (unsigned int c,
char *outbuf)
{ {
int len = 0; if (str_old == nullptr)
int first; return nullptr;
int i;
if (c < 0x80) unsigned short* ret = nullptr;
std::u16string outUtf16;
bool succeed = StringUtils::UTF8ToUTF16(str_old, outUtf16);
if (succeed)
{ {
first = 0; ret = new unsigned short[outUtf16.length() + 1];
len = 1; ret[outUtf16.length()] = 0;
} memcpy(ret, outUtf16.data(), outUtf16.length());
else if (c < 0x800)
{
first = 0xc0;
len = 2;
}
else if (c < 0x10000)
{
first = 0xe0;
len = 3;
}
else if (c < 0x200000)
{
first = 0xf0;
len = 4;
}
else if (c < 0x4000000)
{
first = 0xf8;
len = 5;
}
else
{
first = 0xfc;
len = 6;
} }
if (outbuf) return ret;
{
for (i = len - 1; i > 0; --i)
{
outbuf[i] = (c & 0x3f) | 0x80;
c >>= 6;
}
outbuf[0] = c | first;
}
return len;
} }
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) char * cc_utf16_to_utf8 (const unsigned short *str,
int len,
/** long *items_read,
* cc_utf16_to_utf8: long *items_written)
* @str: a UTF-16 encoded string
* @len: the maximum length of @str to use. If @len < 0, then
* the string is terminated with a 0 character.
* @items_read: location to store number of words read, or %nullptr.
* If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
* returned in case @str contains a trailing partial
* character. If an error occurs then the index of the
* invalid input is stored here.
* @items_written: location to store number of bytes written, or %nullptr.
* The value stored here does not include the trailing
* 0 byte.
* @error: location to store the error occuring, or %nullptr to ignore
* errors. Any of the errors in #GConvertError other than
* %G_CONVERT_ERROR_NO_CONVERSION may occur.
*
* Convert a string from UTF-16 to UTF-8. The result will be
* terminated with a 0 byte.
*
* Return value: a pointer to a newly allocated UTF-8 string.
* This value must be freed with free(). If an
* error occurs, %nullptr will be returned and
* @error set.
**/
char *
cc_utf16_to_utf8 (const unsigned short *str,
int len,
long *items_read,
long *items_written)
{ {
/* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ if (str == nullptr)
* are marked. return nullptr;
*/
const unsigned short *in;
char *out;
char *result = nullptr;
int n_bytes;
unsigned int high_surrogate;
if (str == 0) return nullptr;
n_bytes = 0; std::u16string utf16;
in = str; int utf16Len = len < 0 ? cc_wcslen(str) : len;
high_surrogate = 0;
while ((len < 0 || in - str < len) && *in) for (int i = 0; i < utf16Len; ++i)
{ {
unsigned short c = *in; utf16.push_back(str[i]);
unsigned int wc;
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
{
if (high_surrogate)
{
wc = SURROGATE_VALUE (high_surrogate, c);
high_surrogate = 0;
}
else
{
CCLOGERROR("Invalid sequence in conversion input");
goto err_out;
}
}
else
{
if (high_surrogate)
{
CCLOGERROR("Invalid sequence in conversion input");
goto err_out;
}
if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
{
high_surrogate = c;
goto next1;
}
else
wc = c;
}
/********** DIFFERENT for UTF8/UCS4 **********/
n_bytes += UTF8_LENGTH (wc);
next1:
in++;
} }
if (high_surrogate && !items_read) char* ret = nullptr;
{ std::string outUtf8;
CCLOGERROR("Partial character sequence at end of input"); bool succeed = StringUtils::UTF16ToUTF8(utf16, outUtf8);
goto err_out;
}
/* At this point, everything is valid, and we just need to convert if (succeed)
*/
/********** DIFFERENT for UTF8/UCS4 **********/
result = new char[n_bytes + 1];
high_surrogate = 0;
out = result;
in = str;
while (out < result + n_bytes)
{ {
unsigned short c = *in; ret = new char[outUtf8.length() + 1];
unsigned int wc; ret[outUtf8.length()] = '\0';
memcpy(ret, outUtf8.data(), outUtf8.length());
if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
{
wc = SURROGATE_VALUE (high_surrogate, c);
high_surrogate = 0;
}
else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
{
high_surrogate = c;
goto next2;
}
else
wc = c;
/********** DIFFERENT for UTF8/UCS4 **********/
out += cc_unichar_to_utf8 (wc, out);
next2:
in++;
} }
/********** DIFFERENT for UTF8/UCS4 **********/ return ret;
*out = '\0';
if (items_written)
/********** DIFFERENT for UTF8/UCS4 **********/
*items_written = out - result;
err_out:
if (items_read)
*items_read = in - str;
return result;
} }
NS_CC_END NS_CC_END

View File

@ -1,35 +1,131 @@
/* /****************************************************************************
* Copyright (C) 1999 Tom Tromey Copyright (c) 2014 cocos2d-x.org
* Copyright (C) 2000 Red Hat, Inc. Copyright (c) 2014 Chukong Technologies Inc.
* Copyright (c) 2013-2014 Chukong Technologies Inc.
* http://www.cocos2d-x.org
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public Permission is hereby granted, free of charge, to any person obtaining a copy
* License as published by the Free Software Foundation; either of this software and associated documentation files (the "Software"), to deal
* version 2 of the License, or (at your option) any later version. in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* This library is distributed in the hope that it will be useful, copies of the Software, and to permit persons to whom the Software is
* but WITHOUT ANY WARRANTY; without even the implied warranty of furnished to do so, subject to the following conditions:
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* Free Software Foundation, Inc., 59 Temple Place - Suite 330, IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* Boston, MA 02111-1307, USA. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
*/ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
****************************************************************************/
#ifndef __cocos2dx__ccUTF8__ #ifndef __cocos2dx__ccUTF8__
#define __cocos2dx__ccUTF8__ #define __cocos2dx__ccUTF8__
#include "base/CCPlatformMacros.h" #include "base/CCPlatformMacros.h"
#include <vector> #include <vector>
#include <string>
NS_CC_BEGIN NS_CC_BEGIN
namespace StringUtils {
/**
* @brief Converts utf8 string to utf16 string
* @param utf8 The utf8 string to be converted
* @param outUtf16 The output utf16 string
* @return true if succeed, otherwise false
* @note Please check the return value before using \p outUtf16
* e.g.
* @code
* std::u16string utf16;
* bool ret = StringUtils::UTF8ToUTF16("你好hello", utf16);
* if (ret) {
* do_some_thing_with_utf16(utf16);
* }
* @endcode
*/
CC_DLL bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16);
/**
* @brief Converts utf16 string to utf8 string
* @param utf16 The utf16 string to be converted
* @param outUtf8 The output utf8 string
* @return true if succeed, otherwise false
* @note Please check the return value before using \p outUtf8
* e.g.
* @code
* std::string utf8;
* bool ret = StringUtils::UTF16ToUTF8(u"\u4f60\u597d", utf8);
* if (ret) {
* do_some_thing_with_utf8(utf8);
* }
* @endcode
*/
CC_DLL bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8);
/**
* @brief Trims the unicode spaces at the end of char16_t vector
*/
CC_DLL void trimUTF16Vector(std::vector<char16_t>& str);
/**
* @brief Whether the character is a whitespace character.
*
* @param ch the unicode character
* @returns whether the character is a white space character.
*
* @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
*
*/
CC_DLL bool isUnicodeSpace(char16_t ch);
/**
* @brief Whether the character is a Chinese/Japanese/Korean character.
*
* @param ch the unicode character
* @returns whether the character is a Chinese character.
*
* @see http://www.searchtb.com/2012/04/chinese_encode.html
* @see http://tieba.baidu.com/p/748765987
*
*/
CC_DLL bool isCJKUnicode(char16_t ch);
/**
* @brief Returns the length of the string in characters.
*
* @param utf8 an UTF-8 encoded string.
* @returns the length of the string in characters
*/
CC_DLL long getCharacterCountInUTF8String(const std::string& utf8);
/**
* @brief Gets the index of the last character that is not equal to the character given.
*
* @param str the string to be searched.
* @param c the character to be searched for.
*
* @returns the index of the last character that is not \p c.
*
*/
CC_DLL unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c);
/**
* @brief Gets char16_t vector from a given utf16 string
*/
CC_DLL std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16);
} // namespace StringUtils {
CC_DLL int cc_wcslen(const unsigned short* str); CC_DLL int cc_wcslen(const unsigned short* str);
CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str); CC_DEPRECATED_ATTRIBUTE void cc_utf8_trim_ws(std::vector<unsigned short>* str);
/** /**
* Whether the character is a whitespace character. * Whether the character is a whitespace character.
@ -39,7 +135,7 @@ CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);
* *
* @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode * @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
* */ * */
CC_DLL bool isspace_unicode(unsigned short ch); CC_DEPRECATED_ATTRIBUTE bool isspace_unicode(unsigned short ch);
/** /**
* Whether the character is a Chinese/Japanese/Korean character. * Whether the character is a Chinese/Japanese/Korean character.
@ -50,7 +146,7 @@ CC_DLL bool isspace_unicode(unsigned short ch);
* @see http://www.searchtb.com/2012/04/chinese_encode.html * @see http://www.searchtb.com/2012/04/chinese_encode.html
* @see http://tieba.baidu.com/p/748765987 * @see http://tieba.baidu.com/p/748765987
* */ * */
CC_DLL bool iscjk_unicode(unsigned short ch); CC_DEPRECATED_ATTRIBUTE bool iscjk_unicode(unsigned short ch);
/** /**
* Returns the length of the string in characters. * Returns the length of the string in characters.
@ -62,7 +158,7 @@ CC_DLL bool iscjk_unicode(unsigned short ch);
* *
* @returns the length of the string in characters * @returns the length of the string in characters
**/ **/
CC_DLL long CC_DEPRECATED_ATTRIBUTE long
cc_utf8_strlen (const char * p, int max); cc_utf8_strlen (const char * p, int max);
/** /**
@ -73,9 +169,9 @@ cc_utf8_strlen (const char * p, int max);
* *
* @returns the index of the last character that is not \p c. * @returns the index of the last character that is not \p c.
* */ * */
CC_DLL unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c); CC_DEPRECATED_ATTRIBUTE unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c);
CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str); CC_DEPRECATED_ATTRIBUTE std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str);
/** /**
* Creates a utf16 string from a utf8 cstring. * Creates a utf16 string from a utf8 cstring.
@ -84,7 +180,7 @@ CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned sh
* *
* @returns the newly created utf16 string. * @returns the newly created utf16 string.
* */ * */
CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr); CC_DEPRECATED_ATTRIBUTE unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr);
/** /**
* Convert a string from UTF-16 to UTF-8. The result will be null terminated. * Convert a string from UTF-16 to UTF-8. The result will be null terminated.
@ -103,12 +199,13 @@ CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, in
* @returns a pointer to a newly allocated UTF-8 string. This value must be * @returns a pointer to a newly allocated UTF-8 string. This value must be
* freed with delete[]. If an error occurs, %nullptr will be returned. * freed with delete[]. If an error occurs, %nullptr will be returned.
**/ **/
CC_DLL char * CC_DEPRECATED_ATTRIBUTE char *
cc_utf16_to_utf8 (const unsigned short *str, cc_utf16_to_utf8 (const unsigned short *str,
int len, int len,
long *items_read, long *items_read,
long *items_written); long *items_written);
NS_CC_END NS_CC_END
#endif /* defined(__cocos2dx__ccUTF8__) */ #endif /* defined(__cocos2dx__ccUTF8__) */