From 20a8808b78563dd9d7290c6f6cd90b2399c5744b Mon Sep 17 00:00:00 2001 From: James Chen Date: Thu, 8 May 2014 11:10:54 +0800 Subject: [PATCH] issue #4660: Refactors utf8 api, make it safer and easier to use. --- cocos/2d/ccUTF8.cpp | 602 +++++++++++++++----------------------------- cocos/2d/ccUTF8.h | 153 ++++++++--- 2 files changed, 321 insertions(+), 434 deletions(-) diff --git a/cocos/2d/ccUTF8.cpp b/cocos/2d/ccUTF8.cpp index 336314e696..98aa52e6b3 100644 --- a/cocos/2d/ccUTF8.cpp +++ b/cocos/2d/ccUTF8.cpp @@ -1,127 +1,36 @@ -/* - * This file uses some implementations of gutf8.c in glib. - * - * gutf8.c - Operations on UTF-8 strings. - * - * Copyright (C) 1999 Tom Tromey - * Copyright (C) 2000 Red Hat, Inc. - * Copyright (c) 2013-2014 Chukong Technologies Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ +/**************************************************************************** + Copyright (c) 2014 cocos2d-x.org + Copyright (c) 2014 Chukong Technologies Inc. + + http://www.cocos2d-x.org + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + ****************************************************************************/ #include "ccUTF8.h" #include "2d/platform/CCCommon.h" #include "base/CCConsole.h" +#include "ConvertUTF.h" NS_CC_BEGIN -int cc_wcslen(const unsigned short* str) -{ - int i=0; - while(*str++) i++; - return i; -} - -/* Code from GLIB gutf8.c starts here. */ - -#define UTF8_COMPUTE(Char, Mask, Len) \ -if (Char < 128) \ -{ \ -Len = 1; \ -Mask = 0x7f; \ -} \ -else if ((Char & 0xe0) == 0xc0) \ -{ \ -Len = 2; \ -Mask = 0x1f; \ -} \ -else if ((Char & 0xf0) == 0xe0) \ -{ \ -Len = 3; \ -Mask = 0x0f; \ -} \ -else if ((Char & 0xf8) == 0xf0) \ -{ \ -Len = 4; \ -Mask = 0x07; \ -} \ -else if ((Char & 0xfc) == 0xf8) \ -{ \ -Len = 5; \ -Mask = 0x03; \ -} \ -else if ((Char & 0xfe) == 0xfc) \ -{ \ -Len = 6; \ -Mask = 0x01; \ -} \ -else \ -Len = -1; - -#define UTF8_LENGTH(Char) \ -((Char) < 0x80 ? 1 : \ -((Char) < 0x800 ? 2 : \ -((Char) < 0x10000 ? 3 : \ -((Char) < 0x200000 ? 4 : \ -((Char) < 0x4000000 ? 5 : 6))))) - - -#define UTF8_GET(Result, Chars, Count, Mask, Len) \ -(Result) = (Chars)[0] & (Mask); \ -for ((Count) = 1; (Count) < (Len); ++(Count)) \ -{ \ -if (((Chars)[(Count)] & 0xc0) != 0x80) \ -{ \ -(Result) = -1; \ -break; \ -} \ -(Result) <<= 6; \ -(Result) |= ((Chars)[(Count)] & 0x3f); \ -} - -#define UNICODE_VALID(Char) \ -((Char) < 0x110000 && \ -(((Char) & 0xFFFFF800) != 0xD800) && \ -((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ -((Char) & 0xFFFE) != 0xFFFE) - - -static const char utf8_skip_data[256] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, - 5, 5, 5, 6, 6, 1, 1 -}; - -static const char *const g_utf8_skip = utf8_skip_data; - -#define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)]) +namespace StringUtils { /* * @str: the string to search through. @@ -129,14 +38,14 @@ static const char *const g_utf8_skip = utf8_skip_data; * * Return value: the index of the last character that is not c. * */ -unsigned int cc_utf8_find_last_not_char(std::vector str, unsigned short c) +unsigned int getIndexOfLastNotChar16(const std::vector& str, char16_t c) { int len = static_cast(str.size()); - + int i = len - 1; for (; i >= 0; --i) if (str[i] != c) return i; - + return i; } @@ -148,13 +57,13 @@ unsigned int cc_utf8_find_last_not_char(std::vector str, unsigne * * Return value: the trimmed string. * */ -static void cc_utf8_trim_from(std::vector* str, int index) +static void trimUTF16VectorFromIndex(std::vector& str, int index) { - int size = static_cast(str->size()); + int size = static_cast(str.size()); if (index >= size || index < 0) return; - - str->erase(str->begin() + index, str->begin() + size); + + str.erase(str.begin() + index, str.begin() + size); } /* @@ -164,14 +73,14 @@ static void cc_utf8_trim_from(std::vector* str, int index) * * Return value: weather the character is a whitespace character. * */ -bool isspace_unicode(unsigned short ch) +bool isUnicodeSpace(char16_t ch) { return (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 || ch == 0x1680 || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 || ch == 0x202F || ch == 0x205F || ch == 0x3000; } -bool iscjk_unicode(unsigned short ch) +bool isCJKUnicode(char16_t ch) { return (ch >= 0x4E00 && ch <= 0x9FBF) // CJK Unified Ideographs || (ch >= 0x2E80 && ch <= 0x2FDF) // CJK Radicals Supplement & Kangxi Radicals @@ -183,131 +92,165 @@ bool iscjk_unicode(unsigned short ch) || (ch >= 0x31C0 && ch <= 0x4DFF); // Other exiensions } -void cc_utf8_trim_ws(std::vector* str) +void trimUTF16Vector(std::vector& str) { - int len = static_cast(str->size()); - + int len = static_cast(str.size()); + if ( len <= 0 ) return; - + int last_index = len - 1; - + // Only start trimming if the last character is whitespace.. - if (isspace_unicode((*str)[last_index])) + if (isUnicodeSpace(str[last_index])) { for (int i = last_index - 1; i >= 0; --i) { - if (isspace_unicode((*str)[i])) + if (isUnicodeSpace(str[i])) last_index = i; else break; } - - cc_utf8_trim_from(str, last_index); + + trimUTF16VectorFromIndex(str, last_index); } } -/* - * cc_utf8_strlen: - * @p: pointer to the start of a UTF-8 encoded string. - * @max: the maximum number of bytes to examine. If @max - * is less than 0, then the string is assumed to be - * null-terminated. If @max is 0, @p will not be examined and - * may be %nullptr. - * - * Returns the length of the string in characters. - * - * Return value: the length of the string in characters - **/ -long -cc_utf8_strlen (const char * p, int max) +bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16) { - long len = 0; - const char *start = p; - - if (!(p != nullptr || max == 0)) + if (utf8.empty()) { - return 0; + outUtf16.clear(); + return true; } - - if (max < 0) + + bool ret = false; + const size_t utf16Bytes = (utf8.length()+1) << 1; + char16_t* utf16 = (char16_t*)malloc(utf16Bytes); + memset(utf16, 0, utf16Bytes); + + UTF16* utf16Start = (UTF16*)utf16; + UTF16* utf16End = ((UTF16*)utf16) + (utf8.length()); + + const UTF8* utf8Start = (const UTF8*)utf8.data(); + const UTF8* utf8End = ((const UTF8*)utf8.data()) + utf8.length(); + + if (conversionOK == ConvertUTF8toUTF16((const UTF8 **) &utf8Start, utf8End, &utf16Start, utf16End, strictConversion)) { - while (*p) - { - p = cc_utf8_next_char (p); - ++len; - } + outUtf16 = utf16; + ret = true; } - else - { - if (max == 0 || !*p) - return 0; - - p = cc_utf8_next_char (p); - - while (p - start < max && *p) - { - ++len; - p = cc_utf8_next_char (p); - } - - /* only do the last len increment if we got a complete - * char (don't count partial chars) - */ - if (p - start == max) - ++len; - } - - return len; + + free(utf16); + + return ret; } -/* - * g_utf8_get_char: - * @p: a pointer to Unicode character encoded as UTF-8 - * - * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. - * If @p does not point to a valid UTF-8 encoded character, results are - * undefined. If you are not sure that the bytes are complete - * valid Unicode characters, you should use g_utf8_get_char_validated() - * instead. - * - * Return value: the resulting character - **/ -static unsigned int -cc_utf8_get_char (const char * p) +bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8) { - int i, mask = 0, len; - unsigned int result; - unsigned char c = (unsigned char) *p; - - UTF8_COMPUTE (c, mask, len); - if (len == -1) - return (unsigned int) - 1; - UTF8_GET (result, p, i, mask, len); - - return result; + if (utf16.empty()) + { + outUtf8.clear(); + return true; + } + + bool ret = false; + const size_t utf8Bytes = (utf16.length() << 2) + 1; + char* utf8 = (char*)malloc(utf8Bytes); + memset(utf8, 0, utf8Bytes); + + UTF8 *utf8Start = (UTF8*)utf8; + UTF8 *utf8End = ((UTF8*)utf8) + (utf8Bytes -1); + + const UTF16* utf16Start = (const UTF16*)utf16.data(); + const UTF16* utf16End = ((const UTF16*)utf16.data()) + utf16.length(); + + if (conversionOK == ConvertUTF16toUTF8(&utf16Start, utf16End, &utf8Start, utf8End, strictConversion)) + { + outUtf8 = utf8; + ret = true; + } + + free(utf8); + + return ret; } - -unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = nullptr */) +std::vector getUTF16VectorFromUTF16String(const std::u16string& str) { - long len = cc_utf8_strlen(str_old, length); - if (rUtf16Size != nullptr) { - *rUtf16Size = static_cast(len); - } - - unsigned short* str_new = new unsigned short[len + 1]; - str_new[len] = 0; - - for (int i = 0; i < len; ++i) + std::vector str_new; + + size_t len = str.length(); + for (size_t i = 0; i < len; ++i) { - str_new[i] = cc_utf8_get_char(str_old); - str_old = cc_utf8_next_char(str_old); + str_new.push_back(str[i]); } - return str_new; } +std::vector getChar16VectorFromUTF16String(const std::u16string& utf16) +{ + std::vector ret; + size_t len = utf16.length(); + ret.reserve(len); + for (size_t i = 0; i < len; ++i) + { + ret.push_back(utf16[i]); + } + return ret; +} + +long getCharacterCountInUTF8String(const std::string& utf8) +{ + return getUTF8StringLength((const UTF8*)utf8.c_str()); +} + +} //namespace StringUtils { + + +int cc_wcslen(const unsigned short* str) +{ + int i=0; + while(*str++) i++; + return i; +} + +void cc_utf8_trim_ws(std::vector* str) +{ + // unsigned short and char16_t are both 2 bytes + std::vector* ret = reinterpret_cast*>(str); + StringUtils::trimUTF16Vector(*ret); +} + +bool isspace_unicode(unsigned short ch) +{ + return StringUtils::isUnicodeSpace(ch); +} + + +bool iscjk_unicode(unsigned short ch) +{ + return StringUtils::isCJKUnicode(ch); +} + + +long cc_utf8_strlen (const char * p, int max) +{ + CC_UNUSED_PARAM(max); + return StringUtils::getCharacterCountInUTF8String(p); +} + +unsigned int cc_utf8_find_last_not_char(const std::vector& str, unsigned short c) +{ + std::vector char16Vector; + for (const auto& e : str) + { + char16Vector.push_back(e); + } + + return StringUtils::getIndexOfLastNotChar16(char16Vector, c); +} + std::vector cc_utf16_vec_from_utf16_str(const unsigned short* str) { int len = cc_wcslen(str); @@ -320,209 +263,56 @@ std::vector cc_utf16_vec_from_utf16_str(const unsigned short* st return str_new; } -/** - * cc_unichar_to_utf8: - * @c: a ISO10646 character code - * @outbuf: output buffer, must have at least 6 bytes of space. - * If %nullptr, the length will be computed and returned - * and nothing will be written to @outbuf. - * - * Converts a single character to UTF-8. - * - * Return value: number of bytes written - **/ -int -cc_unichar_to_utf8 (unsigned int c, - char *outbuf) +unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1*/, int* rUtf16Size/* = nullptr*/) { - int len = 0; - int first; - int i; + if (str_old == nullptr) + return nullptr; - if (c < 0x80) + unsigned short* ret = nullptr; + + std::u16string outUtf16; + bool succeed = StringUtils::UTF8ToUTF16(str_old, outUtf16); + + if (succeed) { - first = 0; - len = 1; - } - else if (c < 0x800) - { - first = 0xc0; - len = 2; - } - else if (c < 0x10000) - { - first = 0xe0; - len = 3; - } - else if (c < 0x200000) - { - first = 0xf0; - len = 4; - } - else if (c < 0x4000000) - { - first = 0xf8; - len = 5; - } - else - { - first = 0xfc; - len = 6; + ret = new unsigned short[outUtf16.length() + 1]; + ret[outUtf16.length()] = 0; + memcpy(ret, outUtf16.data(), outUtf16.length()); } - if (outbuf) - { - for (i = len - 1; i > 0; --i) - { - outbuf[i] = (c & 0x3f) | 0x80; - c >>= 6; - } - outbuf[0] = c | first; - } - - return len; + return ret; } -#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000) - -/** - * cc_utf16_to_utf8: - * @str: a UTF-16 encoded string - * @len: the maximum length of @str to use. If @len < 0, then - * the string is terminated with a 0 character. - * @items_read: location to store number of words read, or %nullptr. - * If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be - * returned in case @str contains a trailing partial - * character. If an error occurs then the index of the - * invalid input is stored here. - * @items_written: location to store number of bytes written, or %nullptr. - * The value stored here does not include the trailing - * 0 byte. - * @error: location to store the error occuring, or %nullptr to ignore - * errors. Any of the errors in #GConvertError other than - * %G_CONVERT_ERROR_NO_CONVERSION may occur. - * - * Convert a string from UTF-16 to UTF-8. The result will be - * terminated with a 0 byte. - * - * Return value: a pointer to a newly allocated UTF-8 string. - * This value must be freed with free(). If an - * error occurs, %nullptr will be returned and - * @error set. - **/ -char * -cc_utf16_to_utf8 (const unsigned short *str, - int len, - long *items_read, - long *items_written) +char * cc_utf16_to_utf8 (const unsigned short *str, + int len, + long *items_read, + long *items_written) { - /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ - * are marked. - */ - const unsigned short *in; - char *out; - char *result = nullptr; - int n_bytes; - unsigned int high_surrogate; + if (str == nullptr) + return nullptr; - if (str == 0) return nullptr; - n_bytes = 0; - in = str; - high_surrogate = 0; - while ((len < 0 || in - str < len) && *in) + std::u16string utf16; + int utf16Len = len < 0 ? cc_wcslen(str) : len; + + for (int i = 0; i < utf16Len; ++i) { - unsigned short c = *in; - unsigned int wc; - - if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ - { - if (high_surrogate) - { - wc = SURROGATE_VALUE (high_surrogate, c); - high_surrogate = 0; - } - else - { - CCLOGERROR("Invalid sequence in conversion input"); - goto err_out; - } - } - else - { - if (high_surrogate) - { - CCLOGERROR("Invalid sequence in conversion input"); - goto err_out; - } - - if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ - { - high_surrogate = c; - goto next1; - } - else - wc = c; - } - - /********** DIFFERENT for UTF8/UCS4 **********/ - n_bytes += UTF8_LENGTH (wc); - - next1: - in++; + utf16.push_back(str[i]); } - if (high_surrogate && !items_read) - { - CCLOGERROR("Partial character sequence at end of input"); - goto err_out; - } + char* ret = nullptr; + std::string outUtf8; + bool succeed = StringUtils::UTF16ToUTF8(utf16, outUtf8); - /* At this point, everything is valid, and we just need to convert - */ - /********** DIFFERENT for UTF8/UCS4 **********/ - result = new char[n_bytes + 1]; - - high_surrogate = 0; - out = result; - in = str; - while (out < result + n_bytes) + if (succeed) { - unsigned short c = *in; - unsigned int wc; - - if (c >= 0xdc00 && c < 0xe000) /* low surrogate */ - { - wc = SURROGATE_VALUE (high_surrogate, c); - high_surrogate = 0; - } - else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */ - { - high_surrogate = c; - goto next2; - } - else - wc = c; - - /********** DIFFERENT for UTF8/UCS4 **********/ - out += cc_unichar_to_utf8 (wc, out); - - next2: - in++; + ret = new char[outUtf8.length() + 1]; + ret[outUtf8.length()] = '\0'; + memcpy(ret, outUtf8.data(), outUtf8.length()); } - /********** DIFFERENT for UTF8/UCS4 **********/ - *out = '\0'; - - if (items_written) - /********** DIFFERENT for UTF8/UCS4 **********/ - *items_written = out - result; - -err_out: - if (items_read) - *items_read = in - str; - - return result; + return ret; } + NS_CC_END diff --git a/cocos/2d/ccUTF8.h b/cocos/2d/ccUTF8.h index 17c5dd0cc2..91e66ce10c 100644 --- a/cocos/2d/ccUTF8.h +++ b/cocos/2d/ccUTF8.h @@ -1,35 +1,131 @@ -/* - * Copyright (C) 1999 Tom Tromey - * Copyright (C) 2000 Red Hat, Inc. - * Copyright (c) 2013-2014 Chukong Technologies Inc. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - */ +/**************************************************************************** + Copyright (c) 2014 cocos2d-x.org + Copyright (c) 2014 Chukong Technologies Inc. + + http://www.cocos2d-x.org + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + ****************************************************************************/ #ifndef __cocos2dx__ccUTF8__ #define __cocos2dx__ccUTF8__ #include "base/CCPlatformMacros.h" #include +#include NS_CC_BEGIN +namespace StringUtils { + +/** + * @brief Converts utf8 string to utf16 string + * @param utf8 The utf8 string to be converted + * @param outUtf16 The output utf16 string + * @return true if succeed, otherwise false + * @note Please check the return value before using \p outUtf16 + * e.g. + * @code + * std::u16string utf16; + * bool ret = StringUtils::UTF8ToUTF16("你好hello", utf16); + * if (ret) { + * do_some_thing_with_utf16(utf16); + * } + * @endcode + */ +CC_DLL bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16); + +/** + * @brief Converts utf16 string to utf8 string + * @param utf16 The utf16 string to be converted + * @param outUtf8 The output utf8 string + * @return true if succeed, otherwise false + * @note Please check the return value before using \p outUtf8 + * e.g. + * @code + * std::string utf8; + * bool ret = StringUtils::UTF16ToUTF8(u"\u4f60\u597d", utf16); + * if (ret) { + * do_some_thing_with_utf8(utf8); + * } + * @endcode + */ +CC_DLL bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8); + +/** + * @brief Trims the unicode spaces at the end of char16_t vector + */ +CC_DLL void trimUTF16Vector(std::vector& str); + +/** + * @brief Whether the character is a whitespace character. + * + * @param ch the unicode character + * @returns whether the character is a white space character. + * + * @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode + * + */ +CC_DLL bool isUnicodeSpace(char16_t ch); + +/** + * @brief Whether the character is a Chinese/Japanese/Korean character. + * + * @param ch the unicode character + * @returns whether the character is a Chinese character. + * + * @see http://www.searchtb.com/2012/04/chinese_encode.html + * @see http://tieba.baidu.com/p/748765987 + * + */ +CC_DLL bool isCJKUnicode(char16_t ch); + +/** + * @brief Returns the length of the string in characters. + * + * @param utf8 an UTF-8 encoded string. + * @returns the length of the string in characters + */ +CC_DLL long getCharacterCountInUTF8String(const std::string& utf8); + +/** + * @brief Gets the index of the last character that is not equal to the character given. + * + * @param str the string to be searched. + * @param c the character to be searched for. + * + * @returns the index of the last character that is not \p c. + * + */ +CC_DLL unsigned int getIndexOfLastNotChar16(const std::vector& str, char16_t c); + +/** + * @brief Gets char16_t vector from a given utf16 string + */ +CC_DLL std::vector getChar16VectorFromUTF16String(const std::u16string& utf16); + +} // namespace StringUtils { + + CC_DLL int cc_wcslen(const unsigned short* str); -CC_DLL void cc_utf8_trim_ws(std::vector* str); +CC_DEPRECATED_ATTRIBUTE void cc_utf8_trim_ws(std::vector* str); /** * Whether the character is a whitespace character. @@ -39,7 +135,7 @@ CC_DLL void cc_utf8_trim_ws(std::vector* str); * * @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode * */ -CC_DLL bool isspace_unicode(unsigned short ch); +CC_DEPRECATED_ATTRIBUTE bool isspace_unicode(unsigned short ch); /** * Whether the character is a Chinese/Japanese/Korean character. @@ -50,7 +146,7 @@ CC_DLL bool isspace_unicode(unsigned short ch); * @see http://www.searchtb.com/2012/04/chinese_encode.html * @see http://tieba.baidu.com/p/748765987 * */ -CC_DLL bool iscjk_unicode(unsigned short ch); +CC_DEPRECATED_ATTRIBUTE bool iscjk_unicode(unsigned short ch); /** * Returns the length of the string in characters. @@ -62,7 +158,7 @@ CC_DLL bool iscjk_unicode(unsigned short ch); * * @returns the length of the string in characters **/ -CC_DLL long +CC_DEPRECATED_ATTRIBUTE long cc_utf8_strlen (const char * p, int max); /** @@ -73,9 +169,9 @@ cc_utf8_strlen (const char * p, int max); * * @returns the index of the last character that is not \p c. * */ -CC_DLL unsigned int cc_utf8_find_last_not_char(std::vector str, unsigned short c); +CC_DEPRECATED_ATTRIBUTE unsigned int cc_utf8_find_last_not_char(const std::vector& str, unsigned short c); -CC_DLL std::vector cc_utf16_vec_from_utf16_str(const unsigned short* str); +CC_DEPRECATED_ATTRIBUTE std::vector cc_utf16_vec_from_utf16_str(const unsigned short* str); /** * Creates a utf8 string from a cstring. @@ -84,7 +180,7 @@ CC_DLL std::vector cc_utf16_vec_from_utf16_str(const unsigned sh * * @returns the newly created utf8 string. * */ -CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr); +CC_DEPRECATED_ATTRIBUTE unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr); /** * Convert a string from UTF-16 to UTF-8. The result will be null terminated. @@ -103,12 +199,13 @@ CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, in * @returns a pointer to a newly allocated UTF-8 string. This value must be * freed with free(). If an error occurs, %nullptr will be returned. **/ -CC_DLL char * +CC_DEPRECATED_ATTRIBUTE char * cc_utf16_to_utf8 (const unsigned short *str, int len, long *items_read, long *items_written); + NS_CC_END #endif /* defined(__cocos2dx__ccUTF8__) */