issue #4660: Refactors utf8 api, make it safer and easier to use.

2014-05-08 11:10:54 +08:00 · 2014-05-08 11:10:54 +08:00 · 20a8808b78
parent 42ffd25fca
commit 20a8808b78
2 changed files with 321 additions and 434 deletions
--- a/cocos/2d/ccUTF8.cpp
+++ b/cocos/2d/ccUTF8.cpp
@ -1,127 +1,36 @@
-/* 
- * This file uses some implementations of gutf8.c in glib.
- *
- * gutf8.c - Operations on UTF-8 strings.
- *
- * Copyright (C) 1999      Tom Tromey
- * Copyright (C) 2000      Red Hat, Inc.
- * Copyright (c) 2013-2014 Chukong Technologies Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
+/****************************************************************************
+ Copyright (c) 2014 cocos2d-x.org
+ Copyright (c) 2014 Chukong Technologies Inc.
+
+ http://www.cocos2d-x.org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ ****************************************************************************/

 #include "ccUTF8.h"
 #include "2d/platform/CCCommon.h"
 #include "base/CCConsole.h"
+#include "ConvertUTF.h"

 NS_CC_BEGIN

-int cc_wcslen(const unsigned short* str)
-{
-    int i=0;
-    while(*str++) i++;
-    return i;
-}
-
-/* Code from GLIB gutf8.c starts here. */
-
-#define UTF8_COMPUTE(Char, Mask, Len)        \
-if (Char < 128)                \
-{                        \
-Len = 1;                    \
-Mask = 0x7f;                \
-}                        \
-else if ((Char & 0xe0) == 0xc0)        \
-{                        \
-Len = 2;                    \
-Mask = 0x1f;                \
-}                        \
-else if ((Char & 0xf0) == 0xe0)        \
-{                        \
-Len = 3;                    \
-Mask = 0x0f;                \
-}                        \
-else if ((Char & 0xf8) == 0xf0)        \
-{                        \
-Len = 4;                    \
-Mask = 0x07;                \
-}                        \
-else if ((Char & 0xfc) == 0xf8)        \
-{                        \
-Len = 5;                    \
-Mask = 0x03;                \
-}                        \
-else if ((Char & 0xfe) == 0xfc)        \
-{                        \
-Len = 6;                    \
-Mask = 0x01;                \
-}                        \
-else                        \
-Len = -1;
-
-#define UTF8_LENGTH(Char)            \
-((Char) < 0x80 ? 1 :                \
-((Char) < 0x800 ? 2 :            \
-((Char) < 0x10000 ? 3 :            \
-((Char) < 0x200000 ? 4 :            \
-((Char) < 0x4000000 ? 5 : 6)))))
-
-
-#define UTF8_GET(Result, Chars, Count, Mask, Len)    \
-(Result) = (Chars)[0] & (Mask);            \
-for ((Count) = 1; (Count) < (Len); ++(Count))        \
-{                            \
-if (((Chars)[(Count)] & 0xc0) != 0x80)        \
-{                        \
-(Result) = -1;                \
-break;                    \
-}                        \
-(Result) <<= 6;                    \
-(Result) |= ((Chars)[(Count)] & 0x3f);        \
-}
-
-#define UNICODE_VALID(Char)            \
-((Char) < 0x110000 &&                \
-(((Char) & 0xFFFFF800) != 0xD800) &&        \
-((Char) < 0xFDD0 || (Char) > 0xFDEF) &&    \
-((Char) & 0xFFFE) != 0xFFFE)
-
-
-static const char utf8_skip_data[256] = {
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
-    5, 5, 5, 6, 6, 1, 1
-};
-
-static const char *const g_utf8_skip = utf8_skip_data;
-
-#define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])
+namespace StringUtils {

 /*
 * @str:    the string to search through.
@ -129,14 +38,14 @@ static const char *const g_utf8_skip = utf8_skip_data;
 *
 * Return value: the index of the last character that is not c.
 * */
-unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c)
+unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c)
 {
    int len = static_cast<int>(str.size());
-    
+
    int i = len - 1;
    for (; i >= 0; --i)
        if (str[i] != c) return i;
-    
+
    return i;
 }

@ -148,13 +57,13 @@ unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigne
 *
 * Return value: the trimmed string.
 * */
-static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
+static void trimUTF16VectorFromIndex(std::vector<char16_t>& str, int index)
 {
-    int size = static_cast<int>(str->size());
+    int size = static_cast<int>(str.size());
    if (index >= size || index < 0)
        return;
-    
-    str->erase(str->begin() + index, str->begin() + size);
+
+    str.erase(str.begin() + index, str.begin() + size);
 }

 /*
@ -164,14 +73,14 @@ static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
 *
 * Return value: weather the character is a whitespace character.
 * */
-bool isspace_unicode(unsigned short ch)
+bool isUnicodeSpace(char16_t ch)
 {
    return  (ch >= 0x0009 && ch <= 0x000D) || ch == 0x0020 || ch == 0x0085 || ch == 0x00A0 || ch == 0x1680
    || (ch >= 0x2000 && ch <= 0x200A) || ch == 0x2028 || ch == 0x2029 || ch == 0x202F
    ||  ch == 0x205F || ch == 0x3000;
 }

-bool iscjk_unicode(unsigned short ch)
+bool isCJKUnicode(char16_t ch)
 {
    return (ch >= 0x4E00 && ch <= 0x9FBF)   // CJK Unified Ideographs
        || (ch >= 0x2E80 && ch <= 0x2FDF)   // CJK Radicals Supplement & Kangxi Radicals
@ -183,131 +92,165 @@ bool iscjk_unicode(unsigned short ch)
        || (ch >= 0x31C0 && ch <= 0x4DFF);  // Other exiensions
 }

-void cc_utf8_trim_ws(std::vector<unsigned short>* str)
+void trimUTF16Vector(std::vector<char16_t>& str)
 {
-    int len = static_cast<int>(str->size());
-    
+    int len = static_cast<int>(str.size());
+
    if ( len <= 0 )
        return;
-    
+
    int last_index = len - 1;
-    
+
    // Only start trimming if the last character is whitespace..
-    if (isspace_unicode((*str)[last_index]))
+    if (isUnicodeSpace(str[last_index]))
    {
        for (int i = last_index - 1; i >= 0; --i)
        {
-            if (isspace_unicode((*str)[i]))
+            if (isUnicodeSpace(str[i]))
                last_index = i;
            else
                break;
        }
-        
-        cc_utf8_trim_from(str, last_index);
+
+        trimUTF16VectorFromIndex(str, last_index);
    }
 }

-/*
- * cc_utf8_strlen:
- * @p: pointer to the start of a UTF-8 encoded string.
- * @max: the maximum number of bytes to examine. If @max
- *       is less than 0, then the string is assumed to be
- *       null-terminated. If @max is 0, @p will not be examined and
- *       may be %nullptr.
- *
- * Returns the length of the string in characters.
- *
- * Return value: the length of the string in characters
- **/
-long
-cc_utf8_strlen (const char * p, int max)
+/*
+ * Converts a UTF-8 encoded std::string into a UTF-16 string.
+ *
+ * @utf8:     the UTF-8 bytes to convert; all utf8.length() bytes are handed
+ *            to the converter, including any embedded NUL bytes.
+ * @outUtf16: receives the converted code units on success. It is cleared for
+ *            an empty input and left untouched when the conversion fails.
+ *
+ * Return value: true for an empty input or when ConvertUTF8toUTF16() reports
+ *               conversionOK, false otherwise.
+ * */
+bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16)
 {
-    long len = 0;
-    const char *start = p;
-    
-    if (!(p != nullptr || max == 0))
+    if (utf8.empty())
     {
-        return 0;
+        outUtf16.clear();
+        return true;
     }
-    
-    if (max < 0)
+
+    // A UTF-8 sequence never yields more UTF-16 code units than it has bytes,
+    // so one unit per input byte is always enough room. Using a string as the
+    // scratch buffer removes the unchecked malloc()/free() pair.
+    std::u16string buffer(utf8.length(), u'\0');
+
+    UTF16* const utf16Begin = reinterpret_cast<UTF16*>(&buffer[0]);
+    UTF16* utf16Start = utf16Begin;
+    UTF16* utf16End = utf16Begin + buffer.length();
+
+    const UTF8* utf8Start = reinterpret_cast<const UTF8*>(utf8.data());
+    const UTF8* utf8End = utf8Start + utf8.length();
+
+    bool ret = false;
+    if (conversionOK == ConvertUTF8toUTF16(&utf8Start, utf8End, &utf16Start, utf16End, strictConversion))
     {
-        while (*p)
-        {
-            p = cc_utf8_next_char (p);
-            ++len;
-        }
+        // The converter advances utf16Start past the last unit it wrote, so the
+        // pointer difference is the exact output length (no NUL scan needed).
+        outUtf16.assign(buffer, 0, static_cast<size_t>(utf16Start - utf16Begin));
+        ret = true;
     }
-    else
-    {
-        if (max == 0 || !*p)
-            return 0;
-        
-        p = cc_utf8_next_char (p);
-        
-        while (p - start < max && *p)
-        {
-            ++len;
-            p = cc_utf8_next_char (p);
-        }
-        
-        /* only do the last len increment if we got a complete
-         * char (don't count partial chars)
-         */
-        if (p - start == max)
-            ++len;
-    }
-    
-    return len;
+
+    return ret;
 }

-/*
- * g_utf8_get_char:
- * @p: a pointer to Unicode character encoded as UTF-8
- *
- * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
- * If @p does not point to a valid UTF-8 encoded character, results are
- * undefined. If you are not sure that the bytes are complete
- * valid Unicode characters, you should use g_utf8_get_char_validated()
- * instead.
- *
- * Return value: the resulting character
- **/
-static unsigned int
-cc_utf8_get_char (const char * p)
+/*
+ * Converts a UTF-16 string into a UTF-8 encoded std::string.
+ *
+ * @utf16:   the UTF-16 code units to convert; all utf16.length() units are
+ *           handed to the converter, including any embedded 0 units.
+ * @outUtf8: receives the converted bytes on success. It is cleared for an
+ *           empty input and left untouched when the conversion fails.
+ *
+ * Return value: true for an empty input or when ConvertUTF16toUTF8() reports
+ *               conversionOK, false otherwise.
+ * */
+bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8)
 {
-    int i, mask = 0, len;
-    unsigned int result;
-    unsigned char c = (unsigned char) *p;
-    
-    UTF8_COMPUTE (c, mask, len);
-    if (len == -1)
-        return (unsigned int) - 1;
-    UTF8_GET (result, p, i, mask, len);
-    
-    return result;
+    if (utf16.empty())
+    {
+        outUtf8.clear();
+        return true;
+    }
+
+    // Four bytes per code unit is more than any UTF-16 unit can need in UTF-8.
+    // Using a string as the scratch buffer removes the unchecked
+    // malloc()/free() pair.
+    std::string buffer(utf16.length() * 4, '\0');
+
+    UTF8* const utf8Begin = reinterpret_cast<UTF8*>(&buffer[0]);
+    UTF8* utf8Start = utf8Begin;
+    UTF8* utf8End = utf8Begin + buffer.length();
+
+    const UTF16* utf16Start = reinterpret_cast<const UTF16*>(utf16.data());
+    const UTF16* utf16End = utf16Start + utf16.length();
+
+    bool ret = false;
+    if (conversionOK == ConvertUTF16toUTF8(&utf16Start, utf16End, &utf8Start, utf8End, strictConversion))
+    {
+        // The converter advances utf8Start past the last byte it wrote, so the
+        // pointer difference is the exact output length (no NUL scan needed).
+        outUtf8.assign(buffer, 0, static_cast<size_t>(utf8Start - utf8Begin));
+        ret = true;
+    }
+
+    return ret;
 }

-
-unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = nullptr */)
+/*
+ * Copies every code unit of a UTF-16 string into a vector, preserving order.
+ *
+ * @str: the UTF-16 string to copy.
+ *
+ * Return value: a vector holding str.length() code units.
+ *
+ * NOTE(review): this does the same job as getChar16VectorFromUTF16String()
+ * and is not declared in the visible part of ccUTF8.h - consider removing one.
+ * */
+std::vector<char16_t> getUTF16VectorFromUTF16String(const std::u16string& str)
 {
-    long len = cc_utf8_strlen(str_old, length);
-    if (rUtf16Size != nullptr) {
-        *rUtf16Size = static_cast<int>(len);
-    }
-    
-    unsigned short* str_new = new unsigned short[len + 1];
-    str_new[len] = 0;
-    
-    for (int i = 0; i < len; ++i)
+    std::vector<char16_t> str_new;
+
+    size_t len = str.length();
+    // The final size is known up front, so allocate once instead of growing.
+    str_new.reserve(len);
+    for (size_t i = 0; i < len; ++i)
     {
-        str_new[i] = cc_utf8_get_char(str_old);
-        str_old = cc_utf8_next_char(str_old);
+        str_new.push_back(str[i]);
     }
-    
     return str_new;
 }

+/*
+ * Builds a char16_t vector holding the code units of a UTF-16 string.
+ *
+ * @utf16: the UTF-16 string whose units are copied.
+ *
+ * Return value: a vector with utf16.length() elements in the original order.
+ * */
+std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16)
+{
+    // The iterator-range constructor copies each unit in order.
+    return std::vector<char16_t>(utf16.begin(), utf16.end());
+}
+
+/*
+ * Reports the character count of a UTF-8 string by handing its
+ * NUL-terminated bytes to getUTF8StringLength().
+ *
+ * @utf8: the UTF-8 encoded string to measure.
+ *
+ * Return value: whatever getUTF8StringLength() returns for utf8.c_str().
+ *
+ * NOTE(review): only c_str() is passed, not the length, so bytes after an
+ * embedded NUL are presumably not counted - confirm against
+ * getUTF8StringLength().
+ * */
+long getCharacterCountInUTF8String(const std::string& utf8)
+{
+    return getUTF8StringLength((const UTF8*)utf8.c_str());
+}
+
+} //namespace StringUtils {
+
+
+/*
+ * Counts the 16-bit units that precede the terminating 0 unit in str.
+ *
+ * @str: a 0-terminated buffer; it is dereferenced unconditionally, so it
+ *       must not be nullptr.
+ *
+ * Return value: the number of units before the terminator.
+ * */
+int cc_wcslen(const unsigned short* str)
+{
+    const unsigned short* cursor = str;
+    while (*cursor != 0)
+    {
+        ++cursor;
+    }
+    return static_cast<int>(cursor - str);
+}
+
+/*
+ * Deprecated wrapper: trims trailing Unicode whitespace from *str by
+ * delegating to StringUtils::trimUTF16Vector().
+ *
+ * @str: the vector to trim in place; nullptr is ignored.
+ * */
+void cc_utf8_trim_ws(std::vector<unsigned short>* str)
+{
+    if (str == nullptr)
+        return;
+
+    // std::vector<unsigned short> and std::vector<char16_t> are unrelated
+    // types, so accessing one through a pointer to the other is undefined
+    // behaviour even when the element sizes match. Convert explicitly instead.
+    std::vector<char16_t> converted(str->begin(), str->end());
+    StringUtils::trimUTF16Vector(converted);
+    str->assign(converted.begin(), converted.end());
+}
+
+// Deprecated name kept for existing callers; forwards to
+// StringUtils::isUnicodeSpace() and returns its result unchanged.
+bool isspace_unicode(unsigned short ch)
+{
+    return StringUtils::isUnicodeSpace(ch);
+}
+
+
+// Deprecated name kept for existing callers; forwards to
+// StringUtils::isCJKUnicode() and returns its result unchanged.
+bool iscjk_unicode(unsigned short ch)
+{
+    return StringUtils::isCJKUnicode(ch);
+}
+
+
+/*
+ * Deprecated wrapper around StringUtils::getCharacterCountInUTF8String().
+ *
+ * @p:   pointer to a NUL-terminated UTF-8 string; nullptr yields 0.
+ * @max: 0 yields 0 without examining @p, as the previous implementation did.
+ *
+ * Return value: the character count reported by
+ *               StringUtils::getCharacterCountInUTF8String(), or 0 for the
+ *               cases above.
+ *
+ * NOTE(review): a positive @max is not enforced as a byte limit here; the
+ * whole NUL-terminated string is measured. Confirm no caller relies on the
+ * old bounded behaviour.
+ * */
+long cc_utf8_strlen (const char * p, int max)
+{
+    // Constructing a std::string from nullptr is undefined, so bail out first.
+    if (p == nullptr || max == 0)
+        return 0;
+
+    return StringUtils::getCharacterCountInUTF8String(p);
+}
+
+// Deprecated wrapper: re-types the units as char16_t and returns what
+// StringUtils::getIndexOfLastNotChar16() reports for them.
+unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c)
+{
+    // Element-wise conversion; the range constructor copies every unit in order.
+    const std::vector<char16_t> units(str.begin(), str.end());
+
+    return StringUtils::getIndexOfLastNotChar16(units, c);
+}
+
 std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str)
 {
    int len = cc_wcslen(str);
@ -320,209 +263,56 @@ std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* st
    return str_new;
 }

-/**
- * cc_unichar_to_utf8:
- * @c: a ISO10646 character code
- * @outbuf: output buffer, must have at least 6 bytes of space.
- *       If %nullptr, the length will be computed and returned
- *       and nothing will be written to @outbuf.
- *
- * Converts a single character to UTF-8.
- *
- * Return value: number of bytes written
- **/
-int
-cc_unichar_to_utf8 (unsigned int c,
-                   char   *outbuf)
+/*
+ * Deprecated wrapper around StringUtils::UTF8ToUTF16().
+ *
+ * @str_old:    NUL-terminated UTF-8 string; nullptr yields nullptr.
+ * @length:     not used by this implementation.
+ *              NOTE(review): the whole NUL-terminated string is converted
+ *              regardless of @length - confirm no caller relies on a limit.
+ * @rUtf16Size: optional; receives the number of UTF-16 code units in the
+ *              result (without the terminator), or 0 when nullptr is returned.
+ *
+ * Return value: a 0-terminated array allocated with new[], or nullptr when
+ *               @str_old is nullptr or the conversion fails.
+ * */
+unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1*/, int* rUtf16Size/* = nullptr*/)
 {
-    int len = 0;
-    int first;
-    int i;
+    CC_UNUSED_PARAM(length);
+
+    // Give the optional out-parameter a defined value on every path.
+    if (rUtf16Size != nullptr)
+        *rUtf16Size = 0;
+
+    if (str_old == nullptr)
+        return nullptr;
     
-    if (c < 0x80)
+    unsigned short* ret = nullptr;
+    
+    std::u16string outUtf16;
+    bool succeed = StringUtils::UTF8ToUTF16(str_old, outUtf16);
+    
+    if (succeed)
     {
-        first = 0;
-        len = 1;
-    }
-    else if (c < 0x800)
-    {
-        first = 0xc0;
-        len = 2;
-    }
-    else if (c < 0x10000)
-    {
-        first = 0xe0;
-        len = 3;
-    }
-    else if (c < 0x200000)
-    {
-        first = 0xf0;
-        len = 4;
-    }
-    else if (c < 0x4000000)
-    {
-        first = 0xf8;
-        len = 5;
-    }
-    else
-    {
-        first = 0xfc;
-        len = 6;
+        const size_t count = outUtf16.length();
+        ret = new unsigned short[count + 1];
+
+        // Copy unit by unit: the previous memcpy() passed a unit count as a
+        // byte count and therefore copied only half of the string.
+        for (size_t i = 0; i < count; ++i)
+        {
+            ret[i] = static_cast<unsigned short>(outUtf16[i]);
+        }
+        ret[count] = 0;
+
+        if (rUtf16Size != nullptr)
+            *rUtf16Size = static_cast<int>(count);
     }
     
-    if (outbuf)
-    {
-        for (i = len - 1; i > 0; --i)
-        {
-            outbuf[i] = (c & 0x3f) | 0x80;
-            c >>= 6;
-        }
-        outbuf[0] = c | first;
-    }
-    
-    return len;
+    return ret;
 }

-#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
-
-/**
- * cc_utf16_to_utf8:
- * @str: a UTF-16 encoded string
- * @len: the maximum length of @str to use. If @len < 0, then
- *       the string is terminated with a 0 character.
- * @items_read: location to store number of words read, or %nullptr.
- *              If %nullptr, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
- *              returned in case @str contains a trailing partial
- *              character. If an error occurs then the index of the
- *              invalid input is stored here.
- * @items_written: location to store number of bytes written, or %nullptr.
- *                 The value stored here does not include the trailing
- *                 0 byte.
- * @error: location to store the error occuring, or %nullptr to ignore
- *         errors. Any of the errors in #GConvertError other than
- *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
- *
- * Convert a string from UTF-16 to UTF-8. The result will be
- * terminated with a 0 byte.
- *
- * Return value: a pointer to a newly allocated UTF-8 string.
- *               This value must be freed with free(). If an
- *               error occurs, %nullptr will be returned and
- *               @error set.
- **/
-char *
-cc_utf16_to_utf8 (const unsigned short  *str,
-                 int             len,
-                 long            *items_read,
-                 long            *items_written)
+/*
+ * Deprecated wrapper around StringUtils::UTF16ToUTF8().
+ *
+ * @str:           UTF-16 input; nullptr yields nullptr.
+ * @len:           maximum number of units to read. Reading also stops at the
+ *                 first 0 unit; if @len < 0 only the 0 unit ends the input.
+ * @items_read:    optional; receives the number of units consumed on success
+ *                 and 0 on failure.
+ * @items_written: optional; receives the number of bytes written, without
+ *                 the terminating 0 byte, on success and 0 on failure.
+ *
+ * Return value: a 0-terminated UTF-8 string allocated with new[], or nullptr
+ *               when @str is nullptr or the conversion fails.
+ *
+ * NOTE(review): the header comment says the result must be released with
+ * free(), but the buffer comes from new[] - callers need delete[].
+ * */
+char * cc_utf16_to_utf8 (const unsigned short  *str,
+                  int             len,
+                  long            *items_read,
+                  long            *items_written)
 {
-    /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
-     * are marked.
-     */
-    const unsigned short *in;
-    char *out;
-    char *result = nullptr;
-    int n_bytes;
-    unsigned int high_surrogate;
+    // Give the optional out-parameters a defined value on every path.
+    if (items_read != nullptr)
+        *items_read = 0;
+    if (items_written != nullptr)
+        *items_written = 0;
+
+    if (str == nullptr)
+        return nullptr;
     
-    if (str == 0) return nullptr;
     
-    n_bytes = 0;
-    in = str;
-    high_surrogate = 0;
-    while ((len < 0 || in - str < len) && *in)
+    std::u16string utf16;
+    
+    // Same bounds as the replaced loop: honour len when it is given, but never
+    // read past the 0 terminator.
+    for (int i = 0; (len < 0 || i < len) && str[i] != 0; ++i)
     {
-        unsigned short c = *in;
-        unsigned int wc;
-        
-        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
-        {
-            if (high_surrogate)
-            {
-                wc = SURROGATE_VALUE (high_surrogate, c);
-                high_surrogate = 0;
-            }
-            else
-            {
-                CCLOGERROR("Invalid sequence in conversion input");
-                goto err_out;
-            }
-        }
-        else
-        {
-            if (high_surrogate)
-            {
-                CCLOGERROR("Invalid sequence in conversion input");
-                goto err_out;
-            }
-            
-            if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
-            {
-                high_surrogate = c;
-                goto next1;
-            }
-            else
-                wc = c;
-        }
-        
-        /********** DIFFERENT for UTF8/UCS4 **********/
-        n_bytes += UTF8_LENGTH (wc);
-        
-    next1:
-        in++;
+        utf16.push_back(str[i]);
     }
     
-    if (high_surrogate && !items_read)
-    {        
-        CCLOGERROR("Partial character sequence at end of input");
-        goto err_out;
-    }
+    char* ret = nullptr;
+    std::string outUtf8;
+    bool succeed = StringUtils::UTF16ToUTF8(utf16, outUtf8);
     
-    /* At this point, everything is valid, and we just need to convert
-     */
-    /********** DIFFERENT for UTF8/UCS4 **********/
-    result = new char[n_bytes + 1];
-    
-    high_surrogate = 0;
-    out = result;
-    in = str;
-    while (out < result + n_bytes)
+    if (succeed)
     {
-        unsigned short c = *in;
-        unsigned int wc;
-        
-        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
-        {
-            wc = SURROGATE_VALUE (high_surrogate, c);
-            high_surrogate = 0;
-        }
-        else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
-        {
-            high_surrogate = c;
-            goto next2;
-        }
-        else
-            wc = c;
-        
-        /********** DIFFERENT for UTF8/UCS4 **********/
-        out += cc_unichar_to_utf8 (wc, out);
-        
-    next2:
-        in++;
+        ret = new char[outUtf8.length() + 1];
+        outUtf8.copy(ret, outUtf8.length());
+        ret[outUtf8.length()] = '\0';
+
+        // Report progress through the out-parameters, which were previously
+        // never written.
+        if (items_read != nullptr)
+            *items_read = static_cast<long>(utf16.length());
+        if (items_written != nullptr)
+            *items_written = static_cast<long>(outUtf8.length());
     }
     
-    /********** DIFFERENT for UTF8/UCS4 **********/
-    *out = '\0';
-    
-    if (items_written)
-    /********** DIFFERENT for UTF8/UCS4 **********/
-        *items_written = out - result;
-    
-err_out:
-    if (items_read)
-        *items_read = in - str;
-    
-    return result;
+    return ret;
 }

+
 NS_CC_END
--- a/cocos/2d/ccUTF8.h
+++ b/cocos/2d/ccUTF8.h
@ -1,35 +1,131 @@
-/*
- * Copyright (C) 1999      Tom Tromey
- * Copyright (C) 2000      Red Hat, Inc.
- * Copyright (c) 2013-2014 Chukong Technologies Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
+/****************************************************************************
+ Copyright (c) 2014 cocos2d-x.org
+ Copyright (c) 2014 Chukong Technologies Inc.
+
+ http://www.cocos2d-x.org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ ****************************************************************************/

 #ifndef __cocos2dx__ccUTF8__
 #define __cocos2dx__ccUTF8__

 #include "base/CCPlatformMacros.h"
 #include <vector>
+#include <string>

 NS_CC_BEGIN

+namespace StringUtils {
+
+/**
+ *  @brief Converts utf8 string to utf16 string
+ *  @param utf8 The utf8 string to be converted
+ *  @param outUtf16 The output utf16 string
+ *  @return true if succeed, otherwise false
+ *  @note Please check the return value before using \p outUtf16
+ *  e.g.
+ *  @code
+ *    std::u16string utf16;
+ *    bool ret = StringUtils::UTF8ToUTF16("你好hello", utf16);
+ *    if (ret) {
+ *        do_some_thing_with_utf16(utf16);
+ *    }
+ *  @endcode
+ */
+CC_DLL bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16);
+
+/**
+ *  @brief Converts utf16 string to utf8 string
+ *  @param utf16 The utf16 string to be converted
+ *  @param outUtf8 The output utf8 string
+ *  @return true if succeed, otherwise false
+ *  @note Please check the return value before using \p outUtf8
+ *  e.g.
+ *  @code
+ *    std::string utf8;
+ *    bool ret = StringUtils::UTF16ToUTF8(u"\u4f60\u597d", utf8);
+ *    if (ret) {
+ *        do_some_thing_with_utf8(utf8);
+ *    }
+ *  @endcode
+ */
+CC_DLL bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8);
+
+/**
+ *  @brief Trims the unicode spaces at the end of char16_t vector
+ */
+CC_DLL void trimUTF16Vector(std::vector<char16_t>& str);
+
+/**
+ *  @brief Whether the character is a whitespace character.
+ *
+ *  @param ch    the unicode character
+ *  @returns     whether the character is a white space character.
+ *
+ *  @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
+ *
+ */
+CC_DLL bool isUnicodeSpace(char16_t ch);
+
+/**
+ *  @brief Whether the character is a Chinese/Japanese/Korean character.
+ *
+ *  @param ch    the unicode character
+ *  @returns     whether the character is a Chinese character.
+ *
+ *  @see http://www.searchtb.com/2012/04/chinese_encode.html
+ *  @see http://tieba.baidu.com/p/748765987
+ *
+ */
+CC_DLL bool isCJKUnicode(char16_t ch);
+
+/**
+ *  @brief Returns the length of the string in characters.
+ *
+ *  @param utf8 an UTF-8 encoded string.
+ *  @returns the length of the string in characters
+ */
+CC_DLL long getCharacterCountInUTF8String(const std::string& utf8);
+
+/**
+ *  @brief Gets the index of the last character that is not equal to the character given.
+ *
+ *  @param str   the string to be searched.
+ *  @param c     the character to be searched for.
+ *
+ *  @returns the index of the last character that is not \p c.
+ *
+ */
+CC_DLL unsigned int getIndexOfLastNotChar16(const std::vector<char16_t>& str, char16_t c);
+
+/**
+ *  @brief Gets char16_t vector from a given utf16 string
+ */
+CC_DLL std::vector<char16_t> getChar16VectorFromUTF16String(const std::u16string& utf16);
+
+} // namespace StringUtils {
+
+
 CC_DLL int cc_wcslen(const unsigned short* str);

-CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);
+/** @deprecated Use StringUtils::trimUTF16Vector instead. CC_DLL is kept so the symbol stays exported. */
+CC_DEPRECATED_ATTRIBUTE CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);

 /**
 * Whether the character is a whitespace character.
@ -39,7 +135,7 @@ CC_DLL void cc_utf8_trim_ws(std::vector<unsigned short>* str);
 *
 * @see http://en.wikipedia.org/wiki/Whitespace_character#Unicode
 * */
-CC_DLL bool isspace_unicode(unsigned short ch);
+// Deprecated in favour of StringUtils::isUnicodeSpace; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL bool isspace_unicode(unsigned short ch);

 /**
 * Whether the character is a Chinese/Japanese/Korean character.
@ -50,7 +146,7 @@ CC_DLL bool isspace_unicode(unsigned short ch);
 * @see http://www.searchtb.com/2012/04/chinese_encode.html
 * @see http://tieba.baidu.com/p/748765987
 * */
-CC_DLL bool iscjk_unicode(unsigned short ch);
+// Deprecated in favour of StringUtils::isCJKUnicode; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL bool iscjk_unicode(unsigned short ch);

 /**
 * Returns the length of the string in characters.
@ -62,7 +158,7 @@ CC_DLL bool iscjk_unicode(unsigned short ch);
 *
 * @returns the length of the string in characters
 **/
-CC_DLL long
+// Deprecated in favour of StringUtils::getCharacterCountInUTF8String; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL long
 cc_utf8_strlen (const char * p, int max);

 /**
@ -73,9 +169,9 @@ cc_utf8_strlen (const char * p, int max);
 *
 * @returns the index of the last character that is not \p c.
 * */
-CC_DLL unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c);
+// Deprecated in favour of StringUtils::getIndexOfLastNotChar16; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL unsigned int cc_utf8_find_last_not_char(const std::vector<unsigned short>& str, unsigned short c);

-CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str);
+// Deprecated; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str);

 /**
 * Creates a utf8 string from a cstring.
@ -84,7 +180,7 @@ CC_DLL std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned sh
 *
 * @returns the newly created utf8 string.
 * */
-CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr);
+// Deprecated in favour of StringUtils::UTF8ToUTF16; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, int* rUtf16Size = nullptr);

 /**
 * Convert a string from UTF-16 to UTF-8. The result will be null terminated.
@ -103,12 +199,13 @@ CC_DLL unsigned short* cc_utf8_to_utf16(const char* str_old, int length = -1, in
 * @returns a pointer to a newly allocated UTF-8 string. This value must be
 *          freed with free(). If an error occurs, %nullptr will be returned.
 **/
-CC_DLL char *
+// Deprecated in favour of StringUtils::UTF16ToUTF8; CC_DLL is kept so the symbol stays exported.
+CC_DEPRECATED_ATTRIBUTE CC_DLL char *
 cc_utf16_to_utf8 (const unsigned short  *str,
                   int             len,
                   long            *items_read,
                   long            *items_written);

+
 NS_CC_END

 #endif /* defined(__cocos2dx__ccUTF8__) */