axmol/cocos/2d/ccUTF8.cpp

/*
 * This file uses some implementations of gutf8.c in glib.
 *
 * gutf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 1999      Tom Tromey
 * Copyright (C) 2000      Red Hat, Inc.
 * Copyright (c) 2013-2014 Chukong Technologies Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "ccUTF8.h"
#include "2d/platform/CCCommon.h"
#include "base/CCConsole.h"

NS_CC_BEGIN

/*
 * cc_wcslen:
 * @str: pointer to a buffer of 16-bit code units terminated by a 0 unit.
 *       Must not be null.
 *
 * Return value: the number of code units that precede the terminator.
 */
int cc_wcslen(const unsigned short* str)
{
    const unsigned short* cursor = str;
    while (*cursor != 0)
    {
        ++cursor;
    }
    return static_cast<int>(cursor - str);
}

/* Code from GLIB gutf8.c starts here. */

/*
 * UTF8_COMPUTE:
 * @Char: the lead byte of a UTF-8 sequence (cc_utf8_get_char passes an
 *        unsigned char).
 * @Mask: lvalue that receives the mask selecting the payload bits of the
 *        lead byte.
 * @Len:  lvalue that receives the sequence length in bytes (1-6), or -1
 *        when @Char does not match any lead-byte pattern.
 *
 * Expands to a bare if/else chain (it is not wrapped in do { } while (0)),
 * so it must be used as a complete statement. @Mask is left unmodified when
 * @Len is set to -1. The arguments are not parenthesised inside the macro,
 * so pass plain identifiers.
 */
#define UTF8_COMPUTE(Char, Mask, Len)        \
if (Char < 128)                \
{                        \
Len = 1;                    \
Mask = 0x7f;                \
}                        \
else if ((Char & 0xe0) == 0xc0)        \
{                        \
Len = 2;                    \
Mask = 0x1f;                \
}                        \
else if ((Char & 0xf0) == 0xe0)        \
{                        \
Len = 3;                    \
Mask = 0x0f;                \
}                        \
else if ((Char & 0xf8) == 0xf0)        \
{                        \
Len = 4;                    \
Mask = 0x07;                \
}                        \
else if ((Char & 0xfc) == 0xf8)        \
{                        \
Len = 5;                    \
Mask = 0x03;                \
}                        \
else if ((Char & 0xfe) == 0xfc)        \
{                        \
Len = 6;                    \
Mask = 0x01;                \
}                        \
else                        \
Len = -1;

/*
 * UTF8_LENGTH:
 * @Char: a character code.
 *
 * Evaluates to the number of bytes (1-6) used to encode @Char in UTF-8,
 * using the same thresholds as cc_unichar_to_utf8. @Char is evaluated more
 * than once, so it must not have side effects.
 */
#define UTF8_LENGTH(Char)            \
((Char) < 0x80 ? 1 :                \
((Char) < 0x800 ? 2 :            \
((Char) < 0x10000 ? 3 :            \
((Char) < 0x200000 ? 4 :            \
((Char) < 0x4000000 ? 5 : 6)))))


/*
 * UTF8_GET:
 * @Result: lvalue that receives the decoded value.
 * @Chars:  pointer to the first byte of the sequence.
 * @Count:  lvalue used as the loop index (clobbered).
 * @Mask:   payload mask for the lead byte, as produced by UTF8_COMPUTE.
 * @Len:    sequence length in bytes, as produced by UTF8_COMPUTE.
 *
 * Starts from the masked lead byte and appends the low six bits of each of
 * the following @Len - 1 bytes. If one of those bytes does not have the
 * 10xxxxxx continuation pattern, @Result is set to -1 and decoding stops at
 * that byte. Expands to two statements, so it must be used as a complete
 * statement and not as the body of an unbraced if/else.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)    \
(Result) = (Chars)[0] & (Mask);            \
for ((Count) = 1; (Count) < (Len); ++(Count))        \
{                            \
if (((Chars)[(Count)] & 0xc0) != 0x80)        \
{                        \
(Result) = -1;                \
break;                    \
}                        \
(Result) <<= 6;                    \
(Result) |= ((Chars)[(Count)] & 0x3f);        \
}

/*
 * UNICODE_VALID:
 * @Char: a character code.
 *
 * Evaluates to true when @Char is below 0x110000, is not in the surrogate
 * range 0xD800-0xDFFF, is not in 0xFDD0-0xFDEF, and does not have 0xFFFE or
 * 0xFFFF as its low 16 bits. @Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)            \
((Char) < 0x110000 &&                \
(((Char) & 0xFFFFF800) != 0xD800) &&        \
((Char) < 0xFDD0 || (Char) > 0xFDEF) &&    \
((Char) & 0xFFFE) != 0xFFFE)


/*
 * Byte-advance table indexed by the value of a byte taken as the start of a
 * UTF-8 sequence. Each entry is the number of bytes to step over:
 *   0x00-0xBF -> 1,  0xC0-0xDF -> 2,  0xE0-0xEF -> 3,
 *   0xF0-0xF7 -> 4,  0xF8-0xFB -> 5,  0xFC-0xFD -> 6,  0xFE-0xFF -> 1.
 */
static const char utf8_skip_data[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
    5, 5, 5, 6, 6, 1, 1
};

/* Pointer alias for utf8_skip_data; this is what cc_utf8_next_char indexes. */
static const char *const g_utf8_skip = utf8_skip_data;

/*
 * cc_utf8_next_char:
 * @p: pointer to the first byte of a UTF-8 sequence.
 *
 * Evaluates to a char* that is @p advanced by the skip-table entry for the
 * byte at @p. Only that one byte is inspected: following bytes are not
 * checked, so on malformed or truncated input the result can point beyond a
 * terminating NUL. The C-style cast drops any const qualifier of @p, and @p
 * is evaluated twice.
 */
#define cc_utf8_next_char(p) (char *)((p) + g_utf8_skip[*(unsigned char *)(p)])

/*
 * cc_utf8_find_last_not_char:
 * @str:    the UTF-16 code units to search through.
 * @c:      the code unit to skip over.
 *
 * Scans @str from its end towards its beginning.
 *
 * Return value: the index of the last element that differs from @c. When
 *               @str is empty or consists only of @c, the value -1
 *               converted to unsigned int is returned.
 * */
unsigned int cc_utf8_find_last_not_char(std::vector<unsigned short> str, unsigned short c)
{
    std::vector<unsigned short>::size_type pos = str.size();
    while (pos > 0)
    {
        --pos;
        if (str[pos] != c)
        {
            return static_cast<unsigned int>(pos);
        }
    }

    // Nothing other than c was found: same sentinel as the int -1 conversion.
    return static_cast<unsigned int>(-1);
}

/*
 * cc_utf8_trim_from:
 * @str:    the string to trim (modified in place).
 * @index:  the index of the first element to drop.
 *
 * Shortens @str so that only the elements in [0, index) remain. Does nothing
 * when @index is negative or not smaller than the current size.
 * */
static void cc_utf8_trim_from(std::vector<unsigned short>* str, int index)
{
    const int total = static_cast<int>(str->size());
    const bool in_range = (index >= 0) && (index < total);
    if (in_range)
    {
        // Dropping the tail [index, total) is the same as shrinking to index.
        str->resize(index);
    }
}

/*
 * isspace_unicode:
 * @ch: a UTF-16 code unit.
 *
 * Reference: http://en.wikipedia.org/wiki/Whitespace_character#Unicode
 *
 * Return value: whether @ch is one of the whitespace characters listed below.
 * */
bool isspace_unicode(unsigned short ch)
{
    // U+0009..U+000D: tab, line feed, vertical tab, form feed, carriage return.
    if (ch >= 0x0009 && ch <= 0x000D)
        return true;

    // U+2000..U+200A: the run of fixed-width spaces.
    if (ch >= 0x2000 && ch <= 0x200A)
        return true;

    switch (ch)
    {
        case 0x0020:
        case 0x0085:
        case 0x00A0:
        case 0x1680:
        case 0x2028:
        case 0x2029:
        case 0x202F:
        case 0x205F:
        case 0x3000:
            return true;
        default:
            return false;
    }
}

/*
 * iscjk_unicode:
 * @ch: a UTF-16 code unit.
 *
 * Return value: whether @ch falls in one of the CJK ranges listed in the
 *               table below (both bounds inclusive).
 * */
bool iscjk_unicode(unsigned short ch)
{
    struct CodeRange
    {
        unsigned short first;
        unsigned short last;
    };

    static const CodeRange kCjkRanges[] = {
        { 0x2E80, 0x2FDF },   // CJK Radicals Supplement & Kangxi Radicals
        { 0x2FF0, 0x30FF },   // Ideographic Description Characters, CJK Symbols and Punctuation & Japanese kana
        { 0x3100, 0x31BF },   // Bopomofo, Hangul Compatibility Jamo, Kanbun, Bopomofo Extended
        { 0x31C0, 0x4DFF },   // Other extensions
        { 0x4E00, 0x9FBF },   // CJK Unified Ideographs
        { 0xAC00, 0xD7AF },   // Hangul Syllables
        { 0xF900, 0xFAFF },   // CJK Compatibility Ideographs
        { 0xFE30, 0xFE4F },   // CJK Compatibility Forms
    };

    for (const CodeRange& range : kCjkRanges)
    {
        if (ch >= range.first && ch <= range.last)
            return true;
    }
    return false;
}

/*
 * cc_utf8_trim_ws:
 * @str: the string to trim (modified in place).
 *
 * Removes the run of whitespace characters (as classified by
 * isspace_unicode) at the end of @str. Leading and interior whitespace is
 * left alone; a string made up only of whitespace becomes empty.
 * */
void cc_utf8_trim_ws(std::vector<unsigned short>* str)
{
    const int count = static_cast<int>(str->size());
    if (count <= 0)
        return;

    // Walk backwards until the element just before `cut` is not whitespace.
    int cut = count;
    while (cut > 0 && isspace_unicode((*str)[cut - 1]))
    {
        --cut;
    }

    // Only touch the vector when at least one trailing element was whitespace.
    if (cut < count)
    {
        cc_utf8_trim_from(str, cut);
    }
}

/*
 * cc_utf8_strlen:
 * @p: pointer to the start of a UTF-8 encoded string.
 * @max: the maximum number of bytes to examine. If @max
 *       is less than 0, then the string is assumed to be
 *       null-terminated. If @max is 0, @p will not be examined and
 *       may be %nullptr.
 *
 * Counts characters by stepping from lead byte to lead byte with
 * cc_utf8_next_char; continuation bytes are not validated, so @p is
 * expected to hold well-formed UTF-8.
 *
 * When @max is positive, counting stops at the first 0 byte or once @max
 * bytes have been covered, whichever comes first. A character that only
 * partly fits inside the first @max bytes is not counted.
 *
 * Return value: the length of the string in characters; 0 when @p is
 *               %nullptr.
 **/
long
cc_utf8_strlen (const char * p, int max)
{
    long len = 0;
    const char *start = p;

    /* A null pointer or a zero byte budget both mean "nothing to count". */
    if (p == nullptr || max == 0)
    {
        return 0;
    }

    if (max < 0)
    {
        while (*p)
        {
            p = cc_utf8_next_char (p);
            ++len;
        }
        return len;
    }

    if (!*p)
        return 0;

    /* The count always trails the cursor by one character, so that the
     * last character can be checked for completeness below.
     */
    p = cc_utf8_next_char (p);

    while (p - start < max && *p)
    {
        ++len;
        p = cc_utf8_next_char (p);
    }

    /* only do the last len increment if we got a complete
     * char (don't count partial chars). The comparison has to be <= and
     * not ==: when the loop stopped at a terminating 0 byte before
     * reaching max, the last character is complete and must be counted.
     */
    if (p - start <= max)
        ++len;

    return len;
}

/*
 * cc_utf8_get_char:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Decodes the byte sequence starting at @p into a character code. The
 * length of the sequence (1 to 6 bytes) is taken from the bit pattern of
 * the first byte; every following byte must have the form 10xxxxxx.
 *
 * Return value: the decoded character, or (unsigned int) -1 when the first
 *               byte is not a valid lead byte or a following byte is not a
 *               continuation byte.
 **/
static unsigned int
cc_utf8_get_char (const char * p)
{
    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p);
    const unsigned int lead = bytes[0];

    int seq_len;
    unsigned int payload_mask;

    if (lead < 128)
    {
        seq_len = 1;
        payload_mask = 0x7f;
    }
    else if ((lead & 0xe0) == 0xc0)
    {
        seq_len = 2;
        payload_mask = 0x1f;
    }
    else if ((lead & 0xf0) == 0xe0)
    {
        seq_len = 3;
        payload_mask = 0x0f;
    }
    else if ((lead & 0xf8) == 0xf0)
    {
        seq_len = 4;
        payload_mask = 0x07;
    }
    else if ((lead & 0xfc) == 0xf8)
    {
        seq_len = 5;
        payload_mask = 0x03;
    }
    else if ((lead & 0xfe) == 0xfc)
    {
        seq_len = 6;
        payload_mask = 0x01;
    }
    else
    {
        /* A continuation byte, 0xFE or 0xFF cannot start a sequence. */
        return static_cast<unsigned int>(-1);
    }

    unsigned int code = lead & payload_mask;
    for (int idx = 1; idx < seq_len; ++idx)
    {
        const unsigned int next = bytes[idx];
        if ((next & 0xc0) != 0x80)
        {
            /* Stop at the first byte that is not a continuation byte. */
            return static_cast<unsigned int>(-1);
        }
        code = (code << 6) | (next & 0x3f);
    }

    return code;
}


/*
 * cc_utf8_to_utf16:
 * @str_old: the UTF-8 input. May be %nullptr, which yields an empty result.
 * @length: the maximum number of bytes of @str_old to use, or a negative
 *          value if @str_old is null-terminated.
 * @rUtf16Size: location to store the number of 16-bit units written (not
 *              counting the terminating 0), or %nullptr.
 *
 * Converts a UTF-8 string to UTF-16. Characters up to U+FFFF take one
 * 16-bit unit. Characters from U+10000 to U+10FFFF are written as a
 * high/low surrogate pair, which is the form cc_utf16_to_utf8 reads back.
 * Input that cannot be decoded to a value of at most 0x10FFFF is replaced
 * by U+FFFD.
 *
 * Return value: a newly allocated, 0-terminated array created with new[].
 *               The caller releases it with delete[].
 */
unsigned short* cc_utf8_to_utf16(const char* str_old, int length/* = -1 */, int* rUtf16Size/* = nullptr */)
{
    const long char_count = cc_utf8_strlen(str_old, length);

    // First pass: supplementary-plane characters need two 16-bit units, so
    // the output can be longer than the number of characters.
    long unit_count = 0;
    const char* cursor = str_old;
    for (long i = 0; i < char_count; ++i)
    {
        const unsigned int code_point = cc_utf8_get_char(cursor);
        unit_count += (code_point >= 0x10000 && code_point <= 0x10FFFF) ? 2 : 1;
        cursor = cc_utf8_next_char(cursor);
    }

    if (rUtf16Size != nullptr) {
        *rUtf16Size = static_cast<int>(unit_count);
    }

    unsigned short* str_new = new unsigned short[unit_count + 1];
    unsigned short* out = str_new;

    // Second pass: same walk as above, now writing the units.
    cursor = str_old;
    for (long i = 0; i < char_count; ++i)
    {
        const unsigned int code_point = cc_utf8_get_char(cursor);
        if (code_point < 0x10000)
        {
            *out++ = static_cast<unsigned short>(code_point);
        }
        else if (code_point <= 0x10FFFF)
        {
            // Split the 20 bits above 0x10000 into two 10-bit halves.
            const unsigned int offset = code_point - 0x10000;
            *out++ = static_cast<unsigned short>(0xD800 + (offset >> 10));
            *out++ = static_cast<unsigned short>(0xDC00 + (offset & 0x3FF));
        }
        else
        {
            // Decoder error value or a code beyond the Unicode range.
            *out++ = 0xFFFD;
        }
        cursor = cc_utf8_next_char(cursor);
    }
    *out = 0;

    return str_new;
}

/*
 * cc_utf16_vec_from_utf16_str:
 * @str: a 0-terminated buffer of UTF-16 code units. Must not be null.
 *
 * Return value: a vector holding a copy of the code units of @str, without
 *               the terminating 0.
 */
std::vector<unsigned short> cc_utf16_vec_from_utf16_str(const unsigned short* str)
{
    const int unit_count = cc_wcslen(str);

    // Copy [str, str + unit_count) in one go.
    return std::vector<unsigned short>(str, str + unit_count);
}

/**
 * cc_unichar_to_utf8:
 * @c: a ISO10646 character code
 * @outbuf: output buffer, must have at least 6 bytes of space.
 *       If %nullptr, only the length is computed and nothing is written.
 *
 * Encodes a single character as UTF-8. Values of 0x200000 and above are
 * written in the 5- and 6-byte forms. No terminating 0 byte is appended.
 *
 * Return value: number of bytes written (or that would be written when
 *               @outbuf is %nullptr).
 **/
int
cc_unichar_to_utf8 (unsigned int c,
                   char   *outbuf)
{
    int byte_count;
    unsigned int lead_bits;

    /* Pick the shortest form whose payload can hold c. */
    if (c >= 0x4000000)
    {
        byte_count = 6;
        lead_bits = 0xfc;
    }
    else if (c >= 0x200000)
    {
        byte_count = 5;
        lead_bits = 0xf8;
    }
    else if (c >= 0x10000)
    {
        byte_count = 4;
        lead_bits = 0xf0;
    }
    else if (c >= 0x800)
    {
        byte_count = 3;
        lead_bits = 0xe0;
    }
    else if (c >= 0x80)
    {
        byte_count = 2;
        lead_bits = 0xc0;
    }
    else
    {
        byte_count = 1;
        lead_bits = 0;
    }

    if (outbuf == nullptr)
        return byte_count;

    /* Fill the continuation bytes from the last one backwards, six bits
     * at a time; whatever is left over goes into the lead byte.
     */
    int pos = byte_count;
    while (--pos > 0)
    {
        outbuf[pos] = static_cast<char>((c & 0x3f) | 0x80);
        c >>= 6;
    }
    outbuf[0] = static_cast<char>(c | lead_bits);

    return byte_count;
}

/*
 * SURROGATE_VALUE:
 * @h: a high surrogate (0xD800-0xDBFF).
 * @l: a low surrogate (0xDC00-0xDFFF).
 *
 * Combines the pair into the character code it stands for:
 * (h - 0xD800) * 0x400 + (l - 0xDC00) + 0x10000.
 */
#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)

/**
 * cc_utf16_to_utf8:
 * @str: a UTF-16 encoded string
 * @len: the maximum length of @str to use, in 16-bit units. If @len < 0,
 *       then the string is terminated with a 0 character. Conversion also
 *       stops at the first 0 unit when @len is non-negative.
 * @items_read: location to store number of words read, or %nullptr.
 *              If %nullptr, a trailing high surrogate without its low
 *              surrogate is treated as an error. If non-null, such a
 *              trailing high surrogate is left out of the output. If an
 *              error occurs then the index of the invalid input is
 *              stored here.
 * @items_written: location to store number of bytes written, or %nullptr.
 *                 The value stored here does not include the trailing
 *                 0 byte. It is only stored when the conversion succeeds.
 *
 * Convert a string from UTF-16 to UTF-8. The result will be
 * terminated with a 0 byte.
 *
 * The work is done in two passes: the first validates the surrogate
 * structure and adds up the number of output bytes, the second writes them.
 * Errors are reported through CCLOGERROR.
 *
 * Return value: a pointer to a newly allocated UTF-8 string.
 *               The buffer is created with new char[], so it must be
 *               released with delete[] (not free()). If @str is null or
 *               an error occurs, %nullptr will be returned.
 **/
char *
cc_utf16_to_utf8 (const unsigned short  *str,
                 int             len,
                 long            *items_read,
                 long            *items_written)
{
    /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
     * are marked.
     */
    const unsigned short *in;
    char *out;
    char *result = nullptr;
    int n_bytes;
    unsigned int high_surrogate;

    /* Note: items_read is not touched on this early return. */
    if (str == 0) return nullptr;

    /* Pass 1: validate the input and compute the output size. */
    n_bytes = 0;
    in = str;
    high_surrogate = 0;
    while ((len < 0 || in - str < len) && *in)
    {
        unsigned short c = *in;
        unsigned int wc;

        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
            if (high_surrogate)
            {
                wc = SURROGATE_VALUE (high_surrogate, c);
                high_surrogate = 0;
            }
            else
            {
                /* Low surrogate with no high surrogate before it. */
                CCLOGERROR("Invalid sequence in conversion input");
                goto err_out;
            }
        }
        else
        {
            if (high_surrogate)
            {
                /* High surrogate followed by something other than a low surrogate. */
                CCLOGERROR("Invalid sequence in conversion input");
                goto err_out;
            }

            if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
            {
                /* Remember it and wait for the low half; contributes no bytes yet. */
                high_surrogate = c;
                goto next1;
            }
            else
                wc = c;
        }

        /********** DIFFERENT for UTF8/UCS4 **********/
        n_bytes += UTF8_LENGTH (wc);

    next1:
        in++;
    }

    /* A pending high surrogate here means the input ended in the middle of a pair. */
    if (high_surrogate && !items_read)
    {
        CCLOGERROR("Partial character sequence at end of input");
        goto err_out;
    }

    /* At this point, everything is valid, and we just need to convert
     */
    /********** DIFFERENT for UTF8/UCS4 **********/
    result = new char[n_bytes + 1];

    /* Pass 2: emit the bytes. The loop is bounded by the byte count from
     * pass 1, so it stops before any trailing unpaired high surrogate.
     */
    high_surrogate = 0;
    out = result;
    in = str;
    while (out < result + n_bytes)
    {
        unsigned short c = *in;
        unsigned int wc;

        if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
        {
            wc = SURROGATE_VALUE (high_surrogate, c);
            high_surrogate = 0;
        }
        else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
        {
            high_surrogate = c;
            goto next2;
        }
        else
            wc = c;

        /********** DIFFERENT for UTF8/UCS4 **********/
        out += cc_unichar_to_utf8 (wc, out);

    next2:
        in++;
    }

    /********** DIFFERENT for UTF8/UCS4 **********/
    *out = '\0';

    if (items_written)
    /********** DIFFERENT for UTF8/UCS4 **********/
        *items_written = out - result;

    /* Reached on success and on error; result is still nullptr on the error paths. */
err_out:
    if (items_read)
        *items_read = in - str;

    return result;
}

NS_CC_END