issue #4660: Adds ConvertUTFWrapper.cpp, using llvm::ConvertUTF8toWide and llvm::convertUTF16ToUTF8String

This commit is contained in:
James Chen 2014-05-08 16:58:16 +08:00
parent 5fda13639d
commit a61b1097db
5 changed files with 269 additions and 55 deletions

View File

@ -98,6 +98,10 @@
1A12775A18DFCC4F0005F345 /* CCTweenFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 2986667918B1B079000E39CA /* CCTweenFunction.h */; };
1A12775B18DFCC540005F345 /* CCTweenFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 2986667918B1B079000E39CA /* CCTweenFunction.h */; };
1A12775C18DFCC590005F345 /* CCTweenFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2986667818B1B079000E39CA /* CCTweenFunction.cpp */; };
1A1645B0191B726C008C7C7F /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = 1A1645AE191B726C008C7C7F /* ConvertUTF.c */; };
1A1645B1191B726C008C7C7F /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = 1A1645AE191B726C008C7C7F /* ConvertUTF.c */; };
1A1645B2191B726C008C7C7F /* ConvertUTFWrapper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1A1645AF191B726C008C7C7F /* ConvertUTFWrapper.cpp */; };
1A1645B3191B726C008C7C7F /* ConvertUTFWrapper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1A1645AF191B726C008C7C7F /* ConvertUTFWrapper.cpp */; };
1A570061180BC5A10088DEC7 /* CCAction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1A570047180BC5A10088DEC7 /* CCAction.cpp */; };
1A570062180BC5A10088DEC7 /* CCAction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1A570047180BC5A10088DEC7 /* CCAction.cpp */; };
1A570063180BC5A10088DEC7 /* CCAction.h in Headers */ = {isa = PBXBuildFile; fileRef = 1A570048180BC5A10088DEC7 /* CCAction.h */; };
@ -765,8 +769,6 @@
1ABA68AF1888D700007D1BB4 /* CCFontCharMap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ABA68AC1888D700007D1BB4 /* CCFontCharMap.cpp */; };
1ABA68B01888D700007D1BB4 /* CCFontCharMap.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ABA68AD1888D700007D1BB4 /* CCFontCharMap.h */; };
1ABA68B11888D700007D1BB4 /* CCFontCharMap.h in Headers */ = {isa = PBXBuildFile; fileRef = 1ABA68AD1888D700007D1BB4 /* CCFontCharMap.h */; };
1AC0269A1914068200FA920D /* ConvertUTF.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1AC026981914068200FA920D /* ConvertUTF.cpp */; };
1AC0269B1914068200FA920D /* ConvertUTF.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1AC026981914068200FA920D /* ConvertUTF.cpp */; };
1AC0269C1914068200FA920D /* ConvertUTF.h in Headers */ = {isa = PBXBuildFile; fileRef = 1AC026991914068200FA920D /* ConvertUTF.h */; };
1AC0269D1914068200FA920D /* ConvertUTF.h in Headers */ = {isa = PBXBuildFile; fileRef = 1AC026991914068200FA920D /* ConvertUTF.h */; };
1AD71DA9180E26E600808F54 /* CCBAnimationManager.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1AD71CFA180E26E600808F54 /* CCBAnimationManager.cpp */; };
@ -1878,6 +1880,8 @@
1A0DB7301823827C0025743D /* CCGL.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCGL.h; sourceTree = "<group>"; };
1A0DB7311823827C0025743D /* CCEAGLView.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCEAGLView.h; sourceTree = "<group>"; };
1A0DB7351823828F0025743D /* CCGL.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCGL.h; sourceTree = "<group>"; };
1A1645AE191B726C008C7C7F /* ConvertUTF.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = ConvertUTF.c; sourceTree = "<group>"; };
1A1645AF191B726C008C7C7F /* ConvertUTFWrapper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvertUTFWrapper.cpp; sourceTree = "<group>"; };
1A570047180BC5A10088DEC7 /* CCAction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CCAction.cpp; sourceTree = "<group>"; };
1A570048180BC5A10088DEC7 /* CCAction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCAction.h; sourceTree = "<group>"; };
1A570049180BC5A10088DEC7 /* CCActionCamera.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CCActionCamera.cpp; sourceTree = "<group>"; };
@ -2129,7 +2133,6 @@
1AAF584E180E40B9000584C8 /* LocalStorageAndroid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LocalStorageAndroid.cpp; sourceTree = "<group>"; };
1ABA68AC1888D700007D1BB4 /* CCFontCharMap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CCFontCharMap.cpp; sourceTree = "<group>"; };
1ABA68AD1888D700007D1BB4 /* CCFontCharMap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCFontCharMap.h; sourceTree = "<group>"; };
1AC026981914068200FA920D /* ConvertUTF.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvertUTF.cpp; sourceTree = "<group>"; };
1AC026991914068200FA920D /* ConvertUTF.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvertUTF.h; sourceTree = "<group>"; };
1AD71CFA180E26E600808F54 /* CCBAnimationManager.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CCBAnimationManager.cpp; sourceTree = "<group>"; };
1AD71CFB180E26E600808F54 /* CCBAnimationManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CCBAnimationManager.h; sourceTree = "<group>"; };
@ -3799,8 +3802,9 @@
1AC026971914068200FA920D /* ConvertUTF */ = {
isa = PBXGroup;
children = (
1AC026981914068200FA920D /* ConvertUTF.cpp */,
1A1645AE191B726C008C7C7F /* ConvertUTF.c */,
1AC026991914068200FA920D /* ConvertUTF.h */,
1A1645AF191B726C008C7C7F /* ConvertUTFWrapper.cpp */,
);
name = ConvertUTF;
path = ../external/ConvertUTF;
@ -6112,7 +6116,7 @@
2AC795DB1862870F005EC8E1 /* SkeletonBounds.cpp in Sources */,
2AC795DC1862870F005EC8E1 /* Event.cpp in Sources */,
1A01C68A18F57BE800EFE3A6 /* CCDeprecated.cpp in Sources */,
1AC0269A1914068200FA920D /* ConvertUTF.cpp in Sources */,
1A1645B0191B726C008C7C7F /* ConvertUTF.c in Sources */,
500DC93219106300007B91BF /* CCAutoreleasePool.cpp in Sources */,
2905FA5618CF08D100240AA3 /* UILayout.cpp in Sources */,
2AC795DD1862870F005EC8E1 /* EventData.cpp in Sources */,
@ -6214,6 +6218,7 @@
500DC9B619106E6D007B91BF /* TransformUtils.cpp in Sources */,
1A5701EE180BCB8C0088DEC7 /* CCTransitionProgress.cpp in Sources */,
1A5701F7180BCBAD0088DEC7 /* CCMenu.cpp in Sources */,
1A1645B2191B726C008C7C7F /* ConvertUTFWrapper.cpp in Sources */,
1A5701FB180BCBAD0088DEC7 /* CCMenuItem.cpp in Sources */,
1A570202180BCBD40088DEC7 /* CCClippingNode.cpp in Sources */,
06CAAACF186AD7FC0012A414 /* TriggerBase.cpp in Sources */,
@ -6823,13 +6828,14 @@
1A8C59EC180E930E00EF57C3 /* CCSkin.cpp in Sources */,
2905FA4718CF08D100240AA3 /* UIButton.cpp in Sources */,
1A8C59F0180E930E00EF57C3 /* CCSpriteFrameCacheHelper.cpp in Sources */,
1AC0269B1914068200FA920D /* ConvertUTF.cpp in Sources */,
1A1645B1191B726C008C7C7F /* ConvertUTF.c in Sources */,
B2AF2FA218EBAEAE00C5807C /* Vector2.cpp in Sources */,
500DC8D219105F7D007B91BF /* CCAffineTransform.cpp in Sources */,
1A8C59F4180E930E00EF57C3 /* CCSSceneReader.cpp in Sources */,
1A8C59F8180E930E00EF57C3 /* CCTransformHelp.cpp in Sources */,
1A8C59FC180E930E00EF57C3 /* CCTween.cpp in Sources */,
2905FA5318CF08D100240AA3 /* UIImageView.cpp in Sources */,
1A1645B3191B726C008C7C7F /* ConvertUTFWrapper.cpp in Sources */,
1A8C5A04180E930E00EF57C3 /* CCUtilMath.cpp in Sources */,
2905FA7518CF08D100240AA3 /* UIScrollView.cpp in Sources */,
1A8C5A0E180E930E00EF57C3 /* DictionaryHelper.cpp in Sources */,

View File

@ -125,17 +125,15 @@ bool UTF8ToUTF16(const std::string& utf8, std::u16string& outUtf16)
}
bool ret = false;
const size_t utf16Bytes = (utf8.length()+1) << 1;
const size_t utf16Bytes = (utf8.length()+1) * sizeof(char16_t);
char16_t* utf16 = (char16_t*)malloc(utf16Bytes);
memset(utf16, 0, utf16Bytes);
UTF16* utf16Start = (UTF16*)utf16;
UTF16* utf16End = ((UTF16*)utf16) + (utf8.length());
char* utf16ptr = reinterpret_cast<char*>(utf16);
const UTF8* error = nullptr;
const UTF8* utf8Start = (const UTF8*)utf8.data();
const UTF8* utf8End = ((const UTF8*)utf8.data()) + utf8.length();
if (conversionOK == ConvertUTF8toUTF16((const UTF8 **) &utf8Start, utf8End, &utf16Start, utf16End, strictConversion))
if (llvm::ConvertUTF8toWide(2, utf8, utf16ptr, error))
{
outUtf16 = utf16;
ret = true;
@ -154,26 +152,7 @@ bool UTF16ToUTF8(const std::u16string& utf16, std::string& outUtf8)
return true;
}
bool ret = false;
const size_t utf8Bytes = (utf16.length() << 2) + 1;
char* utf8 = (char*)malloc(utf8Bytes);
memset(utf8, 0, utf8Bytes);
UTF8 *utf8Start = (UTF8*)utf8;
UTF8 *utf8End = ((UTF8*)utf8) + (utf8Bytes -1);
const UTF16* utf16Start = (const UTF16*)utf16.data();
const UTF16* utf16End = ((const UTF16*)utf16.data()) + utf16.length();
if (conversionOK == ConvertUTF16toUTF8(&utf16Start, utf16End, &utf8Start, utf8End, strictConversion))
{
outUtf8 = utf8;
ret = true;
}
free(utf8);
return ret;
return llvm::convertUTF16ToUTF8String(utf16, outUtf8);
}
std::vector<char16_t> getUTF16VectorFromUTF16String(const std::u16string& str)

View File

@ -51,7 +51,6 @@
#ifdef CVTUTF_DEBUG
#include <stdio.h>
#endif
#include <string.h>
static const int halfShift = 10; /* used for shifting by 10 bits */
@ -401,6 +400,22 @@ unsigned getNumBytesForUTF8(UTF8 first) {
return trailingBytesForUTF8[first] + 1;
}
/*
 * Counts the encoded characters in the NUL-terminated UTF-8 buffer `utf8`.
 *
 * The buffer is walked one sequence at a time; the sequence length comes
 * from the trailingBytesForUTF8 table indexed by the lead byte.
 *
 * Returns the number of sequences, or 0 as soon as a sequence would run past
 * the terminating NUL or is rejected by isLegalUTF8. An empty string also
 * yields 0, so callers cannot tell "empty" from "invalid" by the result alone.
 */
int getUTF8StringLength(const UTF8* utf8)
{
    const UTF8* cursor = utf8;
    const UTF8* const limit = utf8 + strlen((const char*)utf8);
    int count = 0;

    while (cursor != limit) {
        const int seqLen = trailingBytesForUTF8[*cursor] + 1;

        /* Truncated sequence: fewer bytes remain than the lead byte asks for. */
        if (seqLen > limit - cursor)
            return 0;
        if (!isLegalUTF8(cursor, seqLen))
            return 0;

        cursor += seqLen;
        count++;
    }
    return count;
}
/* --------------------------------------------------------------------- */
/*
@ -417,21 +432,6 @@ Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
return true;
}
/*
 * Returns how many UTF-8 sequences the NUL-terminated buffer `utf8` holds.
 *
 * Each step reads the lead byte, looks up the sequence length in
 * trailingBytesForUTF8, and validates the sequence with isLegalUTF8.
 * The result is 0 when a sequence is truncated by the end of the string or
 * fails validation (and also for an empty string).
 */
int getUTF8StringLength(const UTF8* utf8)
{
    const UTF8* pos = utf8;
    const UTF8* const stop = utf8 + strlen((char*)utf8);
    int total = 0;

    for (; pos != stop; ++total) {
        const int bytes = trailingBytesForUTF8[*pos] + 1;
        const int fits = !(bytes > stop - pos);

        /* Bail out on a cut-off or malformed sequence. */
        if (!fits || !isLegalUTF8(pos, bytes))
            return 0;
        pos += bytes;
    }
    return total;
}
/* --------------------------------------------------------------------- */
ConversionResult ConvertUTF8toUTF16 (

View File

@ -87,13 +87,13 @@
------------------------------------------------------------------------ */
#ifndef CC_LLVM_SUPPORT_CONVERTUTF_H
#define CC_LLVM_SUPPORT_CONVERTUTF_H
#ifndef LLVM_SUPPORT_CONVERTUTF_H
#define LLVM_SUPPORT_CONVERTUTF_H
/* ---------------------------------------------------------------------
The following 4 definitions are compiler-specific.
The C standard does not guarantee that char16_t has at least
16 bits, so char16_t is no less portable than unsigned short!
The C standard does not guarantee that wchar_t has at least
16 bits, so wchar_t is no less portable than unsigned short!
All should be unsigned values to avoid sign extension during
bit mask & shift operations.
------------------------------------------------------------------------ */
@ -127,6 +127,10 @@ typedef enum {
lenientConversion
} ConversionFlags;
/* This is for C++ and does no harm in C */
#ifdef __cplusplus
extern "C" {
#endif
ConversionResult ConvertUTF8toUTF16 (
const UTF8** sourceStart, const UTF8* sourceEnd,
@ -157,10 +161,93 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd);
Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd);
unsigned getNumBytesForUTF8(UTF8 firstByte);
int getUTF8StringLength(const UTF8* utf8);
#ifdef __cplusplus
}
/*************************************************************************/
/* Below are LLVM-specific wrappers of the functions above. */
//#include "llvm/ADT/ArrayRef.h"
//#include "llvm/ADT/StringRef.h"
#include <vector>
#include <string>
namespace llvm {
/**
* Convert a UTF8 std::string to UTF8, UTF16, or UTF32 depending on
* WideCharWidth. The converted data is written to ResultPtr, which needs to
* point to at least WideCharWidth * (Source.size() + 1) bytes. On success,
* ResultPtr will point one after the end of the copied string. On failure,
* ResultPtr will not be changed, and ErrorPtr will be set to the location of
* the first character which could not be converted.
* \return true on success.
*/
bool ConvertUTF8toWide(unsigned WideCharWidth, const std::string& Source,
char *&ResultPtr, const UTF8 *&ErrorPtr);
/**
* Convert a Unicode code point to a UTF8 sequence.
*
* \param Source a Unicode code point.
* \param [in,out] ResultPtr pointer to the output buffer, needs to be at least
* \c UNI_MAX_UTF8_BYTES_PER_CODE_POINT bytes. On success \c ResultPtr is
* updated one past end of the converted sequence.
*
* \returns true on success.
*/
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr);
/**
 * Decode the leading UTF8 sequence of a source buffer into a single UTF32
 * code point.
 *
 * The sequence length is taken from getNumBytesForUTF8 applied to the first
 * byte; exactly that many bytes are then handed to ConvertUTF8toUTF32 with
 * room for one output code point.
 *
 * \param [in,out] source Pointer to the current read position. It is passed
 * through to ConvertUTF8toUTF32, which is expected to advance it past the
 * converted sequence on success.
 * \param sourceEnd Pointer just past the end of the source buffer.
 * \param [out] target Receives the converted code point.
 * \param flags Whether the conversion is strict or lenient.
 *
 * \returns sourceExhausted when the buffer is empty or shorter than the
 * sequence announced by its first byte; otherwise the result of
 * ConvertUTF8toUTF32 (conversionOK on success).
 *
 * \sa ConvertUTF8toUTF32
 */
static inline ConversionResult convertUTF8Sequence(const UTF8 **source,
                                                   const UTF8 *sourceEnd,
                                                   UTF32 *target,
                                                   ConversionFlags flags) {
  const UTF8 *const first = *source;
  if (first == sourceEnd)
    return sourceExhausted;

  const unsigned size = getNumBytesForUTF8(*first);
  const ptrdiff_t available = sourceEnd - first;
  if ((ptrdiff_t)size > available)
    return sourceExhausted;

  return ConvertUTF8toUTF32(source, first + size, &target, target + 1, flags);
}
/**
* Returns true if a blob of text starts with a UTF-16 big or little endian byte
* order mark.
*/
bool hasUTF16ByteOrderMark(const char* SrcBytes, size_t len);
/**
* Converts a std::u16string assumed to hold UTF16 text into a UTF8 std::string.
*
* \param [in] utf16 A string of what is assumed to be UTF-16 encoded text.
* \param [out] Out Converted UTF-8 is stored here on success.
* \returns true on success
*/
bool convertUTF16ToUTF8String(const std::u16string& utf16, std::string &Out);
} /* end namespace llvm */
#endif
/* --------------------------------------------------------------------- */
#endif // CC_LLVM_SUPPORT_CONVERTUTF_H
#endif

View File

@ -0,0 +1,142 @@
//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "ConvertUTF.h"
//#include "llvm/Support/SwapByteOrder.h"
#include <string>
#include <vector>
#include <assert.h>
namespace llvm {
/// Converts the UTF-8 text in \p Source to UTF-8, UTF-16 or UTF-32, chosen by
/// \p WideCharWidth.
///
/// \param WideCharWidth Size in bytes of one output code unit; must be 1, 2
///        or 4 (asserted).
/// \param Source UTF-8 input.
/// \param [in,out] ResultPtr Output buffer. The caller must provide at least
///        WideCharWidth * (Source.size() + 1) bytes. On success it is advanced
///        to one past the last code unit written; on failure it is unchanged.
/// \param [out] ErrorPtr Written only on failure: the source position at
///        which the underlying validator/converter stopped.
/// \returns true when the conversion result is conversionOK.
bool ConvertUTF8toWide(unsigned WideCharWidth, const std::string& Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
  assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
  ConversionResult result = conversionOK;
  // Copy the character span over.
  if (WideCharWidth == 1) {
    // UTF-8 to UTF-8: validate, then copy the bytes verbatim.
    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.data());
    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.data() + Source.length()))) {
      result = sourceIllegal;
      ErrorPtr = Pos;
    } else {
      memcpy(ResultPtr, Source.data(), Source.size());
      ResultPtr += Source.size();
    }
  } else if (WideCharWidth == 2) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    // targetStart is a UTF16*, so the end is expressed in code units, not
    // bytes. A UTF-8 sequence of k bytes yields at most k UTF-16 units, so
    // Source.size() units always suffice and stay inside the buffer the
    // caller is required to provide. (The previous bound of 2*Source.size()
    // units claimed twice the guaranteed capacity.)
    result = ConvertUTF8toUTF16(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  } else if (WideCharWidth == 4) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    // Same reasoning as above: every UTF-8 sequence yields one UTF-32 unit,
    // so Source.size() units is a safe upper bound. (The previous bound of
    // 4*Source.size() units claimed four times the guaranteed capacity.)
    result = ConvertUTF8toUTF32(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  }
  // With a correctly sized buffer the converters can never run out of room.
  assert((result != targetExhausted)
         && "ConvertUTF8toUTFXX exhausted target buffer");
  return result == conversionOK;
}
bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
const UTF32 *SourceStart = &Source;
const UTF32 *SourceEnd = SourceStart + 1;
UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
UTF8 *TargetEnd = TargetStart + 4;
ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
&TargetStart, TargetEnd,
strictConversion);
if (CR != conversionOK)
return false;
ResultPtr = reinterpret_cast<char*>(TargetStart);
return true;
}
/// Reports whether the buffer \p S of \p len bytes begins with the byte pair
/// FF FE or FE FF, i.e. a UTF-16 byte order mark in either byte order.
/// Buffers shorter than two bytes never match.
bool hasUTF16ByteOrderMark(const char* S, size_t len) {
  if (len < 2)
    return false;

  const bool LittleEndianMark = S[0] == '\xff' && S[1] == '\xfe';
  const bool BigEndianMark = S[0] == '\xfe' && S[1] == '\xff';
  return LittleEndianMark || BigEndianMark;
}
/// SwapByteOrder_16 - Returns \p value with its high and low bytes exchanged.
inline uint16_t SwapByteOrder_16(uint16_t value) {
#if defined(_MSC_VER) && !defined(_DEBUG)
  // The DLL version of the runtime lacks these functions (bug!?), but in a
  // release build they're replaced with BSWAP instructions anyway.
  return _byteswap_ushort(value);
#else
  // Integer promotion widens the shifts; the cast keeps only the low 16 bits.
  return static_cast<uint16_t>((value << 8) | (value >> 8));
#endif
}
bool convertUTF16ToUTF8String(const std::u16string& utf16, std::string &Out) {
assert(Out.empty());
// Avoid OOB by returning early on empty input.
if (utf16.empty())
return true;
const UTF16 *Src = reinterpret_cast<const UTF16 *>(utf16.data());
const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(utf16.data() + utf16.length());
// Byteswap if necessary.
std::vector<UTF16> ByteSwapped;
if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
for (size_t I = 0, E = ByteSwapped.size(); I != E; ++I)
ByteSwapped[I] = SwapByteOrder_16(ByteSwapped[I]);
Src = &ByteSwapped[0];
SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
}
// Skip the BOM for conversion.
if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
Src++;
// Just allocate enough space up front. We'll shrink it later.
Out.resize(utf16.length() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
UTF8 *DstEnd = Dst + Out.size();
ConversionResult CR =
ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
assert(CR != targetExhausted);
if (CR != conversionOK) {
Out.clear();
return false;
}
Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
return true;
}
} // end namespace llvm