* Copyright 2010-2011, Oliver Tappe, zooey@hirschkaefer.de.
* Distributed under the terms of the MIT License.
*/
#include "ICUCtypeData.h"
#include <langinfo.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <unicode/uchar.h>
#include <unicode/uvernum.h>
#include <Debug.h>
#undef TRACE
#ifdef TRACE_CTYPE
# include <OS.h>
# define TRACE(x) debug_printf x
#else
# define TRACE(x) ;
#endif
U_NAMESPACE_USE
namespace BPrivate {
namespace Libroot {
// Passes the TLS key on to the base class. No data bridge is attached yet;
// that happens in Initialize().
ICUCtypeData::ICUCtypeData(pthread_key_t tlsKey)
	: inherited(tlsKey), fDataBridge(NULL)
{
}
// Nothing is released here; in particular fDataBridge is not freed.
ICUCtypeData::~ICUCtypeData() {}
// Remembers the given data bridge and publishes, through it, the addresses
// of this object's three lookup tables (character class info, to-lower map,
// to-upper map). Each published address points at element 128 of the
// respective table, not at its start, so the table can also be indexed with
// negative values down to -128.
void
ICUCtypeData::Initialize(LocaleCtypeDataBridge* dataBridge)
{
	fDataBridge = dataBridge;

	*dataBridge->addrOfClassInfoTable = fClassInfo + 128;
	*dataBridge->addrOfToLowerTable = fToLowerMap + 128;
	*dataBridge->addrOfToUpperTable = fToUpperMap + 128;
}
// Switches to the given locale and rebuilds the single-byte ctype tables.
//
// After the base class has accepted the locale, every byte value 0..255 is
// converted to Unicode with the locale's converter. For bytes that convert
// successfully, the class flags are derived from ICU's character properties
// and the lower-/uppercase mappings are converted back to the charset (a
// mapping is only adopted if that back-conversion succeeds with a one-byte
// capacity). Bytes that do not convert get no class flags and map to
// themselves.
//
// Returns B_OK, or the error of inherited::SetTo() / _GetConverter().
status_t
ICUCtypeData::SetTo(const Locale& locale, const char* posixLocaleName)
{
	status_t status = inherited::SetTo(locale, posixLocaleName);
	if (status != B_OK)
		return status;

	UConverter* converter;
	status = _GetConverter(converter);
	if (status != B_OK)
		return status;

	ucnv_reset(converter);

	fDataBridge->setMbCurMax(ucnv_getMaxCharSize(converter));

	char bytes[] = { 0, 0 };
	for (int code = 0; code < 256; ++code) {
		bytes[0] = (char)code;
		bytes[1] = '\0';
		const char* cursor = bytes;

		// try to interpret this single byte as a complete character
		UErrorCode icuStatus = U_ZERO_ERROR;
		UChar32 unicodeChar
			= ucnv_getNextUChar(converter, &cursor, cursor + 1, &icuStatus);

		unsigned short flags = 0;
		unsigned int lowerCode = code;
		unsigned int upperCode = code;
		if (U_SUCCESS(icuStatus)) {
			// letters and digits
			if (u_isUAlphabetic(unicodeChar))
				flags |= _ISalpha;
			if (u_isUUppercase(unicodeChar))
				flags |= _ISupper;
			if (u_isULowercase(unicodeChar))
				flags |= _ISlower;
			if (u_isdigit(unicodeChar))
				flags |= _ISdigit;
			if (u_isxdigit(unicodeChar))
				flags |= _ISxdigit;
			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_ALNUM))
				flags |= _ISalnum;

			// spacing, control and punctuation
			if (u_isblank(unicodeChar))
				flags |= _ISblank;
			if (u_isUWhiteSpace(unicodeChar))
				flags |= _ISspace;
			if (u_charType(unicodeChar) == U_CONTROL_CHAR)
				flags |= _IScntrl;
			if (u_ispunct(unicodeChar))
				flags |= _ISpunct;

			// visibility
			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_PRINT))
				flags |= _ISprint;
			if (u_hasBinaryProperty(unicodeChar, UCHAR_POSIX_GRAPH))
				flags |= _ISgraph;

			// map the lowercase variant back into the charset
			UChar lowerChar = u_tolower(unicodeChar);
			icuStatus = U_ZERO_ERROR;
			ucnv_fromUChars(converter, bytes, 1, &lowerChar, 1, &icuStatus);
			if (U_SUCCESS(icuStatus))
				lowerCode = (unsigned char)bytes[0];

			// ... and likewise the uppercase variant
			UChar upperChar = u_toupper(unicodeChar);
			icuStatus = U_ZERO_ERROR;
			ucnv_fromUChars(converter, bytes, 1, &upperChar, 1, &icuStatus);
			if (U_SUCCESS(icuStatus))
				upperCode = (unsigned char)bytes[0];
		}

		// the entry for byte value 0 lives at table index 128
		const int slot = code + 128;
		fClassInfo[slot] = flags;
		fToLowerMap[slot] = lowerCode;
		fToUpperMap[slot] = upperCode;

		if (code >= 128 && code < 255) {
			// duplicate the upper half into the first 127 table entries;
			// byte value 255 is deliberately left out
			const int mirroredSlot = code - 128;
			fClassInfo[mirroredSlot] = flags;
			fToLowerMap[mirroredSlot] = lowerCode;
			fToUpperMap[mirroredSlot] = upperCode;
		}
	}

	return B_OK;
}
// Switches to the POSIX locale: once the base class has done so
// successfully, all three lookup tables are overwritten with the POSIX
// tables provided by the data bridge and the maximum multibyte character
// size is set to 1. Returns whatever inherited::SetToPosix() returned.
status_t
ICUCtypeData::SetToPosix()
{
	const status_t status = inherited::SetToPosix();
	if (status != B_OK)
		return status;

	memcpy(fClassInfo, fDataBridge->posixClassInfo, sizeof(fClassInfo));
	memcpy(fToLowerMap, fDataBridge->posixToLowerMap, sizeof(fToLowerMap));
	memcpy(fToUpperMap, fDataBridge->posixToUpperMap, sizeof(fToUpperMap));

	fDataBridge->setMbCurMax(1);

	return status;
}
// Tells whether the wide character \a wc belongs to the character class
// \a charClass (one of the _IS* constants), judged by ICU's Unicode
// character properties. Returns non-zero if it does; returns 0 if it does
// not, if \a wc is WEOF, or if \a charClass is not a known class.
int
ICUCtypeData::IsWCType(wint_t wc, wctype_t charClass)
{
	if (wc == WEOF)
		return 0;

	int isMember = 0;
	switch (charClass) {
		case _ISalnum:
			isMember = u_hasBinaryProperty(wc, UCHAR_POSIX_ALNUM);
			break;
		case _ISalpha:
			isMember = u_isUAlphabetic(wc);
			break;
		case _ISblank:
			isMember = u_isblank(wc);
			break;
		case _IScntrl:
			isMember = u_charType(wc) == U_CONTROL_CHAR;
			break;
		case _ISdigit:
			isMember = u_isdigit(wc);
			break;
		case _ISgraph:
			isMember = u_hasBinaryProperty(wc, UCHAR_POSIX_GRAPH);
			break;
		case _ISlower:
			isMember = u_isULowercase(wc);
			break;
		case _ISprint:
			isMember = u_hasBinaryProperty(wc, UCHAR_POSIX_PRINT);
			break;
		case _ISpunct:
			isMember = u_ispunct(wc);
			break;
		case _ISspace:
			isMember = u_isUWhiteSpace(wc);
			break;
		case _ISupper:
			isMember = u_isUUppercase(wc);
			break;
		case _ISxdigit:
			isMember = u_isxdigit(wc);
			break;
		default:
			// unknown class: nothing belongs to it
			break;
	}

	return isMember;
}
// Applies the case mapping selected by \a transition to \a wc and stores the
// mapped character in \a result: _ISlower maps to lowercase, _ISupper to
// uppercase. Any other \a transition yields B_BAD_VALUE and leaves \a result
// untouched.
status_t
ICUCtypeData::ToWCTrans(wint_t wc, wctrans_t transition, wint_t& result)
{
	status_t status = B_OK;

	switch (transition) {
		case _ISlower:
			result = u_tolower(wc);
			break;
		case _ISupper:
			result = u_toupper(wc);
			break;
		default:
			status = B_BAD_VALUE;
			break;
	}

	return status;
}
// Converts the next multibyte character found in \a mb (looking at no more
// than \a mbLen bytes) into a wide character, using the converter that
// belongs to \a mbState.
//
// Returns
//	- B_OK if a character was converted. It is stored in \a wcOut (unless
//	  that is NULL) and \a lengthOut is set to the number of bytes consumed,
//	  or to 0 if the character is the terminating NUL, in which case
//	  \a mbState is reset as well.
//	- B_BAD_INDEX if the input ended before a character was complete; the
//	  number of bytes consumed so far is recorded in mbState->count.
//	- B_BAD_DATA if the converter reported an error.
//	- the error of _GetConverterForMbState(), if no converter is available.
// \a lengthOut is only written when B_OK is returned.
status_t
ICUCtypeData::MultibyteToWchar(wchar_t* wcOut, const char* mb, size_t mbLen,
	mbstate_t* mbState, size_t& lengthOut)
{
	UConverter* converter = NULL;
	status_t result = _GetConverterForMbState(mbState, converter);
	if (result != B_OK) {
		TRACE(("MultibyteToWchar(): couldn't get converter for mbstate %p - "
			"%" B_PRIx32 "\n", mbState, result));
		return result;
	}

	// first attempt: ask for a single UTF-16 unit only
	UErrorCode icuStatus = U_ZERO_ERROR;

	const char* buffer = mb;
	UChar targetBuffer[3];
	UChar* target = targetBuffer;
	ucnv_toUnicode(converter, &target, target + 1, &buffer, buffer + mbLen,
		NULL, FALSE, &icuStatus);
	size_t sourceLengthUsed = buffer - mb;
	size_t targetLengthUsed = (size_t)(target - targetBuffer);

	// Only look at targetBuffer[0] if the converter actually wrote it.
	// Otherwise we would inspect an uninitialized value and, should it happen
	// to look like a lead surrogate, discard a genuine conversion error by
	// resetting icuStatus below.
	if (targetLengthUsed > 0 && U16_IS_LEAD(targetBuffer[0])) {
		// the character is a surrogate pair, so continue converting with
		// room for the second unit
		TRACE(("MultibyteToWchar(): have a surrogate pair\n"));
		icuStatus = U_ZERO_ERROR;
		ucnv_toUnicode(converter, &target, target + 2 - targetLengthUsed,
			&buffer, buffer + mbLen - sourceLengthUsed,
			NULL, FALSE, &icuStatus);
		sourceLengthUsed = buffer - mb;
		targetLengthUsed = (size_t)(target - targetBuffer);
	}

	if (icuStatus == U_BUFFER_OVERFLOW_ERROR && targetLengthUsed > 0) {
		// we got what we asked for; that there is more input than fits into
		// our small target buffer is expected
		icuStatus = U_ZERO_ERROR;
	}

	if (!U_SUCCESS(icuStatus)) {
		// conversion failed
		TRACE(("MultibyteToWchar(): illegal character sequence\n"));
		ucnv_resetToUnicode(converter);
		result = B_BAD_DATA;
	} else if (targetLengthUsed == 0) {
		// no error, but no output either: the character is incomplete
		TRACE(("MultibyteToWchar(): incomplete character (len=%lu)\n", mbLen));
		for (size_t i = 0; i < mbLen; ++i)
			TRACE(("\tbyte %lu: %x\n", i, mb[i]));
		mbState->count = sourceLengthUsed;
		result = B_BAD_INDEX;
	} else {
		// assemble the code point from the one or two UTF-16 units
		UChar32 unicodeChar = 0xBADBEEF;
		U16_GET(targetBuffer, 0, 0, targetLengthUsed, unicodeChar);

		if (unicodeChar == 0) {
			// terminating NUL: go back to the initial conversion state
			_DropConverterFromMbState(mbState);
			memset(mbState, 0, sizeof(mbstate_t));
			lengthOut = 0;
		} else {
			mbState->count = 0;
			lengthOut = sourceLengthUsed;
		}

		if (wcOut != NULL)
			*wcOut = unicodeChar;

		result = B_OK;
	}

	return result;
}
// Converts the multibyte string at \a *mbSource (looking at no more than
// \a mbSourceLength bytes) into wide characters, storing at most
// \a wcDestLength of them in \a wcDest. If \a wcDest is NULL, nothing is
// stored and the number of output characters is not limited.
//
// \a lengthOut receives the number of wide characters converted, not
// counting a terminating NUL. \a *mbSource is only updated if \a wcDest is
// not NULL:
//	- on error (B_BAD_DATA) it is moved past the last input byte that was
//	  converted successfully,
//	- if the terminating NUL was converted, it is set to NULL and \a mbState
//	  is reset,
//	- otherwise it is moved past the input that was consumed.
//
// Returns B_OK, B_BAD_DATA, or the error of _GetConverterForMbState().
status_t
ICUCtypeData::MultibyteStringToWchar(wchar_t* wcDest, size_t wcDestLength,
	const char** mbSource, size_t mbSourceLength, mbstate_t* mbState,
	size_t& lengthOut)
{
	UConverter* converter = NULL;
	status_t result = _GetConverterForMbState(mbState, converter);
	if (result != B_OK) {
		TRACE(("MultibyteStringToWchar(): couldn't get converter for mbstate %p"
			" - %" B_PRIx32 "\n", mbState, result));
		return result;
	}

	// wcDest is advanced while converting, so note up front whether the
	// caller wants any output at all
	const bool storeOutput = wcDest != NULL;
	if (!storeOutput) {
		// without a destination buffer there is no length limit either
		wcDestLength = (size_t)-1;
	}

	const char* cursor = *mbSource;
	const char* inputEnd = cursor + mbSourceLength;
	if (inputEnd < cursor) {
		// the end address wrapped around, clamp it to the highest address
		inputEnd = (const char*)-1;
	}

	bool reachedTerminator = false;
	UErrorCode icuStatus = U_ZERO_ERROR;
	size_t bytesConsumed = 0;
	for (lengthOut = 0;
			lengthOut < wcDestLength && bytesConsumed < mbSourceLength;
			++lengthOut) {
		// offer the converter no more than one character's worth of input
		UChar32 unicodeChar = ucnv_getNextUChar(converter, &cursor,
			std::min(cursor + MB_CUR_MAX, inputEnd), &icuStatus);
		TRACE(("MultibyteStringToWchar() l:%lu wl:%lu s:%p se:%p sl:%lu slu:%lu"
			" uchar:%x st:%x\n", lengthOut, wcDestLength, cursor, inputEnd,
			mbSourceLength, bytesConsumed, unicodeChar, icuStatus));
		if (!U_SUCCESS(icuStatus))
			break;

		bytesConsumed = cursor - *mbSource;
		if (storeOutput)
			*wcDest++ = unicodeChar;

		if (unicodeChar == L'\0') {
			// the terminator is stored, but not counted in lengthOut
			reachedTerminator = true;
			break;
		}

		icuStatus = U_ZERO_ERROR;
	}

	if (!U_SUCCESS(icuStatus)) {
		// conversion failed
		TRACE(("MultibyteStringToWchar(): illegal character sequence\n"));
		ucnv_resetToUnicode(converter);
		result = B_BAD_DATA;
		if (storeOutput)
			*mbSource = *mbSource + bytesConsumed;
	} else if (reachedTerminator) {
		// whole string converted: go back to the initial conversion state
		_DropConverterFromMbState(mbState);
		memset(mbState, 0, sizeof(mbstate_t));
		if (storeOutput)
			*mbSource = NULL;
	} else {
		mbState->count = 0;
		if (storeOutput)
			*mbSource = cursor;
	}

	return result;
}
// Converts the wide character \a wc into its multibyte representation.
//
// If \a mbOut is not NULL, up to MB_CUR_MAX bytes are written to it. If it
// is NULL, nothing is written and the resulting buffer overflow is ignored.
// In both cases \a lengthOut receives the length reported by the converter.
// Converting the NUL character resets \a mbState.
//
// Returns B_OK on success, B_BAD_VALUE if ICU rejects the arguments,
// B_BAD_DATA for any other conversion failure, or the error of
// _GetConverterForMbState().
status_t
ICUCtypeData::WcharToMultibyte(char* mbOut, wchar_t wc, mbstate_t* mbState,
	size_t& lengthOut)
{
	UConverter* converter = NULL;
	status_t result = _GetConverterForMbState(mbState, converter);
	if (result != B_OK) {
		TRACE(("WcharToMultibyte(): couldn't get converter for mbstate %p - "
			"%" B_PRIx32 "\n", mbState, result));
		return result;
	}

	// ICU wants UTF-16 input: one unit for BMP characters, a surrogate pair
	// for everything else
	UChar utf16[2];
	size_t utf16Length = 1;
	if (U_IS_BMP(wc)) {
		utf16[0] = wc;
	} else {
		utf16[0] = U16_LEAD(wc);
		utf16[1] = U16_TRAIL(wc);
		utf16Length = 2;
	}

	const bool haveOutput = mbOut != NULL;
	size_t capacity = haveOutput ? MB_CUR_MAX : 0;

	UErrorCode icuStatus = U_ZERO_ERROR;
	lengthOut = ucnv_fromUChars(converter, mbOut, capacity, utf16,
		utf16Length, &icuStatus);
	TRACE(("WcharToMultibyte() l:%lu mb:%p ml:%lu uchar:%x st:%x\n", lengthOut,
		mbOut, capacity, wc, icuStatus));

	if (!haveOutput && icuStatus == U_BUFFER_OVERFLOW_ERROR) {
		// with no output buffer an overflow is unavoidable and harmless
		icuStatus = U_ZERO_ERROR;
	}

	if (U_SUCCESS(icuStatus)) {
		if (wc == 0) {
			// terminating NUL: go back to the initial conversion state
			_DropConverterFromMbState(mbState);
			memset(mbState, 0, sizeof(mbstate_t));
		}
		return B_OK;
	}

	if (icuStatus == U_ILLEGAL_ARGUMENT_ERROR) {
		TRACE(("WcharToMultibyte(): bad converter\n"));
		return B_BAD_VALUE;
	}

	TRACE(("WcharToMultibyte(): illegal character sequence\n"));
	ucnv_resetFromUnicode(converter);
	return B_BAD_DATA;
}
// Converts up to \a wcSourceLength wide characters from \a *wcSource into
// their multibyte representation, storing at most \a mbDestLength bytes in
// \a mbDest. If \a mbDest is NULL, nothing is stored and only the required
// length is computed.
//
// \a lengthOut receives the number of bytes produced, not counting a
// terminating NUL. A character is either stored completely or not at all:
// conversion stops in front of the first character that no longer fits.
// \a *wcSource is only updated if \a mbDest is not NULL:
//	- on error (B_BAD_DATA) it points to the character that failed,
//	- if the terminating NUL was converted, it is set to NULL and \a mbState
//	  is reset,
//	- otherwise it points to the first character that was not converted.
//
// Returns B_OK, B_BAD_DATA, or the error of _GetConverterForMbState().
status_t
ICUCtypeData::WcharStringToMultibyte(char* mbDest, size_t mbDestLength,
	const wchar_t** wcSource, size_t wcSourceLength, mbstate_t* mbState,
	size_t& lengthOut)
{
	UConverter* converter = NULL;
	status_t result = _GetConverterForMbState(mbState, converter);
	if (result != B_OK) {
		TRACE(("WcharStringToMultibyte(): couldn't get converter for mbstate %p"
			" - %" B_PRIx32 "\n", mbState, result));
		return result;
	}

	// Each character is first converted into a local staging buffer and only
	// copied to mbDest once it is known to fit. The buffer is sized with ICU's
	// own worst-case formula for the (at most) two UTF-16 units of one
	// character. MB_CUR_MAX alone is not enough: ICU's maximum character size
	// is per UTF-16 unit, and ucnv_fromUChars() additionally appends a NUL
	// whenever the result is shorter than the capacity it was told about.
	const size_t stagingSize = UCNV_GET_MAX_BYTES_FOR_STRING(2, MB_CUR_MAX);
	char staging[stagingSize];

	bool mbsIsTerminated = false;
	const UChar32* source = (UChar32*)*wcSource;

	UErrorCode icuStatus = U_ZERO_ERROR;
	lengthOut = 0;
	size_t sourceLengthUsed = 0;
	for (; sourceLengthUsed < wcSourceLength; ++sourceLengthUsed, ++source) {
		if (mbDest != NULL && lengthOut >= mbDestLength)
			break;

		// convert input from UTF-32 to UTF-16
		UChar ucharBuffer[2];
		size_t ucharLength;
		if (U_IS_BMP(*source)) {
			ucharBuffer[0] = *source;
			ucharLength = 1;
		} else {
			ucharBuffer[0] = U16_LEAD(*source);
			ucharBuffer[1] = U16_TRAIL(*source);
			ucharLength = 2;
		}

		// Do the actual conversion. The capacity handed to ICU must never
		// exceed the staging buffer, no matter how much room the caller's
		// buffer still has. Without a destination the capacity is 0, which
		// makes ICU merely compute the required length.
		size_t destLength = mbDest == NULL ? 0 : mbDestLength - lengthOut;
		size_t capacity = std::min(destLength, stagingSize);
		size_t mbLength = ucnv_fromUChars(converter,
			mbDest == NULL ? NULL : staging, capacity, ucharBuffer,
			ucharLength, &icuStatus);
		TRACE(("WcharStringToMultibyte() l:%lu mb:%p ml:%lu s:%p ul:%lu slu:%lu"
			" uchar:%x st:%x\n", mbLength, mbDest, destLength, source,
			ucharLength, sourceLengthUsed, *source, icuStatus));

		if (icuStatus == U_BUFFER_OVERFLOW_ERROR) {
			// without a destination buffer the overflow is expected and
			// ignored ...
			icuStatus = U_ZERO_ERROR;
			if (destLength > 0) {
				// ... otherwise the character does not fit anymore, stop
				// without storing any part of it
				break;
			}
		} else if (mbDest != NULL)
			memcpy(mbDest, staging, mbLength);

		if (!U_SUCCESS(icuStatus))
			break;
		if (mbDest != NULL)
			mbDest += mbLength;
		if (*source == L'\0') {
			// the terminator is stored, but not counted in lengthOut
			mbsIsTerminated = true;
			break;
		}
		lengthOut += mbLength;
		icuStatus = U_ZERO_ERROR;
	}

	if (!U_SUCCESS(icuStatus)) {
		// conversion failed
		TRACE(("WcharStringToMultibyte(): illegal character sequence\n"));
		ucnv_resetFromUnicode(converter);
		result = B_BAD_DATA;
		if (mbDest != NULL)
			*wcSource = *wcSource + sourceLengthUsed;
	} else if (mbsIsTerminated) {
		// whole string converted: go back to the initial conversion state
		_DropConverterFromMbState(mbState);
		memset(mbState, 0, sizeof(mbstate_t));
		if (mbDest != NULL)
			*wcSource = NULL;
	} else {
		mbState->count = 0;
		if (mbDest != NULL)
			*wcSource = (wchar_t*)source;
	}

	return result;
}
// Returns the langinfo string for \a index. Only CODESET is answered here
// (with the charset name in fGivenCharset); every other index yields an
// empty string.
const char*
ICUCtypeData::GetLanginfo(int index)
{
	if (index == CODESET)
		return fGivenCharset;

	return "";
}
// Provides the converter that belongs to \a mbState in \a converterOut.
//
// The converter already stored in \a mbState is reused if the state's
// charset equals fGivenCharset and the converter pointer points into the
// first 8 bytes of mbState->data, i.e. the converter lives inside this very
// mbstate_t. (NOTE(review): the 8 byte window is presumably alignment slack
// for the clone - confirm.) Otherwise the state is cleared and the current
// converter is cloned into mbState->data.
//
// Returns B_OK on success, the error of _GetConverter(), B_ERROR if cloning
// failed, or B_NO_MEMORY if ICU did not place the clone inside mbState->data.
// \a converterOut is only set when B_OK is returned.
status_t
ICUCtypeData::_GetConverterForMbState(mbstate_t* mbState,
	UConverter*& converterOut)
{
	if (strcmp(mbState->charset, fGivenCharset) == 0
		&& (char*)mbState->converter >= mbState->data
		&& (char*)mbState->converter < mbState->data + 8) {
		// charset matches and the converter lives in *this* mbState, so we
		// can use it
		converterOut = (UConverter*)mbState->converter;
		return B_OK;
	}

	// the stored converter (if any) is of no use, get rid of it
	_DropConverterFromMbState(mbState);

	// fetch the converter for the current charset ...
	UConverter* icuConverter;
	status_t result = _GetConverter(icuConverter);
	if (result != B_OK)
		return result;

	// ... and clone it into the mbstate
	UErrorCode icuStatus = U_ZERO_ERROR;
	int32_t bufferSize = sizeof(mbState->data);
	UConverter* clone
		= ucnv_safeClone(icuConverter, mbState->data, &bufferSize, &icuStatus);

	if (clone == NULL || !U_SUCCESS(icuStatus))
		return B_ERROR;

	if ((char*)clone < mbState->data || (char*)clone >= mbState->data + 8) {
		// ICU did not use our buffer, so the clone has been allocated
		// elsewhere. We cannot keep it, and simply returning would leak it,
		// so close it first.
		ucnv_close(clone);
		return B_NO_MEMORY;
	}

	strlcpy(mbState->charset, fGivenCharset, sizeof(mbState->charset));
	mbState->converter = clone;

	converterOut = clone;

	return B_OK;
}
// Closes the converter stored in \a mbState, but only if its address lies
// within the first 8 bytes of mbState->data (i.e. inside this very
// mbstate_t), and then zeroes the whole state. Always returns B_OK.
status_t
ICUCtypeData::_DropConverterFromMbState(mbstate_t* mbState)
{
	char* converterAddress = (char*)mbState->converter;
	const bool livesInThisState = mbState->converter != NULL
		&& converterAddress >= mbState->data
		&& converterAddress < mbState->data + 8;

	if (livesInThisState)
		ucnv_close((UConverter*)mbState->converter);

	memset(mbState, 0, sizeof(mbstate_t));

	return B_OK;
}
}
}