/*
 * Copyright 2004-2010, Haiku, Inc.
 * Distributed under the terms of the MIT License.
 */
#ifndef _UTF8_FUNCTIONS_H
#define _UTF8_FUNCTIONS_H
#include <SupportDefs.h>
/*!	Tells whether \a ch has the bit pattern 10xxxxxx, i.e. whether it is a
	UTF-8 continuation byte that can only appear inside a multibyte sequence.

	\param ch The byte to classify.
	\return \c true for a continuation byte, \c false for anything else
		(ASCII bytes and multibyte lead bytes alike).
*/
static inline bool
IsInsideGlyph(uchar ch)
{
	// Only the two most significant bits decide: "10" marks a continuation.
	const int kTopTwoBits = 0xc0;
	const int kContinuationTag = 0x80;

	return (ch & kTopTwoBits) == kContinuationTag;
}
/*!	Returns the number of bytes occupied by the character starting at
	\a text, without any checking.

	The first byte is always counted; the scan then continues over every
	following continuation byte. Because the first byte is skipped before
	anything is inspected, calling this on a NULL pointer or on the
	terminating 0 of a string reads memory it should not - callers have to
	rule both out beforehand.

	\param text Pointer to the first byte of a character.
	\return Distance in bytes to the next byte that is not a continuation byte.
*/
static inline uint32
UTF8NextCharLenUnsafe(const char *text)
{
	// Step over the lead byte unconditionally, then over its continuations.
	const char *end = text + 1;
	while (IsInsideGlyph(*end))
		end++;

	return end - text;
}
/*!	Returns the byte length of the character starting at \a text.

	\param text Pointer into a 0-terminated UTF-8 string; may be NULL.
	\return 0 if \a text is NULL or points at the terminating 0, otherwise
		the result of UTF8NextCharLenUnsafe().
*/
static inline uint32
UTF8NextCharLen(const char *text)
{
	// Only hand over to the unchecked variant when there is a byte to skip.
	if (text != NULL && *text != 0)
		return UTF8NextCharLenUnsafe(text);

	return 0;
}
/*!	Returns the byte length of the character starting at \a bytes, checking
	that the whole sequence is present within \a length bytes.

	The lead byte determines the expected sequence length (2 to 6 bytes for
	multibyte characters); every following byte of the sequence must be a
	continuation byte. Only the structure is checked - the decoded value is
	not inspected.

	\param bytes Pointer to the first byte of the character; may be NULL.
	\param length Number of bytes available at \a bytes.
	\return 1 for a single byte below 0x80, the sequence length for a
		complete multibyte sequence, and 0 in every other case: NULL
		pointer, zero \a length, a 0 byte, a continuation byte in lead
		position, a lead byte of 0xfe or 0xff, a sequence longer than
		\a length, or a sequence whose trailing bytes are not all
		continuation bytes.
*/
static inline uint32
UTF8NextCharLen(const char *bytes, size_t length)
{
	if (bytes == NULL || length == 0)
		return 0;

	// Look at the lead byte as an unsigned value so it can be range-checked.
	const int lead = bytes[0] & 0xff;

	if (lead == 0)
		return 0;
	if (lead < 0x80) {
		// Single byte character.
		return 1;
	}
	if (lead < 0xc0) {
		// A continuation byte cannot start a character.
		return 0;
	}
	if (lead >= 0xfe) {
		// Would announce a sequence of seven or more bytes.
		return 0;
	}

	// The count of leading one bits in the lead byte is the sequence length.
	size_t bytesExpected;
	if (lead < 0xe0)
		bytesExpected = 2;
	else if (lead < 0xf0)
		bytesExpected = 3;
	else if (lead < 0xf8)
		bytesExpected = 4;
	else if (lead < 0xfc)
		bytesExpected = 5;
	else
		bytesExpected = 6;

	// The buffer has to hold the complete sequence.
	if (length < bytesExpected)
		return 0;

	// The lead byte is known to be fine; all others must be continuations.
	const char *end = bytes + bytesExpected;
	for (const char *trail = bytes + 1; trail != end; trail++) {
		if (!IsInsideGlyph(*trail))
			return 0;
	}

	return bytesExpected;
}
/*!	Returns the byte length of the character that ends right before \a text.

	The scan walks backwards from \a text until it finds a byte that is not a
	continuation byte, or until it arrives at \a limit, whichever comes
	first. It stops only when the position equals \a limit exactly, so
	\a limit is expected to lie at or before \a text in the same buffer.

	\param text Position just past the character to measure; may be NULL.
	\param limit Lowest position the scan may reach; may be NULL.
	\return Number of bytes stepped back; 0 if either pointer is NULL or
		\a text equals \a limit.
*/
static inline uint32
UTF8PreviousCharLen(const char *text, const char *limit)
{
	if (text == NULL || limit == NULL)
		return 0;

	const char *start = text;
	while (start != limit) {
		start--;
		if (!IsInsideGlyph(*start)) {
			// Reached the first byte of the character.
			break;
		}
	}

	return text - start;
}
/*!	UTF8CountBytes gets the length (in bytes) of a UTF8 string. Up to
	numChars characters are read. If numChars is a negative value it is ignored
	and the string is read up to the terminating 0.
*/
/*!	\param bytes 0-terminated UTF-8 string; may be NULL.
	\param numChars Maximum number of characters to measure. A negative value
		is replaced by INT_MAX, which makes the terminating 0 the effective
		limit.
	\return Number of bytes covered by the measured characters, not counting
		the terminating 0; 0 if \a bytes is NULL.
*/
static inline uint32
UTF8CountBytes(const char *bytes, int32 numChars)
{
	if (bytes == NULL)
		return 0;

	int32 charsLeft = numChars < 0 ? INT_MAX : numChars;

	const char *cursor = bytes;
	for (; *cursor != '\0'; cursor++) {
		// Every byte that is not a continuation byte begins a new character
		// and uses up one unit of the budget. Once the budget is exhausted,
		// stop in front of that byte.
		if ((*cursor & 0xc0) != 0x80 && --charsLeft < 0)
			break;
	}

	return cursor - bytes;
}
/*!	UTF8CountChars gets the length (in characters) of a UTF8 string. Up to
	numBytes bytes are read. If numBytes is a negative value it is ignored
	and the string is read up to the terminating 0.
*/
/*!	\param bytes UTF-8 data; may be NULL. Has to be 0-terminated when
		\a numBytes is negative.
	\param numBytes Maximum number of bytes to inspect. A negative value
		means "no limit": the data is read up to the terminating 0.
	\return Number of bytes inspected that are not continuation bytes, i.e.
		the number of characters that start within the inspected range;
		0 if \a bytes is NULL.
*/
static inline uint32
UTF8CountChars(const char *bytes, int32 numBytes)
{
	if (bytes == NULL)
		return 0;

	uint32 length = 0;

	if (numBytes < 0) {
		// Unbounded: only the terminating 0 ends the scan.
		for (; bytes[0] != '\0'; bytes++) {
			if ((bytes[0] & 0xc0) != 0x80)
				length++;
		}
		return length;
	}

	// Bounded: test the position against the end before dereferencing, so
	// that no byte at or beyond bytes + numBytes is ever read. This keeps
	// the function usable on buffers that are not 0-terminated and avoids
	// forming a pointer in front of the buffer when numBytes is 0.
	const char *end = bytes + numBytes;
	for (; bytes < end && bytes[0] != '\0'; bytes++) {
		if ((bytes[0] & 0xc0) != 0x80)
			length++;
	}

	return length;
}
/*!	UTF8ToCharCode converts the input that includes potential multibyte chars
	to UTF-32 char codes that can be used by FreeType. The string pointer is
	then advanced to the next character in the string. In case the terminating
	0 is reached, the string pointer is not advanced anymore and nulls are
	returned. This makes it safe against overruns and enables streamed
	processing of UTF8 strings.
*/
/*!	Decodes one character from a UTF-8 string and moves the caller's read
	position forward.

	Sequences of up to six bytes are accepted. The decoded value itself is
	not checked in any way.

	\param bytes In/out: address of the read position. It is advanced past
		every byte that was consumed. It is left untouched when it points at
		the terminating 0, and it is never moved past a byte that is not a
		continuation byte while a multibyte sequence is being collected.
	\return The decoded value. 0 if the read position is at the terminating 0
		or if the terminating 0 cuts a multibyte sequence short. 0xfffd if
		the read position is at a continuation byte, at a lead byte of 0xfe
		or 0xff, or if a multibyte sequence is cut short by any other byte.
*/
static inline uint32
UTF8ToCharCode(const char **bytes)
{
	#define UTF8_SUBSTITUTE_CHARACTER	0xfffd

	const uint32 lead = (uint32)((*bytes)[0] & 0xff);

	if (lead < 0x80) {
		// Single byte character; never step over the terminating 0.
		if (lead != '\0')
			(*bytes)++;
		return lead;
	}

	// Whatever follows, the byte just read is consumed.
	(*bytes)++;

	if (lead < 0xc0) {
		// A continuation byte without a lead byte in front of it.
		return UTF8_SUBSTITUTE_CHARACTER;
	}
	if (lead >= 0xfe) {
		// Would announce a sequence of seven or more bytes.
		return UTF8_SUBSTITUTE_CHARACTER;
	}

	// The number of leading one bits tells how many continuation bytes
	// follow; the bits below the first zero bit are the start of the value.
	uint32 result;
	int pending;
	if (lead < 0xe0) {
		result = lead & 0x1f;
		pending = 1;
	} else if (lead < 0xf0) {
		result = lead & 0x0f;
		pending = 2;
	} else if (lead < 0xf8) {
		result = lead & 0x07;
		pending = 3;
	} else if (lead < 0xfc) {
		result = lead & 0x03;
		pending = 4;
	} else {
		result = lead & 0x01;
		pending = 5;
	}

	for (; pending > 0; pending--) {
		const char next = (*bytes)[0];
		if ((next & 0xc0) != 0x80) {
			// The sequence is incomplete. The offending byte is left in
			// place for the next call; a terminating 0 is reported as the
			// end of the string rather than as a bad character.
			if (next == '\0')
				return 0x00;
			return UTF8_SUBSTITUTE_CHARACTER;
		}

		// Each continuation byte contributes its low six bits.
		result = (result << 6) + (next & 0x3f);
		(*bytes)++;
	}

	return result;

	#undef UTF8_SUBSTITUTE_CHARACTER
}
#endif