Source code
Revision control
Copy as Markdown
Other Tools
// UTFConvert.cpp↩
↩
#include "StdAfx.h"↩
↩
#include "MyTypes.h"↩
#include "UTFConvert.h"↩
↩
// On Windows builds, flag wchar_t as a 16-bit type. The UTF-16 -> UTF-8
// routines below test this flag to leave out the code paths that only make
// sense for source values wider than 16 bits.
#if defined(_WIN32)
  #define _WCHART_IS_16BIT 1
#endif
↩
/*↩
_UTF8_START(n) - is a base value for start byte (head), if there are (n) additional bytes after start byte↩
↩
n : _UTF8_START(n) : Bits of code point↩
↩
0 : 0x80 : : unused↩
1 : 0xC0 : 11 :↩
2 : 0xE0 : 16 : Basic Multilingual Plane↩
3 : 0xF0 : 21 : Unicode space↩
4 : 0xF8 : 26 :
5 : 0xFC : 31 : UCS-4↩
6 : 0xFE : 36 : We can use it, if we want to encode any 32-bit value↩
7 : 0xFF :↩
*/↩
↩
// Base value of a head (start) byte that is followed by (n) additional bytes:
//   n = 1 -> 0xC0, 2 -> 0xE0, 3 -> 0xF0, 4 -> 0xF8, 5 -> 0xFC, 6 -> 0xFE
#define _UTF8_START(n) (0x100 - (1 << (7 - (n))))

// One step of head-byte classification. It expects two variables in the
// enclosing scope: (c) holds the head byte and (numBytes) receives the result.
// If (c) is below the base value for (n + 1) additional bytes, the head
// announces (n) additional bytes and (c) is reduced to its payload bits.
// The expansion is a bare `if` statement so that steps can be chained with `else`.
#define _UTF8_HEAD_PARSE2(n) \
  if (c < _UTF8_START((n) + 1)) \
  { \
    numBytes = (n); \
    c -= _UTF8_START(n); \
  }

// Full classification chain for heads with 1..5 additional bytes.
// The user must supply the final `else` branch; it is reached for the
// head bytes 0xFE and 0xFF.
// A sixth step, `else _UTF8_HEAD_PARSE2(6)`, is intentionally not enabled.
#define _UTF8_HEAD_PARSE \
         _UTF8_HEAD_PARSE2(1) \
    else _UTF8_HEAD_PARSE2(2) \
    else _UTF8_HEAD_PARSE2(3) \
    else _UTF8_HEAD_PARSE2(4) \
    else _UTF8_HEAD_PARSE2(5)
↩
bool CheckUTF8(const char *src, bool allowReduced) throw()↩
{↩
for (;;)↩
{↩
Byte c = *src++;↩
if (c == 0)↩
return true;↩
↩
if (c < 0x80)↩
continue;↩
if (c < 0xC0) // (c < 0xC0 + 2) // if we support only optimal encoding chars↩
return false;↩
↩
unsigned numBytes;↩
_UTF8_HEAD_PARSE↩
else↩
return false;↩
↩
UInt32 val = c;↩
↩
do↩
{↩
Byte c2 = *src++;↩
if (c2 < 0x80 || c2 >= 0xC0)↩
return allowReduced && c2 == 0;↩
val <<= 6;↩
val |= (c2 - 0x80);↩
}↩
while (--numBytes);↩
↩
if (val >= 0x110000)↩
return false;↩
}↩
}↩
↩
↩
// Error step for the UTF-8 decoding loop. It expects (dest), (destPos) and
// (ok) in the enclosing scope and must be used inside a loop:
//   - writes the replacement character 0xFFFD if a destination buffer exists,
//   - counts one output unit,
//   - marks the conversion as failed,
//   - restarts the enclosing loop.
// The expansion has to stay a plain brace block (not do { } while (0)),
// because `continue` must apply to the caller's loop.
#define _ERROR_UTF8 \
  { \
    if (dest) \
      dest[destPos] = (wchar_t)0xFFFD; \
    destPos++; \
    ok = false; \
    continue; \
  }
↩
static bool Utf8_To_Utf16(wchar_t *dest, size_t *destLen, const char *src, const char *srcLim) throw()↩
{↩
size_t destPos = 0;↩
bool ok = true;↩
↩
for (;;)↩
{↩
Byte c;↩
if (src == srcLim)↩
{↩
*destLen = destPos;↩
return ok;↩
}↩
c = *src++;↩
↩
if (c < 0x80)↩
{↩
if (dest)↩
dest[destPos] = (wchar_t)c;↩
destPos++;↩
continue;↩
}↩
if (c < 0xC0)↩
_ERROR_UTF8↩
↩
unsigned numBytes;↩
_UTF8_HEAD_PARSE↩
else↩
_ERROR_UTF8↩
↩
UInt32 val = c;↩
↩
do↩
{↩
Byte c2;↩
if (src == srcLim)↩
break;↩
c2 = *src;↩
if (c2 < 0x80 || c2 >= 0xC0)↩
break;↩
src++;↩
val <<= 6;↩
val |= (c2 - 0x80);↩
}↩
while (--numBytes);↩
↩
if (numBytes != 0)↩
_ERROR_UTF8↩
↩
if (val < 0x10000)↩
{↩
if (dest)↩
dest[destPos] = (wchar_t)val;↩
destPos++;↩
}↩
else↩
{↩
val -= 0x10000;↩
if (val >= 0x100000)↩
_ERROR_UTF8↩
if (dest)↩
{↩
dest[destPos + 0] = (wchar_t)(0xD800 + (val >> 10));↩
dest[destPos + 1] = (wchar_t)(0xDC00 + (val & 0x3FF));↩
}↩
destPos += 2;↩
}↩
}↩
}↩
↩
// First value that no longer fits into a sequence with (n) additional bytes:
//   n = 1 -> 1 << 11, 2 -> 1 << 16, 3 -> 1 << 21, 4 -> 1 << 26, 5 -> 1 << 31
#define _UTF8_RANGE(n) (((UInt32)1) << ((n) * 5 + 6))

// Head byte of a sequence with (n) additional bytes for the value (val):
// the _UTF8_START(n) marker plus the bits of (val) above the low 6 * n bits.
// (val) is parenthesized so that an expression argument is shifted as a whole.
#define _UTF8_HEAD(n, val) ((char)(_UTF8_START(n) + ((val) >> (6 * (n)))))

// Continuation byte number (n), counted from the end of the sequence
// (n = 0 is the last byte): 0x80 plus the matching 6-bit group of (val).
#define _UTF8_CHAR(n, val) ((char)(0x80 + (((val) >> (6 * (n))) & 0x3F)))
↩
static size_t Utf16_To_Utf8_Calc(const wchar_t *src, const wchar_t *srcLim)↩
{↩
size_t size = srcLim - src;↩
for (;;)↩
{↩
if (src == srcLim)↩
return size;↩
↩
UInt32 val = *src++;↩
↩
if (val < 0x80)↩
continue;↩
↩
if (val < _UTF8_RANGE(1))↩
{↩
size++;↩
continue;↩
}↩
↩
if (val >= 0xD800 && val < 0xDC00 && src != srcLim)↩
{↩
UInt32 c2 = *src;↩
if (c2 >= 0xDC00 && c2 < 0xE000)↩
{↩
src++;↩
size += 2;↩
continue;↩
}↩
}↩
↩
#ifdef _WCHART_IS_16BIT↩
↩
size += 2;↩
↩
#else↩
↩
if (val < _UTF8_RANGE(2)) size += 2;↩
else if (val < _UTF8_RANGE(3)) size += 3;↩
else if (val < _UTF8_RANGE(4)) size += 4;↩
else if (val < _UTF8_RANGE(5)) size += 5;↩
else size += 6;↩
↩
#endif↩
}↩
}↩
↩
static char *Utf16_To_Utf8(char *dest, const wchar_t *src, const wchar_t *srcLim)↩
{↩
for (;;)↩
{↩
if (src == srcLim)↩
return dest;↩
↩
UInt32 val = *src++;↩
↩
if (val < 0x80)↩
{↩
*dest++ = (char)val;↩
continue;↩
}↩
↩
if (val < _UTF8_RANGE(1))↩
{↩
dest[0] = _UTF8_HEAD(1, val);↩
dest[1] = _UTF8_CHAR(0, val);↩
dest += 2;↩
continue;↩
}↩
↩
if (val >= 0xD800 && val < 0xDC00 && src != srcLim)↩
{↩
UInt32 c2 = *src;↩
if (c2 >= 0xDC00 && c2 < 0xE000)↩
{↩
src++;↩
val = (((val - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000;↩
dest[0] = _UTF8_HEAD(3, val);↩
dest[1] = _UTF8_CHAR(2, val);↩
dest[2] = _UTF8_CHAR(1, val);↩
dest[3] = _UTF8_CHAR(0, val);↩
dest += 4;↩
continue;↩
}↩
}↩
↩
#ifndef _WCHART_IS_16BIT↩
if (val < _UTF8_RANGE(2))↩
#endif↩
{↩
dest[0] = _UTF8_HEAD(2, val);↩
dest[1] = _UTF8_CHAR(1, val);↩
dest[2] = _UTF8_CHAR(0, val);↩
dest += 3;↩
continue;↩
}↩
↩
#ifndef _WCHART_IS_16BIT↩
↩
UInt32 b;↩
unsigned numBits;↩
if (val < _UTF8_RANGE(3)) { numBits = 6 * 3; b = _UTF8_HEAD(3, val); }↩
else if (val < _UTF8_RANGE(4)) { numBits = 6 * 4; b = _UTF8_HEAD(4, val); }↩
else if (val < _UTF8_RANGE(5)) { numBits = 6 * 5; b = _UTF8_HEAD(5, val); }↩
else { numBits = 6 * 6; b = _UTF8_START(6); }↩
↩
*dest++ = (Byte)b;↩
↩
do↩
{↩
numBits -= 6;↩
*dest++ = (char)(0x80 + ((val >> numBits) & 0x3F));↩
}↩
while (numBits != 0);↩
↩
#endif↩
}↩
}↩
↩
bool ConvertUTF8ToUnicode(const AString &src, UString &dest)↩
{↩
dest.Empty();↩
size_t destLen = 0;↩
Utf8_To_Utf16(NULL, &destLen, src, src.Ptr(src.Len()));↩
bool res = Utf8_To_Utf16(dest.GetBuf((unsigned)destLen), &destLen, src, src.Ptr(src.Len()));↩
dest.ReleaseBuf_SetEnd((unsigned)destLen);↩
return res;↩
}↩
↩
void ConvertUnicodeToUTF8(const UString &src, AString &dest)↩
{↩
dest.Empty();↩
size_t destLen = Utf16_To_Utf8_Calc(src, src.Ptr(src.Len()));↩
Utf16_To_Utf8(dest.GetBuf((unsigned)destLen), src, src.Ptr(src.Len()));↩
dest.ReleaseBuf_SetEnd((unsigned)destLen);↩
}↩