Source code
Revision control
Copy as Markdown
Other Tools
/*
* Character Set Handling
* (C) 1999-2007 Jack Lloyd
*
* Botan is released under the Simplified BSD License (see license.txt)
*/
#include <botan/charset.h>
#include <botan/exceptn.h>
#include <botan/loadstor.h>
#include <cctype>
namespace Botan {
namespace {
/*
* Append the UTF-8 encoding of the code point c to s.
*
* Throws Decoding_Error if c lies in the surrogate range
* (0xD800..0xDFFF) or is larger than 0x10FFFF.
*/
void append_utf8_for(std::string& s, uint32_t c)
   {
   const bool is_surrogate = (c >= 0xD800 && c <= 0xDFFF);
   if(is_surrogate || c > 0x10FFFF)
      throw Decoding_Error("Invalid Unicode character");

   // Push the low 8 bits of the given value onto the output string
   auto emit = [&s](uint32_t value)
      {
      s.push_back(static_cast<char>(static_cast<uint8_t>(value)));
      };

   // Build a continuation byte (10xxxxxx) from the low six bits
   auto trail = [](uint32_t bits) -> uint32_t
      {
      return 0x80 | (bits & 0x3F);
      };

   if(c < 0x80)
      {
      // Single byte: 0xxxxxxx
      emit(c);
      return;
      }

   if(c < 0x800)
      {
      // Two bytes: 110xxxxx 10xxxxxx
      emit(0xC0 | (c >> 6));
      emit(trail(c));
      return;
      }

   if(c < 0x10000)
      {
      // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
      emit(0xE0 | (c >> 12));
      emit(trail(c >> 6));
      emit(trail(c));
      return;
      }

   // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   emit(0xF0 | (c >> 18));
   emit(trail(c >> 12));
   emit(trail(c >> 6));
   emit(trail(c));
   }
}
/*
* Convert a UCS-2 byte buffer to a UTF-8 string.
*
* len is the buffer size in bytes and must be even, otherwise a
* Decoding_Error is thrown. Each 16-bit unit is read with load_be and
* encoded via append_utf8_for, which rejects invalid code points.
*/
std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)
   {
   if((len & 1) != 0)
      throw Decoding_Error("Invalid length for UCS-2 string");

   std::string utf8;
   const size_t n_units = len >> 1;

   size_t idx = 0;
   while(idx < n_units)
      {
      append_utf8_for(utf8, load_be<uint16_t>(ucs2, idx));
      ++idx;
      }

   return utf8;
   }
/*
* Convert a UCS-4 byte buffer to a UTF-8 string.
*
* len is the buffer size in bytes and must be a multiple of four,
* otherwise a Decoding_Error is thrown. Each 32-bit unit is read with
* load_be and encoded via append_utf8_for, which rejects invalid code
* points.
*/
std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)
   {
   if((len & 3) != 0)
      throw Decoding_Error("Invalid length for UCS-4 string");

   std::string utf8;
   const size_t n_units = len >> 2;

   size_t idx = 0;
   while(idx < n_units)
      {
      append_utf8_for(utf8, load_be<uint32_t>(ucs4, idx));
      ++idx;
      }

   return utf8;
   }
/*
* Convert from UTF-8 to ISO 8859-1
*
* Every code point in the input must be at most U+00FF. Throws
* Decoding_Error if the input is truncated, uses an overlong encoding,
* has a malformed continuation byte, or contains a character that
* cannot be represented in Latin-1.
*/
std::string utf8_to_latin1(const std::string& utf8)
   {
   std::string iso8859;

   size_t position = 0;
   while(position != utf8.size())
      {
      const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);

      if(c1 <= 0x7F)
         {
         // Plain ASCII maps to itself
         iso8859 += static_cast<char>(c1);
         continue;
         }

      /*
      * Only the two-byte lead bytes 0xC0..0xC3 can introduce a code
      * point below U+0100. Larger lead bytes (and stray continuation
      * bytes) can never yield a Latin-1 character, so reject them here
      * rather than letting the decoded value wrap around in 8 bits.
      */
      if(c1 < 0xC0 || c1 > 0xC3)
         throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");

      if(position == utf8.size())
         throw Decoding_Error("UTF-8: sequence truncated");

      const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);

      // A continuation byte must have the form 10xxxxxx
      if((c2 & 0xC0) != 0x80)
         throw Decoding_Error("UTF-8: invalid continuation byte");

      // Decode into a wide integer so that no bits are silently dropped
      const uint32_t code_point =
         (static_cast<uint32_t>(c1 & 0x1F) << 6) | static_cast<uint32_t>(c2 & 0x3F);

      // Lead bytes 0xC0 and 0xC1 encode ASCII values in two bytes (overlong)
      if(code_point <= 0x7F)
         throw Decoding_Error("UTF-8: sequence longer than needed");

      iso8859 += static_cast<char>(static_cast<uint8_t>(code_point));
      }

   return iso8859;
   }
namespace Charset {
namespace {
/*
* Convert from UCS-2 to ISO 8859-1
*
* The input is consumed two bytes at a time; the first byte of each
* pair must be zero and the second byte becomes the output character.
* Throws Decoding_Error on an odd-length input or a non-zero first byte.
*/
std::string ucs2_to_latin1(const std::string& ucs2)
   {
   const size_t byte_count = ucs2.size();

   if((byte_count & 1) != 0)
      throw Decoding_Error("UCS-2 string has an odd number of bytes");

   std::string latin1;

   for(size_t pos = 0; pos < byte_count; pos += 2)
      {
      const uint8_t high = static_cast<uint8_t>(ucs2[pos]);
      const uint8_t low = static_cast<uint8_t>(ucs2[pos + 1]);

      // Anything with a non-zero high byte lies outside Latin-1
      if(high != 0)
         throw Decoding_Error("UCS-2 has non-Latin1 characters");

      latin1.push_back(static_cast<char>(low));
      }

   return latin1;
   }
/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes up to 0x7F are copied unchanged; every other byte is expanded
* to a two-byte UTF-8 sequence.
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;

   for(const char raw : iso8859)
      {
      const uint8_t byte = static_cast<uint8_t>(raw);

      if(byte > 0x7F)
         {
         // 110000xx 10xxxxxx: top two bits go in the lead byte
         const uint8_t lead = static_cast<uint8_t>(0xC0 | (byte >> 6));
         const uint8_t trail = static_cast<uint8_t>(0x80 | (byte & 0x3F));
         utf8.push_back(static_cast<char>(lead));
         utf8.push_back(static_cast<char>(trail));
         }
      else
         {
         utf8.push_back(static_cast<char>(byte));
         }
      }

   return utf8;
   }
}
/*
* Perform character set transcoding
*/
std::string transcode(const std::string& str,
Character_Set to, Character_Set from)
{
if(to == LOCAL_CHARSET)
to = LATIN1_CHARSET;
if(from == LOCAL_CHARSET)
from = LATIN1_CHARSET;
if(to == from)
return str;
if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
return latin1_to_utf8(str);
if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
return utf8_to_latin1(str);
if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
return ucs2_to_latin1(str);
throw Invalid_Argument("Unknown transcoding operation from " +
std::to_string(from) + " to " + std::to_string(to));
}
/*
* Check if a character represents a digit
*
* Returns true only for the ten characters '0' through '9'.
*/
bool is_digit(char c)
   {
   // The decimal digits are guaranteed to be contiguous in the
   // execution character set, so a range test is sufficient
   return (c >= '0' && c <= '9');
   }
/*
* Check if a character represents whitespace
*
* Only space, horizontal tab, line feed and carriage return count.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
/*
* Convert a character to a digit
*
* Maps '0'..'9' to 0..9 and throws Invalid_Argument for any other
* character.
*/
uint8_t char2digit(char c)
   {
   // Decimal digit characters are contiguous, so the offset from '0'
   // is the numeric value
   if(c >= '0' && c <= '9')
      return static_cast<uint8_t>(c - '0');

   throw Invalid_Argument("char2digit: Input is not a digit character");
   }
/*
* Convert a digit to a character
*
* Maps 0..9 to '0'..'9' and throws Invalid_Argument for any larger
* value.
*/
char digit2char(uint8_t b)
   {
   // Decimal digit characters are contiguous, so adding to '0' yields
   // the matching character
   if(b <= 9)
      return static_cast<char>('0' + b);

   throw Invalid_Argument("digit2char: Input is not a digit");
   }
/*
* Case-insensitive character comparison
*
* Both characters are lowered with std::tolower and compared.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char,
   // so convert before calling it
   const int lowered_a = std::tolower(static_cast<unsigned char>(a));
   const int lowered_b = std::tolower(static_cast<unsigned char>(b));

   return lowered_a == lowered_b;
   }
}
}