// charset.cpp (listing captured from a mozsearch page; browser UI text removed)

/*

* Character Set Handling

* (C) 1999-2007 Jack Lloyd

* Botan is released under the Simplified BSD License (see license.txt)

*/

#include <botan/charset.h>

#include <botan/exceptn.h>

#include <botan/loadstor.h>

#include <cctype>

namespace Botan {

namespace {

void append_utf8_for(std::string& s, uint32_t c)

   if(c >= 0xD800 && c < 0xE000)

      throw Decoding_Error("Invalid Unicode character");

   if(c <= 0x7F)

      const uint8_t b0 = static_cast<uint8_t>(c);

      s.push_back(static_cast<char>(b0));

   else if(c <= 0x7FF)

      const uint8_t b0 = 0xC0 | static_cast<uint8_t>(c >> 6);

      const uint8_t b1 = 0x80 | static_cast<uint8_t>(c & 0x3F);

      s.push_back(static_cast<char>(b0));

      s.push_back(static_cast<char>(b1));

   else if(c <= 0xFFFF)

      const uint8_t b0 = 0xE0 | static_cast<uint8_t>(c >> 12);

      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);

      const uint8_t b2 = 0x80 | static_cast<uint8_t>(c & 0x3F);

      s.push_back(static_cast<char>(b0));

      s.push_back(static_cast<char>(b1));

      s.push_back(static_cast<char>(b2));

   else if(c <= 0x10FFFF)

      const uint8_t b0 = 0xF0 | static_cast<uint8_t>(c >> 18);

      const uint8_t b1 = 0x80 | static_cast<uint8_t>((c >> 12) & 0x3F);

      const uint8_t b2 = 0x80 | static_cast<uint8_t>((c >> 6) & 0x3F);

      const uint8_t b3 = 0x80 | static_cast<uint8_t>(c & 0x3F);

      s.push_back(static_cast<char>(b0));

      s.push_back(static_cast<char>(b1));

      s.push_back(static_cast<char>(b2));

      s.push_back(static_cast<char>(b3));

   else

      throw Decoding_Error("Invalid Unicode character");

std::string ucs2_to_utf8(const uint8_t ucs2[], size_t len)

   if(len % 2 != 0)

      throw Decoding_Error("Invalid length for UCS-2 string");

   const size_t chars = len / 2;

   std::string s;

   for(size_t i = 0; i != chars; ++i)

      const uint16_t c = load_be<uint16_t>(ucs2, i);

      append_utf8_for(s, c);

   return s;

std::string ucs4_to_utf8(const uint8_t ucs4[], size_t len)

   if(len % 4 != 0)

      throw Decoding_Error("Invalid length for UCS-4 string");

   const size_t chars = len / 4;

   std::string s;

   for(size_t i = 0; i != chars; ++i)

      const uint32_t c = load_be<uint32_t>(ucs4, i);

      append_utf8_for(s, c);

   return s;

/*

* Convert from UTF-8 to ISO 8859-1

*/

std::string utf8_to_latin1(const std::string& utf8)

   std::string iso8859;

   size_t position = 0;

   while(position != utf8.size())

      const uint8_t c1 = static_cast<uint8_t>(utf8[position++]);

      if(c1 <= 0x7F)

         iso8859 += static_cast<char>(c1);

      else if(c1 >= 0xC0 && c1 <= 0xC7)

         if(position == utf8.size())

            throw Decoding_Error("UTF-8: sequence truncated");

         const uint8_t c2 = static_cast<uint8_t>(utf8[position++]);

         const uint8_t iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);

         if(iso_char <= 0x7F)

            throw Decoding_Error("UTF-8: sequence longer than needed");

         iso8859 += static_cast<char>(iso_char);

      else

         throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");

   return iso8859;

namespace Charset {

namespace {

/*

* Convert from UCS-2 to ISO 8859-1

*/

std::string ucs2_to_latin1(const std::string& ucs2)

   if(ucs2.size() % 2 == 1)

      throw Decoding_Error("UCS-2 string has an odd number of bytes");

   std::string latin1;

   for(size_t i = 0; i != ucs2.size(); i += 2)

      const uint8_t c1 = ucs2[i];

      const uint8_t c2 = ucs2[i+1];

      if(c1 != 0)

         throw Decoding_Error("UCS-2 has non-Latin1 characters");

      latin1 += static_cast<char>(c2);

   return latin1;

/*
* Convert from ISO 8859-1 to UTF-8
*
* Bytes 0x00..0x7F are copied unchanged; bytes 0x80..0xFF are expanded
* to the two byte form 110000xx 10xxxxxx. This conversion cannot fail.
*/
std::string latin1_to_utf8(const std::string& iso8859)
   {
   std::string utf8;
   utf8.reserve(iso8859.size()); // lower bound; grows if high bytes occur

   for(const char ch : iso8859)
      {
      const uint8_t c = static_cast<uint8_t>(ch);

      if(c <= 0x7F)
         {
         utf8 += static_cast<char>(c);
         }
      else
         {
         utf8 += static_cast<char>(0xC0 | (c >> 6));
         utf8 += static_cast<char>(0x80 | (c & 0x3F));
         }
      }

   return utf8;
   }

/*

* Perform character set transcoding

*/

std::string transcode(const std::string& str,

                      Character_Set to, Character_Set from)

   if(to == LOCAL_CHARSET)

      to = LATIN1_CHARSET;

   if(from == LOCAL_CHARSET)

      from = LATIN1_CHARSET;

   if(to == from)

      return str;

   if(from == LATIN1_CHARSET && to == UTF8_CHARSET)

      return latin1_to_utf8(str);

   if(from == UTF8_CHARSET && to == LATIN1_CHARSET)

      return utf8_to_latin1(str);

   if(from == UCS2_CHARSET && to == LATIN1_CHARSET)

      return ucs2_to_latin1(str);

   throw Invalid_Argument("Unknown transcoding operation from " +

                          std::to_string(from) + " to " + std::to_string(to));

/*
* Check if a character represents a digit
*
* Returns true only for '0' through '9'. The C++ standard guarantees
* that the decimal digits have consecutive increasing values, so a
* range check is equivalent to comparing against each digit.
*/
bool is_digit(char c)
   {
   return (c >= '0' && c <= '9');
   }

/*
* Check if a character represents whitespace
*
* Returns true only for space, horizontal tab, line feed and carriage
* return. Vertical tab and form feed are not treated as whitespace.
*/
bool is_space(char c)
   {
   return (c == ' ' || c == '\t' || c == '\n' || c == '\r');
   }

/*

* Convert a character to a digit

*/

uint8_t char2digit(char c)

   switch(c)

      case '0': return 0;

      case '1': return 1;

      case '2': return 2;

      case '3': return 3;

      case '4': return 4;

      case '5': return 5;

      case '6': return 6;

      case '7': return 7;

      case '8': return 8;

      case '9': return 9;

   throw Invalid_Argument("char2digit: Input is not a digit character");

/*

* Convert a digit to a character

*/

char digit2char(uint8_t b)

   switch(b)

      case 0: return '0';

      case 1: return '1';

      case 2: return '2';

      case 3: return '3';

      case 4: return '4';

      case 5: return '5';

      case 6: return '6';

      case 7: return '7';

      case 8: return '8';

      case 9: return '9';

   throw Invalid_Argument("digit2char: Input is not a digit");

/*
* Case-insensitive character comparison
*
* Returns true if a and b are equal after lower-casing with
* std::tolower, which follows the currently installed C locale.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char (or
   // EOF); casting first avoids undefined behaviour for negative chars
   return (std::tolower(static_cast<unsigned char>(a)) ==
           std::tolower(static_cast<unsigned char>(b)));
   }