DOMtoATK.h - mozsearch

Enable keyboard shortcuts

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

/* vim: set ts=2 et sw=2 tw=80: */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include <glib.h>

#include <cstdint>

#include "mozilla/a11y/HyperTextAccessibleBase.h"

#include "nsCharTraits.h"

#include "nsString.h"

/**

 * ATK offsets are counted in unicode codepoints, while DOM offsets are counted

 * in UTF-16 code units.  That makes a difference for non-BMP characters,

 * which need two UTF-16 code units to be represented (a pair of surrogates),

 * while they are just one unicode character.

 * To keep synchronization between ATK offsets (unicode codepoints) and DOM

 * offsets (UTF-16 code units), after translation from UTF-16 to UTF-8 we add a

 * BOM after each non-BMP character (which would otherwise use 2 UTF-16

 * code units for only 1 unicode codepoint).

 * BOMs (Byte Order Marks, U+FEFF, also known as ZERO WIDTH NO-BREAK SPACE, but

 * that usage is deprecated) normally only appear at the beginning of unicode

 * files, but their occurrence within text (notably after cut&paste) is not

 * uncommon, and they are thus treated as non-text.

 * Since the selection requested through ATK may not contain both surrogates

 * at the ends of the selection, we need to fetch one more UTF-16 code unit

 * on both sides, and get rid of them before returning the string to ATK. The

 * ATKStringConverterHelper class maintains this; NewATKString should be used

 * to call it properly.

 * In the end,

 * - if the start is between the high and low surrogates, the UTF-8 result

 * includes a BOM from it but not the character

 * - if the end is between the high and low surrogates, the UTF-8 result

 * includes the character but *not* the BOM

 * - all non-BMP characters that are fully in the string are in the UTF-8 result

 * as character followed by BOM

*/

namespace mozilla {

namespace a11y {

namespace DOMtoATK {

/**
 * Converts a string of accessible text into an ATK gchar* string (by adding
 * BOMs). This can be used when offsets do not need to be adjusted because
 * the ends of the string cannot fall between surrogates.
 *
 * NOTE(review): the returned gchar* is presumably heap-allocated and owned by
 * the caller (to be released with g_free) -- confirm against the definition.
 */
gchar* Convert(const nsAString& aStr);

/**
 * Add a BOM after each non-BMP character.
 *
 * NOTE(review): judging by the parameter names, aSource is the input text and
 * aDest receives the result; whether aDest is overwritten or appended to is
 * not visible from this declaration -- confirm against the definition.
 */
void AddBOMs(nsACString& aDest, const nsACString& aSource);

class ATKStringConverterHelper {

 public:

  ATKStringConverterHelper(void)

#ifdef DEBUG

        mAdjusted(false),

#endif

        mStartShifted(false),

        mEndShifted(false) {

/**

   * In order to properly get non-BMP values, offsets need to be changed

   * to get one character more on each end, so that ConvertUTF16toUTF8 can

   * convert surrogates even if the originally requested offsets fall between

   * them.

*/

  void AdjustOffsets(gint* aStartOffset, gint* aEndOffset, gint count);

/**

   * Converts a string of accessible text with adjusted offsets into ATK

   * gchar* string (by adding BOMs).  Note, AdjustOffsets has to be called

   * before getting the text passed to this.

*/

  gchar* ConvertAdjusted(const nsAString& aStr);

 private:

/**

   * Remove the additional characters requested by PrepareUTF16toUTF8.

*/

  gchar* FinishUTF16toUTF8(nsCString& aStr);

#ifdef DEBUG

  bool mAdjusted;

#endif

  bool mStartShifted;

  bool mEndShifted;

};

/**
 * Get text from aAccessible, using ATKStringConverterHelper to properly
 * introduce appropriate BOMs.
 *
 * The requested offsets are first widened with
 * ATKStringConverterHelper::AdjustOffsets, the text is fetched with the
 * widened offsets, and ConvertAdjusted then produces the final string.
 *
 * @param aAccessible  hypertext accessible to read the text from.
 * @param aStartOffset requested start offset.
 * @param aEndOffset   requested end offset.
 * @return g_strdup("") when the fetched text is empty, otherwise the result
 *         of ATKStringConverterHelper::ConvertAdjusted.
 */
inline gchar* NewATKString(HyperTextAccessibleBase* aAccessible,
                           gint aStartOffset, gint aEndOffset) {
  gint startOffset = aStartOffset, endOffset = aEndOffset;
  ATKStringConverterHelper converter;
  converter.AdjustOffsets(&startOffset, &endOffset,
                          static_cast<gint>(aAccessible->CharacterCount()));

  nsAutoString str;
  aAccessible->TextSubstring(startOffset, endOffset, str);

  if (str.Length() == 0) {
    // Bogus offsets, or empty string, either way we do not need conversion.
    return g_strdup("");
  }

  return converter.ConvertAdjusted(str);
}

/**
 * Get a character from aAccessible, fetching more data as appropriate to
 * properly get non-BMP characters or a BOM as appropriate.
 *
 * @param aAccessible hypertext accessible to read from.
 * @param aOffset     offset of the UTF-16 code unit to read.
 * @return - U+FEFF (BOM) when the code unit at aOffset is a low (trailing)
 *           surrogate;
 *         - the combined code point when it is a high (leading) surrogate
 *           followed by a low surrogate;
 *         - U+FFFD when a high surrogate is not followed by a low surrogate;
 *         - the code unit itself otherwise.
 */
inline gunichar ATKCharacter(HyperTextAccessibleBase* aAccessible,
                             gint aOffset) {
  // char16_t is unsigned short in Mozilla, gunichar is guint32 in glib.
  gunichar character = static_cast<gunichar>(aAccessible->CharAt(aOffset));

  if (NS_IS_LOW_SURROGATE(character)) {
    // Trailing surrogate, return BOM instead.
    return 0xFEFF;
  }

  if (NS_IS_HIGH_SURROGATE(character)) {
    // Heading surrogate, get the trailing surrogate and combine them.
    gunichar characterLow =
        static_cast<gunichar>(aAccessible->CharAt(aOffset + 1));

    if (!NS_IS_LOW_SURROGATE(characterLow)) {
      // It should have been a trailing surrogate... Flag the error.
      return 0xFFFD;
    }

    return SURROGATE_TO_UCS4(character, characterLow);
  }

  return character;
}

}  // namespace DOMtoATK

}  // namespace a11y

}  // namespace mozilla