UrlbarTokenizer.sys.mjs

mozilla-central/browser/components/urlbar/UrlbarTokenizer.sys.mjs (file symbol)

Enable keyboard shortcuts

Source code

File a bug in Firefox :: Address Bar

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This module exports a tokenizer to be used by the urlbar model.
 * Emitted tokens are objects in the shape { type, value, lowerCaseValue },
 * where type is one of UrlbarTokenizer.TYPE and lowerCaseValue is the
 * locale-lowercased form of value.
 */

// Holder object for dependencies that are registered below through the
// ChromeUtils lazy-getter helpers and read as `lazy.<name>` at use sites.
const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
  UrlbarPrefs: "resource:///modules/UrlbarPrefs.sys.mjs",
  UrlbarUtils: "resource:///modules/UrlbarUtils.sys.mjs",
});

// Logger used by the tokenizer; messages are prefixed with "Tokenizer".
ChromeUtils.defineLazyGetter(lazy, "logger", function () {
  return lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" });
});

// Localization instance backed by browser/browser.ftl, used to format the
// restrict keyword strings.
ChromeUtils.defineLazyGetter(
  lazy,
  "gFluentStrings",
  () => new Localization(["browser/browser.ftl"])
);

/*
 * Cache of localized restrict keywords: each key is a restrict token and
 * each value is the corresponding localized restrict keyword.
 * E.g. "*" maps to "Bookmarks".
 * It starts empty and is filled by UrlbarTokenizer.loadL10nRestrictKeywords().
 */
let tokenToKeyword = new Map();

export var UrlbarTokenizer = {
  // Regex matching on whitespaces.
  REGEXP_SPACES: /\s+/,
  REGEXP_SPACES_START: /^\s+/,

  // Regex used to guess url-like strings.
  // These are not expected to be 100% correct, we accept some user mistypes
  // and we're unlikely to be able to cover 100% of the cases.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
  REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
  REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
  REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
  // This accepts partial IPv4.
  REGEXP_HOSTPORT_INVALID_IP:
    /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
  // This only accepts complete IPv4.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3,}\d{1,3}(:\d+)?$/,
  // This accepts partial IPv6.
  REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
  REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+/=?^`{|}~.-]+@[\[\]A-Z0-9.-]+$/i,
  REGEXP_HAS_PORT: /:\d+$/,
  // Regex matching two or more consecutive percent encoded chars at the
  // beginning of a string.
  REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
  // Regex matching scheme and colon, plus, if present, up to two slashes.
  REGEXP_PREFIX: /^[a-z-]+:(?:\/){0,2}/i,

  // Token types. The RESTRICT_* names mirror the keys of RESTRICT below with
  // a "RESTRICT_" prefix; that naming is relied upon when mapping restriction
  // characters to types.
  TYPE: {
    TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an ip, a domain, but even just a single word used as host.
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages; or
  // to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused, for example # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },

  /**
   * The restriction characters from RESTRICT that will enter search mode.
   * The ACTION character is included only while the
   * "scotchBonnet.enableOverride" pref is truthy. A fresh Set is built on
   * every access.
   *
   * @returns {Set<string>} The set of restriction characters.
   */
  get SEARCH_MODE_RESTRICT() {
    const keys = [
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
    ];
    if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
      keys.push(this.RESTRICT.ACTION);
    }
    return new Set(keys);
  },

  /**
   * Formats one localized string per entry of UrlbarUtils.LOCAL_SEARCH_MODES
   * (message id `urlbar-search-mode-<source name>`) and stores each result in
   * the tokenToKeyword Map under that entry's restrict token.
   */
  async loadL10nRestrictKeywords() {
    let l10nKeywords = await lazy.gFluentStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}` };
      })
    );

    // The formatted values are consumed in the same order the ids were
    // requested, so the n-th keyword belongs to the n-th search mode.
    for (let { restrict } of lazy.UrlbarUtils.LOCAL_SEARCH_MODES) {
      tokenToKeyword.set(restrict, l10nKeywords.shift());
    }
  },

  /**
   * Gets the cached localized restrict keywords. If keywords are not cached
   * fetch the localized keywords first and then return the keywords.
   *
   * @returns {Promise<Map>} Resolves to the tokenToKeyword Map.
   */
  async getL10nRestrictKeywords() {
    if (tokenToKeyword.size === 0) {
      await this.loadL10nRestrictKeywords();
    }
    return tokenToKeyword;
  },

  /**
   * Returns whether the passed in token looks like a URL.
   * This is based on guessing and heuristics, that means if this function
   * returns false, it's surely not a URL, if it returns true, the result must
   * still be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @param {object} [options] Options object
   * @param {boolean} [options.requirePath] The url must have a path
   * @returns {boolean} whether the token looks like a URL.
   */
  looksLikeUrl(token, { requirePath = false } = {}) {
    if (token.length < 2) {
      return false;
    }
    // Ignore spaces and require path for the data: protocol.
    if (token.startsWith("data:")) {
      return token.length > 5;
    }
    if (this.REGEXP_SPACES.test(token)) {
      return false;
    }
    // If it starts with something that looks like a protocol, it's likely a url.
    if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
      return true;
    }
    // Guess path and prePath. At this point we should be analyzing strings not
    // having a protocol.
    let slashIndex = token.indexOf("/");
    let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
    if (!this.looksLikeOrigin(prePath, { ignoreKnownDomains: true })) {
      return false;
    }

    let path = slashIndex != -1 ? token.slice(slashIndex) : "";
    lazy.logger.debug("path", path);
    if (requirePath && !path) {
      return false;
    }
    // If there are both path and userinfo, it's likely a url.
    let atIndex = prePath.indexOf("@");
    let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
    if (path.length && userinfo.length) {
      return true;
    }

    // If the first character after the slash in the path is a letter, then the
    // token may be an "abc/def" url.
    if (/^\/[a-z]/i.test(path)) {
      return true;
    }
    // If the path contains special chars, it is likely a url.
    if (["%", "?", "#"].some(c => path.includes(c))) {
      return true;
    }

    // The above looksLikeOrigin call told us the prePath looks like an origin,
    // now we go into details checking some common origins.
    let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
    if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
      return true;
    }
    // ipv6 is very complex to support, just check for a few chars.
    if (
      this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
      ["[", "]", ":"].some(c => hostPort.includes(c))
    ) {
      return true;
    }
    if (Services.uriFixup.isDomainKnown(hostPort)) {
      return true;
    }
    return false;
  },

  /**
   * Returns whether the passed in token looks like an origin.
   * This is based on guessing and heuristics, that means if this function
   * returns false, it's surely not an origin, if it returns true, the result
   * must still be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @param {object} [options] Options object
   * @param {boolean} [options.ignoreKnownDomains] If true, the origin doesn't have to be
   *        in the known domain list
   * @param {boolean} [options.noIp] If true, the origin cannot be an IP address
   * @param {boolean} [options.noPort] If true, the origin cannot have a port number
   * @returns {boolean} whether the token looks like an origin.
   */
  looksLikeOrigin(
    token,
    { ignoreKnownDomains = false, noIp = false, noPort = false } = {}
  ) {
    if (!token.length) {
      return false;
    }
    let atIndex = token.indexOf("@");
    if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
      // We prefer handling it as an email rather than an origin with userinfo.
      return false;
    }

    let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
    let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
    let hasPort = this.REGEXP_HAS_PORT.test(hostPort);
    lazy.logger.debug("userinfo", userinfo);
    lazy.logger.debug("hostPort", hostPort);
    if (noPort && hasPort) {
      return false;
    }

    // A complete IPv4 or a (possibly partial) IPv6 is accepted right away,
    // unless the caller excluded IP addresses.
    if (
      this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
      this.REGEXP_HOSTPORT_IPV6.test(hostPort)
    ) {
      return !noIp;
    }

    // Check for invalid chars.
    if (
      this.REGEXP_LIKE_PROTOCOL.test(hostPort) ||
      this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) ||
      this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) ||
      (!this.REGEXP_SINGLE_WORD_HOST.test(hostPort) &&
        this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) &&
        this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
    ) {
      return false;
    }

    // If it looks like a single word host, check the known domains.
    if (
      !ignoreKnownDomains &&
      !userinfo &&
      !hasPort &&
      this.REGEXP_SINGLE_WORD_HOST.test(hostPort)
    ) {
      return Services.uriFixup.isDomainKnown(hostPort);
    }

    return true;
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {UrlbarQueryContext} queryContext
   *        The query context object to tokenize
   * @returns {UrlbarQueryContext} the same query context object with a new
   *          tokens property.
   */
  tokenize(queryContext) {
    lazy.logger.debug("Tokenizing search string", {
      searchString: queryContext.searchString,
    });
    // Nothing to tokenize when the trimmed search string is empty.
    if (!queryContext.trimmedSearchString) {
      queryContext.tokens = [];
      return queryContext;
    }
    let unfiltered = splitString(queryContext);
    let tokens = filterTokens(unfiltered);
    queryContext.tokens = tokens;
    return queryContext;
  },

  /**
   * Given a token, tells if it's a restriction token.
   *
   * @param {object} token
   *   The token to check.
   * @returns {boolean} Whether the token is a restriction character. A falsy
   *   token is returned as-is, so callers should only rely on truthiness.
   */
  isRestrictionToken(token) {
    // NOTE(review): the range ends at RESTRICT_URL (10), so tokens of type
    // RESTRICT_ACTION (11) are not reported as restriction tokens. Confirm
    // whether that is intended before widening the range.
    return (
      token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};

// Lookup from each restriction character in UrlbarTokenizer.RESTRICT to the
// UrlbarTokenizer.TYPE value named "RESTRICT_<same key>".
const CHAR_TO_TYPE_MAP = new Map();
for (const [restrictName, restrictChar] of Object.entries(
  UrlbarTokenizer.RESTRICT
)) {
  CHAR_TO_TYPE_MAP.set(
    restrictChar,
    UrlbarTokenizer.TYPE[`RESTRICT_${restrictName}`]
  );
}

/**
 * Given a queryContext object, splits its searchString into string tokens.
 *
 * @param {UrlbarQueryContext} queryContext
 *        The query context object to tokenize.
 * @param {string} queryContext.searchString
 *        The search string to split.
 * @param {object} queryContext.searchMode
 *        A search mode object. When truthy, a leading restriction char is not
 *        split off the first token.
 * @returns {Array} An array of string tokens.
 */
function splitString({ searchString, searchMode }) {
  // The first step is splitting on unicode whitespaces. We ignore whitespaces
  // if the search string starts with "data:", to better support Web developers
  // and compatibility with other browsers.
  let trimmed = searchString.trim();
  let tokens;
  if (trimmed.startsWith("data:")) {
    tokens = [trimmed];
  } else if (trimmed.length < 500) {
    tokens = trimmed.split(UrlbarTokenizer.REGEXP_SPACES);
  } else {
    // If the string is very long, tokenizing all of it would be expensive. So
    // we only tokenize a part of it, then let the last token become a
    // catch-all.
    tokens = trimmed.substring(0, 500).split(UrlbarTokenizer.REGEXP_SPACES);
    tokens[tokens.length - 1] += trimmed.substring(500);
  }

  if (!tokens.length) {
    return tokens;
  }

  // If there is no separate restriction token, it's possible we have to split
  // a token, if it's the first one and it includes a leading restriction char
  // or it's the last one and it includes a trailing restriction char.
  // This allows to not require the user to add artificial whitespaces to
  // enforce restrictions, for example typing questions would restrict to
  // search results.
  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));

  const firstToken = tokens[0];
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(firstToken) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);

  if (hasRestrictionToken || isFirstTokenAKeyword) {
    return tokens;
  }

  // Check for an unambiguous restriction char at the beginning of the first
  // token, or at the end of the last token. We only count trailing restriction
  // chars if they are the search restriction char, which is "?". This is to
  // allow for a typed question to yield only search results.
  if (
    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    !UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
    !searchMode
  ) {
    // Split the leading restriction char into its own token.
    tokens[0] = firstToken.substring(1);
    tokens.splice(0, 0, firstToken[0]);
    return tokens;
  }

  const lastIndex = tokens.length - 1;
  const lastToken = tokens[lastIndex];
  if (
    lastToken[lastToken.length - 1] == UrlbarTokenizer.RESTRICT.SEARCH &&
    !UrlbarTokenizer.looksLikeUrl(lastToken, { requirePath: true })
  ) {
    // Split the trailing "?" into its own token, unless the last token looks
    // like a url with a path (where "?" would start a query string).
    tokens[lastIndex] = lastToken.substring(0, lastToken.length - 1);
    tokens.push(lastToken[lastToken.length - 1]);
  }

  return tokens;
}

/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * to token objects with a type.
 *
 * @param {Array} tokens
 *        An array of strings, representing search tokens.
 * @returns {Array} An array of token objects, each in the shape
 *          { value, lowerCaseValue, type }.
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of restriction characters
 *       conflict, the most external ones win. Leading ones win over trailing
 *       ones. Discarded restriction characters are considered text.
 */
function filterTokens(tokens) {
  let filtered = [];
  let restrictions = [];
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(tokens[0]) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]);

  for (let i = 0; i < tokens.length; ++i) {
    let token = tokens[i];
    let tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: UrlbarTokenizer.TYPE.TEXT,
    };
    // For privacy reasons, we don't want to send a data (or other kind of) URI
    // to a search engine. So we want to parse any single long token below.
    // A long token that is not alone is kept as text and ends the tokenization.
    if (tokens.length > 1 && token.length > 500) {
      filtered.push(tokenObj);
      break;
    }

    // When the first token is a keyword, every token is kept as plain text.
    if (isFirstTokenAKeyword) {
      filtered.push(tokenObj);
      continue;
    }

    let restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      // Only record the candidate here; the token stays TEXT unless the
      // restriction is assigned below.
      restrictions.push({ index: i, type: restrictionType });
    } else if (UrlbarTokenizer.looksLikeOrigin(token)) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
    } else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
    }
    filtered.push(tokenObj);
  }

  // Handle restriction characters.
  if (restrictions.length) {
    // We can apply two kind of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kind of restrictions can be combined, but we
    // can only have one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;

    // Applies the restriction to its token if the slot for its kind is still
    // free. Returns whether the restriction was assigned.
    const assignRestriction = r => {
      if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
        if (
          [
            UrlbarTokenizer.TYPE.RESTRICT_TITLE,
            UrlbarTokenizer.TYPE.RESTRICT_URL,
          ].includes(r.type)
        ) {
          if (!matchingRestrictionFound) {
            matchingRestrictionFound = true;
            filtered[r.index].type = r.type;
            return true;
          }
        } else if (!typeRestrictionFound) {
          typeRestrictionFound = true;
          filtered[r.index].type = r.type;
          return true;
        }
      }
      return false;
    };

    // Look at the first token.
    let found = assignRestriction(restrictions.find(r => r.index == 0));
    if (found) {
      // If the first token was assigned, look at the next one.
      assignRestriction(restrictions.find(r => r.index == 1));
    }
    // Then look at the last token.
    let lastIndex = tokens.length - 1;
    found = assignRestriction(restrictions.find(r => r.index == lastIndex));
    if (found) {
      // If the last token was assigned, look at the previous one.
      assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
    }
  }

  lazy.logger.info("Filtered Tokens", filtered);
  return filtered;
}