path: root/browser/components/urlbar/UrlbarTokenizer.sys.mjs
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This module exports a tokenizer to be used by the urlbar model.
 * Emitted tokens are objects in the shape { type, value }, where type is one
 * of UrlbarTokenizer.TYPE.
 */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  UrlbarUtils: "resource:///modules/UrlbarUtils.sys.mjs",
});

XPCOMUtils.defineLazyGetter(lazy, "logger", () =>
  lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
);

export var UrlbarTokenizer = {
  // Regexes matching whitespace.
  REGEXP_SPACES: /\s+/,
  REGEXP_SPACES_START: /^\s+/,

  // Regexes used to guess url-like strings.
  // These are not expected to be 100% correct; we accept some user typos and
  // are unlikely to cover 100% of the cases.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
  REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
  REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
  REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
  // This accepts partial IPv4.
  REGEXP_HOSTPORT_INVALID_IP:
    /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
  // This only accepts complete IPv4.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3,}\d{1,3}(:\d+)?$/,
  // This accepts partial IPv6.
  REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
  REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+/=?^`{|}~.-]+@[\[\]A-Z0-9.-]+$/i,
  REGEXP_HAS_PORT: /:\d+$/,
  // Regex matching a percent encoded char at the beginning of a string.
  REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
  // Regex matching scheme and colon, plus, if present, two slashes.
  REGEXP_PREFIX: /^[a-z-]+:(?:\/){0,2}/i,
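
  // For illustration (examples are mine, not from the original file):
  // REGEXP_LIKE_PROTOCOL matches "http://" and "about:" but not "//host";
  // REGEXP_PREFIX matches "http://", "moz-extension://", and "mailto:".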

  TYPE: {
    TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an IP, a domain, or even just a single word used as a host.
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages,
  // or to force a match on just the title or url.
  // These restriction characters can be typed alone or at word boundaries,
  // provided their meaning cannot be confused: for example, # could be present
  // in a valid url, and thus it should not be interpreted as a restriction.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },
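
  // For illustration (examples are mine, not from the original file):
  // typing "^ firefox" restricts matches to history, while "firefox #"
  // restricts matching to page titles.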

  // The subset of RESTRICT characters that will enter search mode.
  get SEARCH_MODE_RESTRICT() {
    return new Set([
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
      this.RESTRICT.ACTION,
    ]);
  },

  /**
   * Returns whether the passed-in token looks like a URL.
   * This is based on guessing and heuristics: if this function returns false,
   * the token is surely not a URL; if it returns true, the result must still
   * be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @param {object} [options] Options object
   * @param {boolean} [options.requirePath] If true, the url must have a path
   * @returns {boolean} whether the token looks like a URL.
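   *
   * @example
   * // Illustrative sketch (examples are mine, not from the original file);
   * // outcomes assume the default options.
   * UrlbarTokenizer.looksLikeUrl("about:config"); // true, protocol-like prefix
   * UrlbarTokenizer.looksLikeUrl("127.0.0.1/test"); // true, IPv4 origin plus path
   * UrlbarTokenizer.looksLikeUrl("two words"); // false, contains whitespace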
   */
  looksLikeUrl(token, { requirePath = false } = {}) {
    if (token.length < 2) {
      return false;
    }
    // Ignore spaces and require path for the data: protocol.
    if (token.startsWith("data:")) {
      return token.length > 5;
    }
    if (this.REGEXP_SPACES.test(token)) {
      return false;
    }
    // If it starts with something that looks like a protocol, it's likely a url.
    if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
      return true;
    }
    // Guess path and prePath. At this point we should be analyzing strings
    // that don't have a protocol.
    let slashIndex = token.indexOf("/");
    let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
    if (!this.looksLikeOrigin(prePath, { ignoreKnownDomains: true })) {
      return false;
    }

    let path = slashIndex != -1 ? token.slice(slashIndex) : "";
    lazy.logger.debug("path", path);
    if (requirePath && !path) {
      return false;
    }
    // If there are both path and userinfo, it's likely a url.
    let atIndex = prePath.indexOf("@");
    let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
    if (path.length && userinfo.length) {
      return true;
    }

    // If the first character after the slash in the path is a letter, then the
    // token may be an "abc/def" url.
    if (/^\/[a-z]/i.test(path)) {
      return true;
    }
    // If the path contains special chars, it is likely a url.
    if (["%", "?", "#"].some(c => path.includes(c))) {
      return true;
    }

    // The above looksLikeOrigin call told us the prePath looks like an origin;
    // now we check some common origin patterns in more detail.
    let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
    if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
      return true;
    }
    // IPv6 is very complex to support; just check for a few chars.
    if (
      this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
      ["[", "]", ":"].some(c => hostPort.includes(c))
    ) {
      return true;
    }
    if (Services.uriFixup.isDomainKnown(hostPort)) {
      return true;
    }
    return false;
  },

  /**
   * Returns whether the passed-in token looks like an origin.
   * This is based on guessing and heuristics: if this function returns false,
   * the token is surely not an origin; if it returns true, the result must
   * still be verified through URIFixup.
   *
   * @param {string} token
   *        The string token to verify
   * @param {object} options Options object
   * @param {boolean} [options.ignoreKnownDomains] If true, the origin doesn't have to be
   *        in the known domain list
   * @param {boolean} [options.noIp] If true, the origin cannot be an IP address
   * @param {boolean} [options.noPort] If true, the origin cannot have a port number
   * @returns {boolean} whether the token looks like an origin.
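   *
   * @example
   * // Illustrative sketch (examples are mine, not from the original file):
   * UrlbarTokenizer.looksLikeOrigin("mozilla.org"); // true
   * UrlbarTokenizer.looksLikeOrigin("[::1]"); // true, bracketed IPv6
   * UrlbarTokenizer.looksLikeOrigin("127.0.0.1:8080", { noPort: true }); // false, has a port
   * UrlbarTokenizer.looksLikeOrigin("user@example.com"); // false, likely an email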
   */
  looksLikeOrigin(
    token,
    { ignoreKnownDomains = false, noIp = false, noPort = false } = {}
  ) {
    if (!token.length) {
      return false;
    }
    let atIndex = token.indexOf("@");
    if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
      // We prefer handling it as an email rather than an origin with userinfo.
      return false;
    }
    let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
    let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
    let hasPort = this.REGEXP_HAS_PORT.test(hostPort);
    lazy.logger.debug("userinfo", userinfo);
    lazy.logger.debug("hostPort", hostPort);
    if (noPort && hasPort) {
      return false;
    }
    if (
      this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
      this.REGEXP_HOSTPORT_IPV6.test(hostPort)
    ) {
      return !noIp;
    }

    // Check for invalid chars.
    if (
      this.REGEXP_LIKE_PROTOCOL.test(hostPort) ||
      this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) ||
      this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) ||
      (!this.REGEXP_SINGLE_WORD_HOST.test(hostPort) &&
        this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) &&
        this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
    ) {
      return false;
    }

    // If it looks like a single word host, check the known domains.
    if (
      !ignoreKnownDomains &&
      !userinfo &&
      !hasPort &&
      this.REGEXP_SINGLE_WORD_HOST.test(hostPort)
    ) {
      return Services.uriFixup.isDomainKnown(hostPort);
    }

    return true;
  },

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {UrlbarQueryContext} queryContext
   *        The query context object to tokenize
   * @returns {UrlbarQueryContext} the same query context object with a new
   *          tokens property.
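   *
   * @example
   * // Illustrative sketch (not from the original file): with a searchString
   * // of "? two words", the resulting tokens would be (lowerCaseValue
   * // omitted for brevity):
   * //   { value: "?",     type: UrlbarTokenizer.TYPE.RESTRICT_SEARCH }
   * //   { value: "two",   type: UrlbarTokenizer.TYPE.TEXT }
   * //   { value: "words", type: UrlbarTokenizer.TYPE.TEXT }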
   */
  tokenize(queryContext) {
    lazy.logger.debug(
      "Tokenizing search string",
      JSON.stringify(queryContext.searchString)
    );
    if (!queryContext.trimmedSearchString) {
      queryContext.tokens = [];
      return queryContext;
    }
    let unfiltered = splitString(queryContext.searchString);
    let tokens = filterTokens(unfiltered);
    queryContext.tokens = tokens;
    return queryContext;
  },

  /**
   * Given a token, tells whether it's a restriction token.
   *
   * @param {object} token
   *   The token to check.
   * @returns {boolean} Whether the token is a restriction character.
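   *
   * @example
   * // Illustrative sketch (not from the original file):
   * UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.RESTRICT_SEARCH }); // true
   * UrlbarTokenizer.isRestrictionToken({ type: UrlbarTokenizer.TYPE.TEXT }); // false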
   */
  isRestrictionToken(token) {
    // This relies on the RESTRICT_* types being contiguous in TYPE;
    // RESTRICT_ACTION is the last of them, so it bounds the range.
    return (
      token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_ACTION
    );
  },
};

const CHAR_TO_TYPE_MAP = new Map(
  Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
    char,
    UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
  ])
);
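
// For illustration (not from the original file): the map above yields
// "^" => UrlbarTokenizer.TYPE.RESTRICT_HISTORY,
// "?" => UrlbarTokenizer.TYPE.RESTRICT_SEARCH, and so on.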

/**
 * Given a search string, splits it into string tokens.
 *
 * @param {string} searchString
 *        The search string to split
 * @returns {Array} An array of string tokens.
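 *
 * @example
 * // Illustrative sketch (examples are mine, not from the original file):
 * splitString("^ firefox"); // ["^", "firefox"], already separate
 * splitString("?firefox"); // ["?", "firefox"], leading restriction char split off
 * splitString("firefox?"); // ["firefox", "?"], trailing search char split off
 * splitString("mozilla.org/?"); // ["mozilla.org/?"], kept intact since it looks like a URL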
 */
function splitString(searchString) {
  // The first step is splitting on unicode whitespace. We ignore whitespace
  // if the search string starts with "data:", to better support web developers
  // and compatibility with other browsers.
  let trimmed = searchString.trim();
  let tokens = trimmed.startsWith("data:")
    ? [trimmed]
    : trimmed.split(UrlbarTokenizer.REGEXP_SPACES);

  if (!tokens.length) {
    return tokens;
  }

  // If there is no separate restriction token, we may still have to split a
  // token: the first one, if it includes a leading restriction char, or the
  // last one, if it includes a trailing restriction char.
  // This avoids requiring the user to add artificial whitespace to enforce
  // restrictions; for example, typing a question ending in "?" restricts to
  // search results.
  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
  if (hasRestrictionToken) {
    return tokens;
  }

  // Check for an unambiguous restriction char at the beginning of the first
  // token, or at the end of the last token. We only count trailing restriction
  // chars if they are the search restriction char, which is "?". This is to
  // allow for a typed question to yield only search results.
  const firstToken = tokens[0];
  if (
    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    !UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(firstToken)
  ) {
    tokens[0] = firstToken.substring(1);
    tokens.splice(0, 0, firstToken[0]);
    return tokens;
  }

  const lastIndex = tokens.length - 1;
  const lastToken = tokens[lastIndex];
  if (
    lastToken[lastToken.length - 1] == UrlbarTokenizer.RESTRICT.SEARCH &&
    !UrlbarTokenizer.looksLikeUrl(lastToken, { requirePath: true })
  ) {
    tokens[lastIndex] = lastToken.substring(0, lastToken.length - 1);
    tokens.push(lastToken[lastToken.length - 1]);
  }

  return tokens;
}

/**
 * Given an array of unfiltered tokens, this function filters them and converts
 * them to token objects with a type.
 *
 * @param {Array} tokens
 *        An array of strings, representing search tokens.
 * @returns {Array} An array of token objects.
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. When restriction characters
 *       conflict, the outermost ones win, and leading ones win over trailing
 *       ones. Discarded restriction characters are treated as text.
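 *
 * @example
 * // Illustrative sketch (not from the original file): for the input
 * // ["*", "foo", "#"], "*" (a type restriction) and "#" (a matching
 * // restriction) can both apply, yielding (lowerCaseValue omitted):
 * //   { value: "*",   type: UrlbarTokenizer.TYPE.RESTRICT_BOOKMARK }
 * //   { value: "foo", type: UrlbarTokenizer.TYPE.TEXT }
 * //   { value: "#",   type: UrlbarTokenizer.TYPE.RESTRICT_TITLE }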
 */
function filterTokens(tokens) {
  let filtered = [];
  let restrictions = [];
  for (let i = 0; i < tokens.length; ++i) {
    let token = tokens[i];
    let tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: UrlbarTokenizer.TYPE.TEXT,
    };
    let restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      restrictions.push({ index: i, type: restrictionType });
    } else if (UrlbarTokenizer.looksLikeOrigin(token)) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
    } else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
    }
    filtered.push(tokenObj);
  }

  // Handle restriction characters.
  if (restrictions.length) {
    // We can apply two kinds of restrictions: type (bookmark, search, ...) and
    // matching (url, title). These kinds can be combined, but we can only have
    // one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;
    function assignRestriction(r) {
      if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
        if (
          [
            UrlbarTokenizer.TYPE.RESTRICT_TITLE,
            UrlbarTokenizer.TYPE.RESTRICT_URL,
          ].includes(r.type)
        ) {
          if (!matchingRestrictionFound) {
            matchingRestrictionFound = true;
            filtered[r.index].type = r.type;
            return true;
          }
        } else if (!typeRestrictionFound) {
          typeRestrictionFound = true;
          filtered[r.index].type = r.type;
          return true;
        }
      }
      return false;
    }

    // Look at the first token.
    let found = assignRestriction(restrictions.find(r => r.index == 0));
    if (found) {
      // If the first token was assigned, look at the next one.
      assignRestriction(restrictions.find(r => r.index == 1));
    }
    // Then look at the last token.
    let lastIndex = tokens.length - 1;
    found = assignRestriction(restrictions.find(r => r.index == lastIndex));
    if (found) {
      // If the last token was assigned, look at the previous one.
      assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
    }
  }

  lazy.logger.info("Filtered Tokens", filtered);
  return filtered;
}