summaryrefslogtreecommitdiffstats
path: root/browser/components/urlbar/UrlbarTokenizer.sys.mjs
diff options
context:
space:
mode:
Diffstat (limited to 'browser/components/urlbar/UrlbarTokenizer.sys.mjs')
-rw-r--r--browser/components/urlbar/UrlbarTokenizer.sys.mjs415
1 files changed, 415 insertions, 0 deletions
diff --git a/browser/components/urlbar/UrlbarTokenizer.sys.mjs b/browser/components/urlbar/UrlbarTokenizer.sys.mjs
new file mode 100644
index 0000000000..ee86d74bda
--- /dev/null
+++ b/browser/components/urlbar/UrlbarTokenizer.sys.mjs
@@ -0,0 +1,415 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/**
+ * This module exports a tokenizer to be used by the urlbar model.
+ * Emitted tokens are objects in the shape { type, value }, where type is one
+ * of UrlbarTokenizer.TYPE.
+ */
+
+import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";
+
+const lazy = {};
+ChromeUtils.defineESModuleGetters(lazy, {
+ UrlbarUtils: "resource:///modules/UrlbarUtils.sys.mjs",
+});
+
+XPCOMUtils.defineLazyGetter(lazy, "logger", () =>
+ lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
+);
+
+export var UrlbarTokenizer = {
+ // Regex matching on whitespaces.
+ REGEXP_SPACES: /\s+/,
+ REGEXP_SPACES_START: /^\s+/,
+
+ // Regex used to guess url-like strings.
+ // These are not expected to be 100% correct, we accept some user mistypes
+ // and we're unlikely to be able to cover 100% of the cases.
+ REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
+ REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
+ REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
+ REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
+ REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
+ // This accepts partial IPv4.
+ REGEXP_HOSTPORT_INVALID_IP: /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
+ // This only accepts complete IPv4.
+ REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3,}\d{1,3}(:\d+)?$/,
+ // This accepts partial IPv6.
+ REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
+ REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+/=?^`{|}~.-]+@[\[\]A-Z0-9.-]+$/i,
+ REGEXP_HAS_PORT: /:\d+$/,
+ // Regex matching a percent encoded char at the beginning of a string.
+ REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
+ // Regex matching scheme and colon, plus, if present, two slashes.
+ REGEXP_PREFIX: /^[a-z-]+:(?:\/){0,2}/i,
+
+ TYPE: {
+ TEXT: 1,
+ POSSIBLE_ORIGIN: 2, // It may be an ip, a domain, but even just a single word used as host.
+ POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
+ RESTRICT_HISTORY: 4,
+ RESTRICT_BOOKMARK: 5,
+ RESTRICT_TAG: 6,
+ RESTRICT_OPENPAGE: 7,
+ RESTRICT_SEARCH: 8,
+ RESTRICT_TITLE: 9,
+ RESTRICT_URL: 10,
+ RESTRICT_ACTION: 11,
+ },
+
+ // The special characters below can be typed into the urlbar to restrict
+ // the search to a certain category, like history, bookmarks or open pages; or
+ // to force a match on just the title or url.
+ // These restriction characters can be typed alone, or at word boundaries,
+ // provided their meaning cannot be confused, for example # could be present
+ // in a valid url, and thus it should not be interpreted as a restriction.
+ RESTRICT: {
+ HISTORY: "^",
+ BOOKMARK: "*",
+ TAG: "+",
+ OPENPAGE: "%",
+ SEARCH: "?",
+ TITLE: "#",
+ URL: "$",
+ ACTION: ">",
+ },
+
+ // The keys of characters in RESTRICT that will enter search mode.
+ get SEARCH_MODE_RESTRICT() {
+ return new Set([
+ this.RESTRICT.HISTORY,
+ this.RESTRICT.BOOKMARK,
+ this.RESTRICT.OPENPAGE,
+ this.RESTRICT.SEARCH,
+ this.RESTRICT.ACTION,
+ ]);
+ },
+
+ /**
+ * Returns whether the passed in token looks like a URL.
+ * This is based on guessing and heuristics, that means if this function
+ * returns false, it's surely not a URL, if it returns true, the result must
+ * still be verified through URIFixup.
+ *
+ * @param {string} token
+ * The string token to verify
+ * @param {boolean} [requirePath] The url must have a path
+ * @returns {boolean} whether the token looks like a URL.
+ */
+ looksLikeUrl(token, { requirePath = false } = {}) {
+ if (token.length < 2) {
+ return false;
+ }
+ // Ignore spaces and require path for the data: protocol.
+ if (token.startsWith("data:")) {
+ return token.length > 5;
+ }
+ if (this.REGEXP_SPACES.test(token)) {
+ return false;
+ }
+ // If it starts with something that looks like a protocol, it's likely a url.
+ if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
+ return true;
+ }
+ // Guess path and prePath. At this point we should be analyzing strings not
+ // having a protocol.
+ let slashIndex = token.indexOf("/");
+ let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
+ if (!this.looksLikeOrigin(prePath, { ignoreKnownDomains: true })) {
+ return false;
+ }
+
+ let path = slashIndex != -1 ? token.slice(slashIndex) : "";
+ lazy.logger.debug("path", path);
+ if (requirePath && !path) {
+ return false;
+ }
+ // If there are both path and userinfo, it's likely a url.
+ let atIndex = prePath.indexOf("@");
+ let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
+ if (path.length && userinfo.length) {
+ return true;
+ }
+
+ // If the first character after the slash in the path is a letter, then the
+ // token may be an "abc/def" url.
+ if (/^\/[a-z]/i.test(path)) {
+ return true;
+ }
+ // If the path contains special chars, it is likely a url.
+ if (["%", "?", "#"].some(c => path.includes(c))) {
+ return true;
+ }
+
+ // The above looksLikeOrigin call told us the prePath looks like an origin,
+ // now we go into details checking some common origins.
+ let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
+ if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
+ return true;
+ }
+ // ipv6 is very complex to support, just check for a few chars.
+ if (
+ this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
+ ["[", "]", ":"].some(c => hostPort.includes(c))
+ ) {
+ return true;
+ }
+ if (Services.uriFixup.isDomainKnown(hostPort)) {
+ return true;
+ }
+ return false;
+ },
+
+ /**
+ * Returns whether the passed in token looks like an origin.
+ * This is based on guessing and heuristics, that means if this function
+ * returns false, it's surely not an origin, if it returns true, the result
+ * must still be verified through URIFixup.
+ *
+ * @param {string} token
+ * The string token to verify
+ * @param {object} options Options object
+ * @param {boolean} [options.ignoreKnownDomains] If true, the origin doesn't have to be
+ * in the known domain list
+ * @param {boolean} [options.noIp] If true, the origin cannot be an IP address
+ * @param {boolean} [options.noPort] If true, the origin cannot have a port number
+ * @returns {boolean} whether the token looks like an origin.
+ */
+ looksLikeOrigin(
+ token,
+ { ignoreKnownDomains = false, noIp = false, noPort = false } = {}
+ ) {
+ if (!token.length) {
+ return false;
+ }
+ let atIndex = token.indexOf("@");
+ if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
+ // We prefer handling it as an email rather than an origin with userinfo.
+ return false;
+ }
+ let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
+ let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
+ let hasPort = this.REGEXP_HAS_PORT.test(hostPort);
+ lazy.logger.debug("userinfo", userinfo);
+ lazy.logger.debug("hostPort", hostPort);
+ if (noPort && hasPort) {
+ return false;
+ }
+ if (
+ this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
+ this.REGEXP_HOSTPORT_IPV6.test(hostPort)
+ ) {
+ return !noIp;
+ }
+
+ // Check for invalid chars.
+ if (
+ this.REGEXP_LIKE_PROTOCOL.test(hostPort) ||
+ this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) ||
+ this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) ||
+ (!this.REGEXP_SINGLE_WORD_HOST.test(hostPort) &&
+ this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) &&
+ this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
+ ) {
+ return false;
+ }
+
+ // If it looks like a single word host, check the known domains.
+ if (
+ !ignoreKnownDomains &&
+ !userinfo &&
+ !hasPort &&
+ this.REGEXP_SINGLE_WORD_HOST.test(hostPort)
+ ) {
+ return Services.uriFixup.isDomainKnown(hostPort);
+ }
+
+ return true;
+ },
+
+ /**
+ * Tokenizes the searchString from a UrlbarQueryContext.
+ *
+ * @param {UrlbarQueryContext} queryContext
+ * The query context object to tokenize
+ * @returns {UrlbarQueryContext} the same query context object with a new
+ * tokens property.
+ */
+ tokenize(queryContext) {
+ lazy.logger.debug(
+ "Tokenizing search string",
+ JSON.stringify(queryContext.searchString)
+ );
+ if (!queryContext.trimmedSearchString) {
+ queryContext.tokens = [];
+ return queryContext;
+ }
+ let unfiltered = splitString(queryContext.searchString);
+ let tokens = filterTokens(unfiltered);
+ queryContext.tokens = tokens;
+ return queryContext;
+ },
+
+ /**
+ * Given a token, tells if it's a restriction token.
+ *
+ * @param {object} token
+ * The token to check.
+ * @returns {boolean} Whether the token is a restriction character.
+ */
+ isRestrictionToken(token) {
+ return (
+ token &&
+ token.type >= this.TYPE.RESTRICT_HISTORY &&
+ token.type <= this.TYPE.RESTRICT_URL
+ );
+ },
+};
+
+const CHAR_TO_TYPE_MAP = new Map(
+ Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
+ char,
+ UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
+ ])
+);
+
+/**
+ * Given a search string, splits it into string tokens.
+ *
+ * @param {string} searchString
+ * The search string to split
+ * @returns {Array} An array of string tokens.
+ */
+function splitString(searchString) {
+ // The first step is splitting on unicode whitespaces. We ignore whitespaces
+ // if the search string starts with "data:", to better support Web developers
+ // and compatiblity with other browsers.
+ let trimmed = searchString.trim();
+ let tokens = trimmed.startsWith("data:")
+ ? [trimmed]
+ : trimmed.split(UrlbarTokenizer.REGEXP_SPACES);
+
+ if (!tokens.length) {
+ return tokens;
+ }
+
+ // If there is no separate restriction token, it's possible we have to split
+ // a token, if it's the first one and it includes a leading restriction char
+ // or it's the last one and it includes a trailing restriction char.
+ // This allows to not require the user to add artificial whitespaces to
+ // enforce restrictions, for example typing questions would restrict to
+ // search results.
+ const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));
+ if (hasRestrictionToken) {
+ return tokens;
+ }
+
+ // Check for an unambiguous restriction char at the beginning of the first
+ // token, or at the end of the last token. We only count trailing restriction
+ // chars if they are the search restriction char, which is "?". This is to
+ // allow for a typed question to yield only search results.
+ const firstToken = tokens[0];
+ if (
+ CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
+ !UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(firstToken)
+ ) {
+ tokens[0] = firstToken.substring(1);
+ tokens.splice(0, 0, firstToken[0]);
+ return tokens;
+ }
+
+ const lastIndex = tokens.length - 1;
+ const lastToken = tokens[lastIndex];
+ if (
+ lastToken[lastToken.length - 1] == UrlbarTokenizer.RESTRICT.SEARCH &&
+ !UrlbarTokenizer.looksLikeUrl(lastToken, { requirePath: true })
+ ) {
+ tokens[lastIndex] = lastToken.substring(0, lastToken.length - 1);
+ tokens.push(lastToken[lastToken.length - 1]);
+ }
+
+ return tokens;
+}
+
+/**
+ * Given an array of unfiltered tokens, this function filters them and converts
+ * to token objects with a type.
+ *
+ * @param {Array} tokens
+ * An array of strings, representing search tokens.
+ * @returns {Array} An array of token objects.
+ * Note: restriction characters are only considered if they appear at the start
+ * or at the end of the tokens list. In case of restriction characters
+ * conflict, the most external ones win. Leading ones win over trailing
+ * ones. Discarded restriction characters are considered text.
+ */
+function filterTokens(tokens) {
+ let filtered = [];
+ let restrictions = [];
+ for (let i = 0; i < tokens.length; ++i) {
+ let token = tokens[i];
+ let tokenObj = {
+ value: token,
+ lowerCaseValue: token.toLocaleLowerCase(),
+ type: UrlbarTokenizer.TYPE.TEXT,
+ };
+ let restrictionType = CHAR_TO_TYPE_MAP.get(token);
+ if (restrictionType) {
+ restrictions.push({ index: i, type: restrictionType });
+ } else if (UrlbarTokenizer.looksLikeOrigin(token)) {
+ tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
+ } else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
+ tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
+ }
+ filtered.push(tokenObj);
+ }
+
+ // Handle restriction characters.
+ if (restrictions.length) {
+ // We can apply two kind of restrictions: type (bookmark, search, ...) and
+ // matching (url, title). These kind of restrictions can be combined, but we
+ // can only have one restriction per kind.
+ let matchingRestrictionFound = false;
+ let typeRestrictionFound = false;
+ function assignRestriction(r) {
+ if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
+ if (
+ [
+ UrlbarTokenizer.TYPE.RESTRICT_TITLE,
+ UrlbarTokenizer.TYPE.RESTRICT_URL,
+ ].includes(r.type)
+ ) {
+ if (!matchingRestrictionFound) {
+ matchingRestrictionFound = true;
+ filtered[r.index].type = r.type;
+ return true;
+ }
+ } else if (!typeRestrictionFound) {
+ typeRestrictionFound = true;
+ filtered[r.index].type = r.type;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Look at the first token.
+ let found = assignRestriction(restrictions.find(r => r.index == 0));
+ if (found) {
+ // If the first token was assigned, look at the next one.
+ assignRestriction(restrictions.find(r => r.index == 1));
+ }
+ // Then look at the last token.
+ let lastIndex = tokens.length - 1;
+ found = assignRestriction(restrictions.find(r => r.index == lastIndex));
+ if (found) {
+ // If the last token was assigned, look at the previous one.
+ assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
+ }
+ }
+
+ lazy.logger.info("Filtered Tokens", filtered);
+ return filtered;
+}