/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/**
 * This module exports a tokenizer to be used by the urlbar model.
 * Emitted tokens are objects in the shape { type, value, lowerCaseValue },
 * where type is one of UrlbarTokenizer.TYPE.
 */
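// For illustration, a plain text token is shaped like:
//   { value: "Firefox", lowerCaseValue: "firefox", type: UrlbarTokenizer.TYPE.TEXT }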

const lazy = {};
ChromeUtils.defineESModuleGetters(lazy, {
  UrlbarPrefs: "resource:///modules/UrlbarPrefs.sys.mjs",
  UrlbarUtils: "resource:///modules/UrlbarUtils.sys.mjs",
  PlacesUtils: "resource://gre/modules/PlacesUtils.sys.mjs",
});

ChromeUtils.defineLazyGetter(lazy, "logger", () =>
  lazy.UrlbarUtils.getLogger({ prefix: "Tokenizer" })
);

ChromeUtils.defineLazyGetter(lazy, "gFluentStrings", function () {
  return new Localization(["browser/browser.ftl"]);
});

/**
 * This Map stores key-value pairs where each key is a restrict token
 * and each value is an array containing the localized keyword and the
 * English keyword.
 *
 * For example, "*" maps to ["Bookmarks"] for English locales and to
 * ["Marcadores", "Bookmarks"] for es-ES.
 *
 * @type {Map<string, string[]>}
 */
let tokenToKeywords = new Map();

export var UrlbarTokenizer = {
  // Regexes matching whitespace.
  REGEXP_SPACES: /\s+/,
  REGEXP_SPACES_START: /^\s+/,

  // Regexes used to guess url-like strings.
  // They are not expected to be 100% correct: we accept some user mistypes,
  // and we're unlikely to cover every possible case.
  REGEXP_LIKE_PROTOCOL: /^[A-Z+.-]+:\/*(?!\/)/i,
  REGEXP_USERINFO_INVALID_CHARS: /[^\w.~%!$&'()*+,;=:-]/,
  REGEXP_HOSTPORT_INVALID_CHARS: /[^\[\]A-Z0-9.:-]/i,
  REGEXP_SINGLE_WORD_HOST: /^[^.:]+$/i,
  REGEXP_HOSTPORT_IP_LIKE: /^(?=(.*[.:].*){2})[a-f0-9\.\[\]:]+$/i,
  // This accepts partial IPv4 addresses.
  REGEXP_HOSTPORT_INVALID_IP:
    /\.{2,}|\d{5,}|\d{4,}(?![:\]])|^\.|^(\d+\.){4,}\d+$|^\d{4,}$/,
  // This only accepts complete IPv4 addresses.
  REGEXP_HOSTPORT_IPV4: /^(\d{1,3}\.){3,}\d{1,3}(:\d+)?$/,
  // This accepts partial IPv6 addresses.
  REGEXP_HOSTPORT_IPV6: /^\[([0-9a-f]{0,4}:){0,7}[0-9a-f]{0,4}\]?$/i,
  REGEXP_COMMON_EMAIL: /^[\w!#$%&'*+/=?^`{|}~.-]+@[\[\]A-Z0-9.-]+$/i,
  REGEXP_HAS_PORT: /:\d+$/,
  // Regex matching a percent encoded char at the beginning of a string.
  REGEXP_PERCENT_ENCODED_START: /^(%[0-9a-f]{2}){2,}/i,
  // Regex matching a scheme and colon, plus, if present, two slashes.
  REGEXP_PREFIX: /^[a-z-]+:(?:\/){0,2}/i,
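
  // For illustration, REGEXP_LIKE_PROTOCOL matches the start of "http://",
  // "about:" and "mailto:" (but not "//host"), while REGEXP_PREFIX captures
  // the "http://" in "http://example.com".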

  TYPE: {
    TEXT: 1,
    POSSIBLE_ORIGIN: 2, // It may be an IP address, a domain, or even just a single word used as a host.
    POSSIBLE_URL: 3, // Consumers should still check this with a fixup.
    RESTRICT_HISTORY: 4,
    RESTRICT_BOOKMARK: 5,
    RESTRICT_TAG: 6,
    RESTRICT_OPENPAGE: 7,
    RESTRICT_SEARCH: 8,
    RESTRICT_TITLE: 9,
    RESTRICT_URL: 10,
    RESTRICT_ACTION: 11,
  },

  // The special characters below can be typed into the urlbar to restrict
  // the search to a certain category, like history, bookmarks or open pages,
  // or to force a match on just the title or url.
  // These restriction characters can be typed alone, or at word boundaries,
  // provided their meaning cannot be confused: for example # could be present
  // in a valid url, and thus it should not be interpreted as a restriction
  // there.
  RESTRICT: {
    HISTORY: "^",
    BOOKMARK: "*",
    TAG: "+",
    OPENPAGE: "%",
    SEARCH: "?",
    TITLE: "#",
    URL: "$",
    ACTION: ">",
  },
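
  // For illustration, typing "* firefox" (or "firefox *") restricts matching
  // to bookmarks, while the "#" in a token like "https://example.com/#top" is
  // part of the url and is not treated as a title restriction.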

  // The restriction characters that will enter search mode.
  get SEARCH_MODE_RESTRICT() {
    const keys = [
      this.RESTRICT.HISTORY,
      this.RESTRICT.BOOKMARK,
      this.RESTRICT.OPENPAGE,
      this.RESTRICT.SEARCH,
    ];
    if (lazy.UrlbarPrefs.get("scotchBonnet.enableOverride")) {
      keys.push(this.RESTRICT.ACTION);
    }
    return new Set(keys);
  },

  async loadL10nRestrictKeywords() {
    let l10nKeywords = await lazy.gFluentStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}` };
      })
    );

    let englishSearchStrings = new Localization([
      "preview/enUS-searchFeatures.ftl",
    ]);

    let englishKeywords = await englishSearchStrings.formatValues(
      lazy.UrlbarUtils.LOCAL_SEARCH_MODES.map(mode => {
        let name = lazy.UrlbarUtils.getResultSourceName(mode.source);
        return { id: `urlbar-search-mode-${name}-en` };
      })
    );

    for (let { restrict } of lazy.UrlbarUtils.LOCAL_SEARCH_MODES) {
      let uniqueKeywords = [
        ...new Set([l10nKeywords.shift(), englishKeywords.shift()]),
      ];

      tokenToKeywords.set(restrict, uniqueKeywords);
    }
  },

  /**
   * Gets the cached localized restrict keywords, loading them first if they
   * are not cached yet.
   *
   * @returns {Promise<Map<string, string[]>>} A Map from each restrict token
   *   to its localized and English keywords.
   */
  async getL10nRestrictKeywords() {
    if (tokenToKeywords.size === 0) {
      await this.loadL10nRestrictKeywords();
    }

    return tokenToKeywords;
  },
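
  // Illustrative usage (the Map keys are the RESTRICT characters; actual
  // keyword strings depend on the active locale):
  //   let keywords = await UrlbarTokenizer.getL10nRestrictKeywords();
  //   keywords.get(UrlbarTokenizer.RESTRICT.BOOKMARK); // e.g. ["Bookmarks"]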

  /**
   * Returns whether the passed in token looks like a URL.
   * This is based on guessing and heuristics: if this function returns
   * false, the token is surely not a URL; if it returns true, the result
   * must still be verified through URIFixup.
   *
   * @param {string} token
   *   The string token to verify.
   * @param {object} [options]
   * @param {boolean} [options.requirePath] The url must have a path.
   * @returns {boolean} whether the token looks like a URL.
   */
  looksLikeUrl(token, { requirePath = false } = {}) {
    if (token.length < 2) {
      return false;
    }
    // Ignore spaces and require a path for the data: protocol.
    if (token.startsWith("data:")) {
      return token.length > 5;
    }
    if (this.REGEXP_SPACES.test(token)) {
      return false;
    }
    // If it starts with something that looks like a protocol, it's likely a url.
    if (this.REGEXP_LIKE_PROTOCOL.test(token)) {
      return true;
    }
    // Guess path and prePath. At this point we should be analyzing strings
    // not having a protocol.
    let slashIndex = token.indexOf("/");
    let prePath = slashIndex != -1 ? token.slice(0, slashIndex) : token;
    if (!this.looksLikeOrigin(prePath, { ignoreKnownDomains: true })) {
      return false;
    }

    let path = slashIndex != -1 ? token.slice(slashIndex) : "";
    lazy.logger.debug("path", path);
    if (requirePath && !path) {
      return false;
    }
    // If there are both a path and userinfo, it's likely a url.
    let atIndex = prePath.indexOf("@");
    let userinfo = atIndex != -1 ? prePath.slice(0, atIndex) : "";
    if (path.length && userinfo.length) {
      return true;
    }

    // If the first character after the slash in the path is a letter, then the
    // token may be an "abc/def" url.
    if (/^\/[a-z]/i.test(path)) {
      return true;
    }
    // If the path contains special chars, it is likely a url.
    if (["%", "?", "#"].some(c => path.includes(c))) {
      return true;
    }

    // The above looksLikeOrigin call told us the prePath looks like an origin,
    // now we go into details checking some common origins.
    let hostPort = atIndex != -1 ? prePath.slice(atIndex + 1) : prePath;
    if (this.REGEXP_HOSTPORT_IPV4.test(hostPort)) {
      return true;
    }
    // ipv6 is very complex to support, just check for a few chars.
    if (
      this.REGEXP_HOSTPORT_IPV6.test(hostPort) &&
      ["[", "]", ":"].some(c => hostPort.includes(c))
    ) {
      return true;
    }
    if (Services.uriFixup.isDomainKnown(hostPort)) {
      return true;
    }
    return false;
  },
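
  // For illustration: looksLikeUrl("mozilla.org/about") is true (the path
  // starts with a letter), looksLikeUrl("about:config") is true (it has a
  // protocol-like prefix), while looksLikeUrl("data:") is false because
  // nothing follows the scheme.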

  /**
   * Returns whether the passed in token looks like an origin.
   * This is based on guessing and heuristics: if this function returns
   * false, the token is surely not an origin; if it returns true, the result
   * must still be verified through URIFixup.
   *
   * @param {string} token
   *   The string token to verify.
   * @param {object} [options] Options object.
   * @param {boolean} [options.ignoreKnownDomains] If true, the origin doesn't
   *   have to be in the known domain list.
   * @param {boolean} [options.noIp] If true, the origin cannot be an IP address.
   * @param {boolean} [options.noPort] If true, the origin cannot have a port number.
   * @returns {boolean} whether the token looks like an origin.
   */
  looksLikeOrigin(
    token,
    { ignoreKnownDomains = false, noIp = false, noPort = false } = {}
  ) {
    if (!token.length) {
      return false;
    }
    let atIndex = token.indexOf("@");
    if (atIndex != -1 && this.REGEXP_COMMON_EMAIL.test(token)) {
      // We prefer handling it as an email rather than an origin with userinfo.
      return false;
    }
    let userinfo = atIndex != -1 ? token.slice(0, atIndex) : "";
    let hostPort = atIndex != -1 ? token.slice(atIndex + 1) : token;
    let hasPort = this.REGEXP_HAS_PORT.test(hostPort);
    lazy.logger.debug("userinfo", userinfo);
    lazy.logger.debug("hostPort", hostPort);
    if (noPort && hasPort) {
      return false;
    }
    if (
      this.REGEXP_HOSTPORT_IPV4.test(hostPort) ||
      this.REGEXP_HOSTPORT_IPV6.test(hostPort)
    ) {
      return !noIp;
    }

    // Check for invalid chars.
    if (
      this.REGEXP_LIKE_PROTOCOL.test(hostPort) ||
      this.REGEXP_USERINFO_INVALID_CHARS.test(userinfo) ||
      this.REGEXP_HOSTPORT_INVALID_CHARS.test(hostPort) ||
      (!this.REGEXP_SINGLE_WORD_HOST.test(hostPort) &&
        this.REGEXP_HOSTPORT_IP_LIKE.test(hostPort) &&
        this.REGEXP_HOSTPORT_INVALID_IP.test(hostPort))
    ) {
      return false;
    }

    // If it looks like a single word host, check the known domains.
    if (
      !ignoreKnownDomains &&
      !userinfo &&
      !hasPort &&
      this.REGEXP_SINGLE_WORD_HOST.test(hostPort)
    ) {
      return Services.uriFixup.isDomainKnown(hostPort);
    }

    return true;
  },
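
  // For illustration: looksLikeOrigin("127.0.0.1:8080") is true (a complete
  // IPv4 address with a port), but false with { noIp: true } or
  // { noPort: true }; looksLikeOrigin("user@example.com") is false because
  // the token is preferably handled as an email address.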

  /**
   * Tokenizes the searchString from a UrlbarQueryContext.
   *
   * @param {UrlbarQueryContext} queryContext
   *   The query context object to tokenize.
   * @returns {UrlbarQueryContext} the same query context object with a new
   *   tokens property.
   */
  tokenize(queryContext) {
    lazy.logger.debug("Tokenizing search string", {
      searchString: queryContext.searchString,
    });
    if (!queryContext.trimmedSearchString) {
      queryContext.tokens = [];
      return queryContext;
    }
    let unfiltered = splitString(queryContext);
    let tokens = filterTokens(unfiltered);
    queryContext.tokens = tokens;
    return queryContext;
  },
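
  // Illustrative usage, assuming a minimal query-context-like object (a real
  // UrlbarQueryContext carries more state):
  //   let ctx = { searchString: "* firefox", trimmedSearchString: "* firefox" };
  //   UrlbarTokenizer.tokenize(ctx);
  //   // ctx.tokens[0].type == UrlbarTokenizer.TYPE.RESTRICT_BOOKMARK
  //   // ctx.tokens[1].type is TEXT, unless "firefox" is a known domain.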

  /**
   * Given a token, tells if it's a restriction token.
   *
   * @param {object} token
   *   The token to check.
   * @returns {boolean} Whether the token is a restriction character.
   */
  isRestrictionToken(token) {
    return (
      token &&
      token.type >= this.TYPE.RESTRICT_HISTORY &&
      token.type <= this.TYPE.RESTRICT_URL
    );
  },
};
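
// Maps each restriction character to its token type; for illustration,
// "^" maps to UrlbarTokenizer.TYPE.RESTRICT_HISTORY.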
const CHAR_TO_TYPE_MAP = new Map(
  Object.entries(UrlbarTokenizer.RESTRICT).map(([type, char]) => [
    char,
    UrlbarTokenizer.TYPE[`RESTRICT_${type}`],
  ])
);

/**
 * Given a queryContext object, splits its searchString into string tokens.
 *
 * @param {UrlbarQueryContext} queryContext
 *   The query context object to tokenize.
 * @returns {string[]} An array of string tokens.
 */
function splitString({ searchString, searchMode }) {
  // The first step is splitting on unicode whitespaces. We ignore whitespaces
  // if the search string starts with "data:", to better support Web developers
  // and compatibility with other browsers.
  let trimmed = searchString.trim();
  let tokens;
  if (trimmed.startsWith("data:")) {
    tokens = [trimmed];
  } else if (trimmed.length < 500) {
    tokens = trimmed.split(UrlbarTokenizer.REGEXP_SPACES);
  } else {
    // If the string is very long, tokenizing all of it would be expensive. So
    // we only tokenize a part of it, then let the last token become a
    // catch-all.
    tokens = trimmed.substring(0, 500).split(UrlbarTokenizer.REGEXP_SPACES);
    tokens[tokens.length - 1] += trimmed.substring(500);
  }

  if (!tokens.length) {
    return tokens;
  }

  // If there is no separate restriction token, we may have to split a token:
  // either the first one, if it includes a leading restriction char, or the
  // last one, if it includes a trailing restriction char.
  // This avoids requiring the user to add artificial whitespace to enforce
  // restrictions; for example, typing "questions?" restricts to search
  // results.
  const hasRestrictionToken = tokens.some(t => CHAR_TO_TYPE_MAP.has(t));

  const firstToken = tokens[0];
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(firstToken) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(firstToken);

  if (hasRestrictionToken || isFirstTokenAKeyword) {
    return tokens;
  }

  // Check for an unambiguous restriction char at the beginning of the first
  // token.
  if (
    CHAR_TO_TYPE_MAP.has(firstToken[0]) &&
    !UrlbarTokenizer.REGEXP_PERCENT_ENCODED_START.test(firstToken) &&
    !searchMode
  ) {
    tokens[0] = firstToken.substring(1);
    tokens.splice(0, 0, firstToken[0]);
    return tokens;
  }

  return tokens;
}
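
// For illustration, splitString({ searchString: "?weather tomorrow" }) yields
// ["?", "weather", "tomorrow"]: the leading restriction char is split into
// its own token (assuming "?weather" is not a registered keyword).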

/**
 * Given an array of unfiltered tokens, this function filters them and
 * converts them to token objects with a type.
 *
 * @param {Array} tokens
 *   An array of strings, representing search tokens.
 * @returns {Array} An array of token objects.
 * Note: restriction characters are only considered if they appear at the start
 *       or at the end of the tokens list. In case of conflicting restriction
 *       characters, the outermost ones win, and leading ones win over
 *       trailing ones. Discarded restriction characters are considered text.
 */
function filterTokens(tokens) {
  let filtered = [];
  let restrictions = [];
  const isFirstTokenAKeyword =
    !Object.values(UrlbarTokenizer.RESTRICT).includes(tokens[0]) &&
    lazy.PlacesUtils.keywords.isKeywordFromCache(tokens[0]);

  for (let i = 0; i < tokens.length; ++i) {
    let token = tokens[i];
    let tokenObj = {
      value: token,
      lowerCaseValue: token.toLocaleLowerCase(),
      type: UrlbarTokenizer.TYPE.TEXT,
    };
    // For privacy reasons, we don't want to send a data: (or other kind of)
    // URI to a search engine, thus a single long token must still be parsed
    // below so it can be recognized as a possible URL; when there are
    // multiple tokens, a very long one is kept as plain text and ends the
    // iteration.
    if (tokens.length > 1 && token.length > 500) {
      filtered.push(tokenObj);
      break;
    }

    if (isFirstTokenAKeyword) {
      filtered.push(tokenObj);
      continue;
    }

    let restrictionType = CHAR_TO_TYPE_MAP.get(token);
    if (restrictionType) {
      restrictions.push({ index: i, type: restrictionType });
    } else if (UrlbarTokenizer.looksLikeOrigin(token)) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_ORIGIN;
    } else if (UrlbarTokenizer.looksLikeUrl(token, { requirePath: true })) {
      tokenObj.type = UrlbarTokenizer.TYPE.POSSIBLE_URL;
    }
    filtered.push(tokenObj);
  }

  // Handle restriction characters.
  if (restrictions.length) {
    // We can apply two kinds of restrictions: type (bookmark, search, ...)
    // and matching (url, title). These kinds can be combined, but we can
    // only have one restriction per kind.
    let matchingRestrictionFound = false;
    let typeRestrictionFound = false;
    function assignRestriction(r) {
      if (r && !(matchingRestrictionFound && typeRestrictionFound)) {
        if (
          [
            UrlbarTokenizer.TYPE.RESTRICT_TITLE,
            UrlbarTokenizer.TYPE.RESTRICT_URL,
          ].includes(r.type)
        ) {
          if (!matchingRestrictionFound) {
            matchingRestrictionFound = true;
            filtered[r.index].type = r.type;
            return true;
          }
        } else if (!typeRestrictionFound) {
          typeRestrictionFound = true;
          filtered[r.index].type = r.type;
          return true;
        }
      }
      return false;
    }

    // Look at the first token.
    let found = assignRestriction(restrictions.find(r => r.index == 0));
    if (found) {
      // If the first token was assigned, look at the next one.
      assignRestriction(restrictions.find(r => r.index == 1));
    }
    // Then look at the last token.
    let lastIndex = tokens.length - 1;
    found = assignRestriction(restrictions.find(r => r.index == lastIndex));
    if (found) {
      // If the last token was assigned, look at the previous one.
      assignRestriction(restrictions.find(r => r.index == lastIndex - 1));
    }
  }

  lazy.logger.info("Filtered Tokens", filtered);
  return filtered;
}
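
// For illustration, filterTokens(["*", "foo", "^"]) assigns the leading "*"
// the RESTRICT_BOOKMARK type; the trailing "^" is discarded because only one
// type restriction is allowed, so it stays plain text.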