2765 lines
96 KiB
JavaScript
2765 lines
96 KiB
JavaScript
/*
|
||
DO NOT TOUCH fathom.mjs DIRECTLY. See the README for instructions.
|
||
*/
|
||
|
||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||
|
||
/**
|
||
* A :func:`rule` depends on another rule which itself depends on the first
|
||
* rule again, either directly or indirectly.
|
||
*/
|
||
class CycleError extends Error {
|
||
}
|
||
|
||
/**
|
||
* An examined element was not contained in a browser ``window`` object, but
|
||
* something needed it to be.
|
||
*/
|
||
class NoWindowError extends Error {
|
||
}
|
||
|
||
var exceptions = /*#__PURE__*/Object.freeze({
|
||
__proto__: null,
|
||
CycleError: CycleError,
|
||
NoWindowError: NoWindowError
|
||
});
|
||
|
||
/**
|
||
* Return the passed-in arg. Useful as a default.
|
||
*/
|
||
function identity(x) {
|
||
return x;
|
||
}
|
||
|
||
/*eslint-env browser*/
|
||
|
||
/**
|
||
* From an iterable return the best item, according to an arbitrary comparator
|
||
* function. In case of a tie, the first item wins.
|
||
*
|
||
* @arg by {function} Given an item of the iterable, return a value to compare
|
||
* @arg isBetter {function} Return whether its first arg is better than its
|
||
* second
|
||
*/
|
||
function best(iterable, by, isBetter) {
|
||
let bestSoFar, bestKeySoFar;
|
||
let isFirst = true;
|
||
forEach(
|
||
function (item) {
|
||
const key = by(item);
|
||
if (isBetter(key, bestKeySoFar) || isFirst) {
|
||
bestSoFar = item;
|
||
bestKeySoFar = key;
|
||
isFirst = false;
|
||
}
|
||
},
|
||
iterable);
|
||
if (isFirst) {
|
||
throw new Error('Tried to call best() on empty iterable');
|
||
}
|
||
return bestSoFar;
|
||
}
|
||
|
||
/**
|
||
* Return the maximum item from an iterable, as defined by >.
|
||
*
|
||
* Works with any type that works with >. If multiple items are equally great,
|
||
* return the first.
|
||
*
|
||
* @arg by {function} Given an item of the iterable, returns a value to
|
||
* compare
|
||
*/
|
||
function max(iterable, by = identity) {
|
||
return best(iterable, by, (a, b) => a > b);
|
||
}
|
||
|
||
/**
|
||
* Return an Array of maximum items from an iterable, as defined by > and ===.
|
||
*
|
||
* If an empty iterable is passed in, return [].
|
||
*/
|
||
function maxes(iterable, by = identity) {
|
||
let bests = [];
|
||
let bestKeySoFar;
|
||
let isFirst = true;
|
||
forEach(
|
||
function (item) {
|
||
const key = by(item);
|
||
if (key > bestKeySoFar || isFirst) {
|
||
bests = [item];
|
||
bestKeySoFar = key;
|
||
isFirst = false;
|
||
} else if (key === bestKeySoFar) {
|
||
bests.push(item);
|
||
}
|
||
},
|
||
iterable);
|
||
return bests;
|
||
}
|
||
|
||
/**
|
||
* Return the minimum item from an iterable, as defined by <.
|
||
*
|
||
* If multiple items are equally great, return the first.
|
||
*/
|
||
function min(iterable, by = identity) {
|
||
return best(iterable, by, (a, b) => a < b);
|
||
}
|
||
|
||
/**
|
||
* Return the sum of an iterable, as defined by the + operator.
|
||
*/
|
||
function sum(iterable) {
|
||
let total;
|
||
let isFirst = true;
|
||
forEach(
|
||
function assignOrAdd(addend) {
|
||
if (isFirst) {
|
||
total = addend;
|
||
isFirst = false;
|
||
} else {
|
||
total += addend;
|
||
}
|
||
},
|
||
iterable);
|
||
return total;
|
||
}
|
||
|
||
/**
|
||
* Return the number of items in an iterable, consuming it as a side effect.
|
||
*/
|
||
function length(iterable) {
|
||
let num = 0;
|
||
// eslint-disable-next-line no-unused-vars
|
||
for (let item of iterable) {
|
||
num++;
|
||
}
|
||
return num;
|
||
}
|
||
|
||
/**
|
||
* Iterate, depth first, over a DOM node. Return the original node first.
|
||
*
|
||
* @arg shouldTraverse {function} Given a node, say whether we should
|
||
* include it and its children. Default: always true.
|
||
*/
|
||
function *walk(element, shouldTraverse = element => true) {
|
||
yield element;
|
||
for (let child of element.childNodes) {
|
||
if (shouldTraverse(child)) {
|
||
for (let w of walk(child, shouldTraverse)) {
|
||
yield w;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
const blockTags = new Set(
|
||
['ADDRESS', 'BLOCKQUOTE', 'BODY', 'CENTER', 'DIR', 'DIV', 'DL',
|
||
'FIELDSET', 'FORM', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HR',
|
||
'ISINDEX', 'MENU', 'NOFRAMES', 'NOSCRIPT', 'OL', 'P', 'PRE',
|
||
'TABLE', 'UL', 'DD', 'DT', 'FRAMESET', 'LI', 'TBODY', 'TD',
|
||
'TFOOT', 'TH', 'THEAD', 'TR', 'HTML']);
|
||
/**
|
||
* Return whether a DOM element is a block element by default (rather than by
|
||
* styling).
|
||
*/
|
||
function isBlock(element) {
|
||
return blockTags.has(element.tagName);
|
||
}
|
||
|
||
/**
|
||
* Yield strings of text nodes within a normalized DOM node and its children,
|
||
* without venturing into any contained block elements.
|
||
*
|
||
* @arg shouldTraverse {function} Specify additional elements to exclude by
|
||
* returning false
|
||
*/
|
||
function *inlineTexts(element, shouldTraverse = element => true) {
|
||
// TODO: Could we just use querySelectorAll() with a really long
|
||
// selector rather than walk(), for speed?
|
||
for (let child of walk(element,
|
||
element => !(isBlock(element) ||
|
||
element.tagName === 'SCRIPT' &&
|
||
element.tagName === 'STYLE')
|
||
&& shouldTraverse(element))) {
|
||
if (child.nodeType === child.TEXT_NODE) {
|
||
// wholeText() is not implemented by jsdom, so we use
|
||
// textContent(). The result should be the same, since
|
||
// we're calling it on only text nodes, but it may be
|
||
// slower. On the positive side, it means we don't need to
|
||
// normalize the DOM tree first.
|
||
yield child.textContent;
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return the total length of the inline text within an element, with
|
||
* whitespace collapsed.
|
||
*
|
||
* @arg shouldTraverse {function} Specify additional elements to exclude by
|
||
* returning false
|
||
*/
|
||
function inlineTextLength(element, shouldTraverse = element => true) {
|
||
return sum(map(text => collapseWhitespace(text).length,
|
||
inlineTexts(element, shouldTraverse)));
|
||
}
|
||
|
||
/**
|
||
* Return a string with each run of whitespace collapsed to a single space.
|
||
*/
|
||
function collapseWhitespace(str) {
|
||
return str.replace(/\s{2,}/g, ' ');
|
||
}
|
||
|
||
/**
|
||
* Return the ratio of the inline text length of the links in an element to the
|
||
* inline text length of the entire element.
|
||
*
|
||
* @arg inlineLength {number} Optionally, the precalculated inline
|
||
* length of the fnode. If omitted, we will calculate it ourselves.
|
||
*/
|
||
function linkDensity(fnode, inlineLength) {
|
||
if (inlineLength === undefined) {
|
||
inlineLength = inlineTextLength(fnode.element);
|
||
}
|
||
const lengthWithoutLinks = inlineTextLength(fnode.element,
|
||
element => element.tagName !== 'A');
|
||
return (inlineLength - lengthWithoutLinks) / inlineLength;
|
||
}
|
||
|
||
/**
|
||
* Return whether an element is a text node that consist wholly of whitespace.
|
||
*/
|
||
function isWhitespace(element) {
|
||
return (element.nodeType === element.TEXT_NODE &&
|
||
element.textContent.trim().length === 0);
|
||
}
|
||
|
||
/**
|
||
* Get a key of a map, first setting it to a default value if it's missing.
|
||
*/
|
||
function setDefault(map, key, defaultMaker) {
|
||
if (map.has(key)) {
|
||
return map.get(key);
|
||
}
|
||
const defaultValue = defaultMaker();
|
||
map.set(key, defaultValue);
|
||
return defaultValue;
|
||
}
|
||
|
||
/**
|
||
* Get a key of a map or, if it's missing, a default value.
|
||
*/
|
||
function getDefault(map, key, defaultMaker) {
|
||
if (map.has(key)) {
|
||
return map.get(key);
|
||
}
|
||
return defaultMaker();
|
||
}
|
||
|
||
/**
|
||
* Return an Array, the reverse topological sort of the given nodes.
|
||
*
|
||
* @arg nodes An iterable of arbitrary things
|
||
* @arg nodesThatNeed {function} Take a node and returns an Array of nodes
|
||
* that depend on it
|
||
*/
|
||
function toposort(nodes, nodesThatNeed) {
|
||
const ret = [];
|
||
const todo = new Set(nodes);
|
||
const inProgress = new Set();
|
||
|
||
function visit(node) {
|
||
if (inProgress.has(node)) {
|
||
throw new CycleError('The graph has a cycle.');
|
||
}
|
||
if (todo.has(node)) {
|
||
inProgress.add(node);
|
||
for (let needer of nodesThatNeed(node)) {
|
||
visit(needer);
|
||
}
|
||
inProgress.delete(node);
|
||
todo.delete(node);
|
||
ret.push(node);
|
||
}
|
||
}
|
||
|
||
while (todo.size > 0) {
|
||
visit(first(todo));
|
||
}
|
||
return ret;
|
||
}
|
||
|
||
/**
|
||
* A Set with the additional methods it ought to have had
|
||
*/
|
||
class NiceSet extends Set {
|
||
/**
|
||
* Remove and return an arbitrary item. Throw an Error if I am empty.
|
||
*/
|
||
pop() {
|
||
for (let v of this.values()) {
|
||
this.delete(v);
|
||
return v;
|
||
}
|
||
throw new Error('Tried to pop from an empty NiceSet.');
|
||
}
|
||
|
||
/**
|
||
* Union another set or other iterable into myself.
|
||
*
|
||
* @return myself, for chaining
|
||
*/
|
||
extend(otherSet) {
|
||
for (let item of otherSet) {
|
||
this.add(item);
|
||
}
|
||
return this;
|
||
}
|
||
|
||
/**
|
||
* Subtract another set from a copy of me.
|
||
*
|
||
* @return a copy of myself excluding the elements in ``otherSet``.
|
||
*/
|
||
minus(otherSet) {
|
||
const ret = new NiceSet(this);
|
||
for (const item of otherSet) {
|
||
ret.delete(item);
|
||
}
|
||
return ret;
|
||
}
|
||
|
||
/**
|
||
* Actually show the items in me.
|
||
*/
|
||
toString() {
|
||
return '{' + Array.from(this).join(', ') + '}';
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return the first item of an iterable.
|
||
*/
|
||
function first(iterable) {
|
||
for (let i of iterable) {
|
||
return i;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Given any node in a DOM tree, return the root element of the tree, generally
|
||
* an HTML element.
|
||
*/
|
||
function rootElement(element) {
|
||
return element.ownerDocument.documentElement;
|
||
}
|
||
|
||
/**
|
||
* Return the number of times a regex occurs within the string `haystack`.
|
||
*
|
||
* Caller must make sure `regex` has the 'g' option set.
|
||
*/
|
||
function numberOfMatches(regex, haystack) {
|
||
return (haystack.match(regex) || []).length;
|
||
}
|
||
|
||
/**
|
||
* Wrap a scoring callback, and set its element to the page root iff a score is
|
||
* returned.
|
||
*
|
||
* This is used to build rulesets which classify entire pages rather than
|
||
* picking out specific elements.
|
||
*
|
||
* For example, these rules might classify a page as a "login page", influenced
|
||
* by whether they have login buttons or username fields:
|
||
*
|
||
* ``rule(type('loginPage'), score(page(pageContainsLoginButton))),``
|
||
* ``rule(type('loginPage'), score(page(pageContainsUsernameField)))``
|
||
*/
|
||
function page(scoringFunction) {
|
||
function wrapper(fnode) {
|
||
const scoreAndTypeAndNote = scoringFunction(fnode);
|
||
if (scoreAndTypeAndNote.score !== undefined) {
|
||
scoreAndTypeAndNote.element = rootElement(fnode.element);
|
||
}
|
||
return scoreAndTypeAndNote;
|
||
}
|
||
return wrapper;
|
||
}
|
||
|
||
/**
|
||
* Sort the elements by their position in the DOM.
|
||
*
|
||
* @arg fnodes {iterable} fnodes to sort
|
||
* @return {Array} sorted fnodes
|
||
*/
|
||
function domSort(fnodes) {
|
||
function compare(a, b) {
|
||
const element = a.element;
|
||
const position = element.compareDocumentPosition(b.element);
|
||
if (position & element.DOCUMENT_POSITION_FOLLOWING) {
|
||
return -1;
|
||
} else if (position & element.DOCUMENT_POSITION_PRECEDING) {
|
||
return 1;
|
||
} else {
|
||
return 0;
|
||
}
|
||
}
|
||
return Array.from(fnodes).sort(compare);
|
||
}
|
||
|
||
/* istanbul ignore next */
|
||
/**
|
||
* Return the DOM element contained in a passed-in fnode. Return passed-in DOM
|
||
* elements verbatim.
|
||
*
|
||
* @arg fnodeOrElement {Node|Fnode}
|
||
*/
|
||
function toDomElement(fnodeOrElement) {
|
||
return isDomElement(fnodeOrElement) ? fnodeOrElement : fnodeOrElement.element;
|
||
}
|
||
|
||
/**
|
||
* Checks whether any of the element's attribute values satisfy some condition.
|
||
*
|
||
* Example::
|
||
*
|
||
* rule(type('foo'),
|
||
* score(attributesMatch(element,
|
||
* attr => attr.includes('good'),
|
||
* ['id', 'alt']) ? 2 : 1))
|
||
*
|
||
* @arg element {Node} Element whose attributes you want to search
|
||
* @arg predicate {function} A condition to check. Take a string and
|
||
* return a boolean. If an attribute has multiple values (e.g. the class
|
||
* attribute), attributesMatch will check each one.
|
||
* @arg attrs {string[]} An Array of attributes you want to search. If none are
|
||
* provided, search all.
|
||
* @return Whether any of the attribute values satisfy the predicate function
|
||
*/
|
||
function attributesMatch(element, predicate, attrs = []) {
|
||
const attributes = attrs.length === 0 ? Array.from(element.attributes).map(a => a.name) : attrs;
|
||
for (let i = 0; i < attributes.length; i++) {
|
||
const attr = element.getAttribute(attributes[i]);
|
||
// If the attribute is an array, apply the scoring function to each element
|
||
if (attr && ((Array.isArray(attr) && attr.some(predicate)) || predicate(attr))) {
|
||
return true;
|
||
}
|
||
}
|
||
return false;
|
||
}
|
||
|
||
/* istanbul ignore next */
|
||
/**
|
||
* Yield an element and each of its ancestors.
|
||
*/
|
||
function *ancestors(element) {
|
||
yield element;
|
||
let parent;
|
||
while ((parent = element.parentNode) !== null && parent.nodeType === parent.ELEMENT_NODE) {
|
||
yield parent;
|
||
element = parent;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return the sigmoid of the argument: 1 / (1 + exp(-x)). This is useful for
|
||
* crunching a feature value that may have a wide range into the range (0, 1)
|
||
* without a hard ceiling: the sigmoid of even a very large number will be a
|
||
* little larger than that of a slightly smaller one.
|
||
*
|
||
* @arg x {Number} a number to be compressed into the range (0, 1)
|
||
*/
|
||
function sigmoid(x) {
|
||
return 1 / (1 + Math.exp(-x));
|
||
}
|
||
|
||
/* istanbul ignore next */
|
||
/**
|
||
* Return whether an element is practically visible, considering things like 0
|
||
* size or opacity, ``visibility: hidden`` and ``overflow: hidden``.
|
||
*
|
||
* Merely being scrolled off the page in either horizontally or vertically
|
||
* doesn't count as invisible; the result of this function is meant to be
|
||
* independent of viewport size.
|
||
*
|
||
* @throws {NoWindowError} The element (or perhaps one of its ancestors) is not
|
||
* in a window, so we can't find the `getComputedStyle()` routine to call.
|
||
* That routine is the source of most of the information we use, so you
|
||
* should pick a different strategy for non-window contexts.
|
||
*/
|
||
function isVisible(fnodeOrElement) {
|
||
// This could be 5x more efficient if https://github.com/w3c/csswg-drafts/issues/4122 happens.
|
||
const element = toDomElement(fnodeOrElement);
|
||
const elementWindow = windowForElement(element);
|
||
const elementRect = element.getBoundingClientRect();
|
||
const elementStyle = elementWindow.getComputedStyle(element);
|
||
// Alternative to reading ``display: none`` due to Bug 1381071.
|
||
if (elementRect.width === 0 && elementRect.height === 0 && elementStyle.overflow !== 'hidden') {
|
||
return false;
|
||
}
|
||
if (elementStyle.visibility === 'hidden') {
|
||
return false;
|
||
}
|
||
// Check if the element is irrevocably off-screen:
|
||
if (elementRect.x + elementRect.width < 0 ||
|
||
elementRect.y + elementRect.height < 0
|
||
) {
|
||
return false;
|
||
}
|
||
for (const ancestor of ancestors(element)) {
|
||
const isElement = ancestor === element;
|
||
const style = isElement ? elementStyle : elementWindow.getComputedStyle(ancestor);
|
||
if (style.opacity === '0') {
|
||
return false;
|
||
}
|
||
if (style.display === 'contents') {
|
||
// ``display: contents`` elements have no box themselves, but children are
|
||
// still rendered.
|
||
continue;
|
||
}
|
||
const rect = isElement ? elementRect : ancestor.getBoundingClientRect();
|
||
if ((rect.width === 0 || rect.height === 0) && elementStyle.overflow === 'hidden') {
|
||
// Zero-sized ancestors don’t make descendants hidden unless the descendant
|
||
// has ``overflow: hidden``.
|
||
return false;
|
||
}
|
||
}
|
||
return true;
|
||
}
|
||
|
||
/**
|
||
* Return the extracted [r, g, b, a] values from a string like "rgba(0, 5, 255, 0.8)",
|
||
* and scale them to 0..1. If no alpha is specified, return undefined for it.
|
||
*/
|
||
function rgbaFromString(str) {
|
||
const m = str.match(/^rgba?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*(?:,\s*(\d+(?:\.\d+)?)\s*)?\)$/i);
|
||
if (m) {
|
||
return [m[1] / 255, m[2] / 255, m[3] / 255, m[4] === undefined ? undefined : parseFloat(m[4])];
|
||
} else {
|
||
throw new Error('Color ' + str + ' did not match pattern rgb[a](r, g, b[, a]).');
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return the saturation 0..1 of a color defined by RGB values 0..1.
|
||
*/
|
||
function saturation(r, g, b) {
|
||
const cMax = Math.max(r, g, b);
|
||
const cMin = Math.min(r, g, b);
|
||
const delta = cMax - cMin;
|
||
const lightness = (cMax + cMin) / 2;
|
||
const denom = (1 - (Math.abs(2 * lightness - 1)));
|
||
// Return 0 if it's black (R, G, and B all 0).
|
||
return (denom === 0) ? 0 : delta / denom;
|
||
}
|
||
|
||
/**
|
||
* Scale a number to the range [0, 1] using a linear slope.
|
||
*
|
||
* For a rising line, the result is 0 until the input reaches zeroAt, then
|
||
* increases linearly until oneAt, at which it becomes 1. To make a falling
|
||
* line, where the result is 1 to the left and 0 to the right, use a zeroAt
|
||
* greater than oneAt.
|
||
*/
|
||
function linearScale(number, zeroAt, oneAt) {
|
||
const isRising = zeroAt < oneAt;
|
||
if (isRising) {
|
||
if (number <= zeroAt) {
|
||
return 0;
|
||
} else if (number >= oneAt) {
|
||
return 1;
|
||
}
|
||
} else {
|
||
if (number >= zeroAt) {
|
||
return 0;
|
||
} else if (number <= oneAt) {
|
||
return 1;
|
||
}
|
||
}
|
||
const slope = 1 / (oneAt - zeroAt);
|
||
return slope * (number - zeroAt);
|
||
}
|
||
|
||
// -------- Routines below this point are private to the framework. --------
|
||
|
||
/**
|
||
* Flatten out an iterable of iterables into a single iterable of non-
|
||
* iterables. Does not consider strings to be iterable.
|
||
*/
|
||
function *flatten(iterable) {
|
||
for (const i of iterable) {
|
||
if (typeof i !== 'string' && isIterable(i)) {
|
||
yield *(flatten(i));
|
||
} else {
|
||
yield i;
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A lazy, top-level ``Array.map()`` workalike that works on anything iterable
|
||
*/
|
||
function *map(fn, iterable) {
|
||
for (const i of iterable) {
|
||
yield fn(i);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A lazy, top-level ``Array.forEach()`` workalike that works on anything
|
||
* iterable
|
||
*/
|
||
function forEach(fn, iterable) {
|
||
for (const i of iterable) {
|
||
fn(i);
|
||
}
|
||
}
|
||
|
||
/* istanbul ignore next */
|
||
/**
|
||
* @return whether a thing appears to be a DOM element.
|
||
*/
|
||
function isDomElement(thing) {
|
||
return thing.nodeName !== undefined;
|
||
}
|
||
|
||
function isIterable(thing) {
|
||
return thing && typeof thing[Symbol.iterator] === 'function';
|
||
}
|
||
|
||
/**
|
||
* Return an backward iterator over an Array without reversing it in place.
|
||
*/
|
||
function *reversed(array) {
|
||
for (let i = array.length - 1; i >= 0; i--) {
|
||
yield array[i];
|
||
}
|
||
}
|
||
|
||
/* istanbul ignore next */
|
||
/*
|
||
* Return the window an element is in.
|
||
*
|
||
* @throws {NoWindowError} There isn't such a window.
|
||
*/
|
||
function windowForElement(element) {
|
||
let doc = element.ownerDocument;
|
||
if (doc === null) {
|
||
// The element itself was a document.
|
||
doc = element;
|
||
}
|
||
const win = doc.defaultView;
|
||
if (win === null) {
|
||
throw new NoWindowError();
|
||
}
|
||
return win;
|
||
}
|
||
|
||
var utilsForFrontend = /*#__PURE__*/Object.freeze({
|
||
__proto__: null,
|
||
identity: identity,
|
||
best: best,
|
||
max: max,
|
||
maxes: maxes,
|
||
min: min,
|
||
sum: sum,
|
||
length: length,
|
||
walk: walk,
|
||
isBlock: isBlock,
|
||
inlineTexts: inlineTexts,
|
||
inlineTextLength: inlineTextLength,
|
||
collapseWhitespace: collapseWhitespace,
|
||
linkDensity: linkDensity,
|
||
isWhitespace: isWhitespace,
|
||
setDefault: setDefault,
|
||
getDefault: getDefault,
|
||
toposort: toposort,
|
||
NiceSet: NiceSet,
|
||
first: first,
|
||
rootElement: rootElement,
|
||
numberOfMatches: numberOfMatches,
|
||
page: page,
|
||
domSort: domSort,
|
||
toDomElement: toDomElement,
|
||
attributesMatch: attributesMatch,
|
||
ancestors: ancestors,
|
||
sigmoid: sigmoid,
|
||
isVisible: isVisible,
|
||
rgbaFromString: rgbaFromString,
|
||
saturation: saturation,
|
||
linearScale: linearScale,
|
||
flatten: flatten,
|
||
map: map,
|
||
forEach: forEach,
|
||
isDomElement: isDomElement,
|
||
reversed: reversed,
|
||
windowForElement: windowForElement
|
||
});
|
||
|
||
/**
|
||
* Return the number of stride nodes between 2 DOM nodes *at the same
|
||
* level of the tree*, without going up or down the tree.
|
||
*
|
||
* ``left`` xor ``right`` may also be undefined.
|
||
*/
|
||
function numStrides(left, right) {
|
||
let num = 0;
|
||
|
||
// Walk right from left node until we hit the right node or run out:
|
||
let sibling = left;
|
||
let shouldContinue = sibling && sibling !== right;
|
||
while (shouldContinue) {
|
||
sibling = sibling.nextSibling;
|
||
if ((shouldContinue = sibling && sibling !== right) &&
|
||
!isWhitespace(sibling)) {
|
||
num += 1;
|
||
}
|
||
}
|
||
if (sibling !== right) { // Don't double-punish if left and right are siblings.
|
||
// Walk left from right node:
|
||
sibling = right;
|
||
while (sibling) {
|
||
sibling = sibling.previousSibling;
|
||
if (sibling && !isWhitespace(sibling)) {
|
||
num += 1;
|
||
}
|
||
}
|
||
}
|
||
return num;
|
||
}
|
||
|
||
/**
|
||
* Return a topological distance between 2 DOM nodes or :term:`fnodes<fnode>`
|
||
* weighted according to the similarity of their ancestry in the DOM. For
|
||
* instance, if one node is situated inside ``<div><span><b><theNode>`` and the
|
||
* other node is at ``<differentDiv><span><b><otherNode>``, they are considered
|
||
* close to each other for clustering purposes. This is useful for picking out
|
||
* nodes which have similar purposes.
|
||
*
|
||
* Return ``Number.MAX_VALUE`` if one of the nodes contains the other.
|
||
*
|
||
* This is largely an implementation detail of :func:`clusters`, but you can
|
||
* call it yourself if you wish to implement your own clustering. Takes O(n log
|
||
* n) time.
|
||
*
|
||
* Note that the default costs may change; pass them in explicitly if they are
|
||
* important to you.
|
||
*
|
||
* @arg fnodeA {Node|Fnode}
|
||
* @arg fnodeB {Node|Fnode}
|
||
* @arg differentDepthCost {number} Cost for each level deeper one node is than
|
||
* the other below their common ancestor
|
||
* @arg differentTagCost {number} Cost for a level below the common ancestor
|
||
* where tagNames differ
|
||
* @arg sameTagCost {number} Cost for a level below the common ancestor where
|
||
* tagNames are the same
|
||
* @arg strideCost {number} Cost for each stride node between A and B. Stride
|
||
* nodes are siblings or siblings-of-ancestors that lie between the 2
|
||
* nodes. These interposed nodes make it less likely that the 2 nodes
|
||
* should be together in a cluster.
|
||
* @arg additionalCost {function} Return an additional cost, given 2 fnodes or
|
||
* nodes.
|
||
*
|
||
*/
|
||
function distance(fnodeA,
|
||
fnodeB,
|
||
{differentDepthCost = 2,
|
||
differentTagCost = 2,
|
||
sameTagCost = 1,
|
||
strideCost = 1,
|
||
additionalCost = (fnodeA, fnodeB) => 0} = {}) {
|
||
// I was thinking of something that adds little cost for siblings. Up
|
||
// should probably be more expensive than down (see middle example in the
|
||
// Nokia paper).
|
||
|
||
// TODO: Test and tune default costs. They're off the cuff at the moment.
|
||
|
||
if (fnodeA === fnodeB) {
|
||
return 0;
|
||
}
|
||
|
||
const elementA = isDomElement(fnodeA) ? fnodeA : fnodeA.element;
|
||
const elementB = isDomElement(fnodeB) ? fnodeB : fnodeB.element;
|
||
|
||
// Stacks that go from the common ancestor all the way to A and B:
|
||
const aAncestors = [elementA];
|
||
const bAncestors = [elementB];
|
||
|
||
let aAncestor = elementA;
|
||
let bAncestor = elementB;
|
||
|
||
// Ascend to common parent, stacking them up for later reference:
|
||
while (!aAncestor.contains(elementB)) { // Note: an element does contain() itself.
|
||
aAncestor = aAncestor.parentNode;
|
||
aAncestors.push(aAncestor); //aAncestors = [a, b]. aAncestor = b // if a is outer: no loop here; aAncestors = [a]. aAncestor = a.
|
||
}
|
||
|
||
// In compareDocumentPosition()'s opinion, inside implies after. Basically,
|
||
// before and after pertain to opening tags.
|
||
const comparison = elementA.compareDocumentPosition(elementB);
|
||
|
||
// If either contains the other, abort. We'd either return a misleading
|
||
// number or else walk upward right out of the document while trying to
|
||
// make the ancestor stack.
|
||
if (comparison & (elementA.DOCUMENT_POSITION_CONTAINS | elementA.DOCUMENT_POSITION_CONTAINED_BY)) {
|
||
return Number.MAX_VALUE;
|
||
}
|
||
// Make an ancestor stack for the right node too so we can walk
|
||
// efficiently down to it:
|
||
do {
|
||
bAncestor = bAncestor.parentNode; // Assumes we've early-returned above if A === B. This walks upward from the outer node and up out of the tree. It STARTS OUT with aAncestor === bAncestor!
|
||
bAncestors.push(bAncestor);
|
||
} while (bAncestor !== aAncestor);
|
||
|
||
// Figure out which node is left and which is right, so we can follow
|
||
// sibling links in the appropriate directions when looking for stride
|
||
// nodes:
|
||
let left = aAncestors;
|
||
let right = bAncestors;
|
||
let cost = 0;
|
||
if (comparison & elementA.DOCUMENT_POSITION_FOLLOWING) {
|
||
// A is before, so it could contain the other node. What did I mean to do if one contained the other?
|
||
left = aAncestors;
|
||
right = bAncestors;
|
||
} else if (comparison & elementA.DOCUMENT_POSITION_PRECEDING) {
|
||
// A is after, so it might be contained by the other node.
|
||
left = bAncestors;
|
||
right = aAncestors;
|
||
}
|
||
|
||
// Descend to both nodes in parallel, discounting the traversal
|
||
// cost iff the nodes we hit look similar, implying the nodes dwell
|
||
// within similar structures.
|
||
while (left.length || right.length) {
|
||
const l = left.pop();
|
||
const r = right.pop();
|
||
if (l === undefined || r === undefined) {
|
||
// Punishment for being at different depths: same as ordinary
|
||
// dissimilarity punishment for now
|
||
cost += differentDepthCost;
|
||
} else {
|
||
// TODO: Consider similarity of classList.
|
||
cost += l.tagName === r.tagName ? sameTagCost : differentTagCost;
|
||
}
|
||
// Optimization: strides might be a good dimension to eliminate.
|
||
if (strideCost !== 0) {
|
||
cost += numStrides(l, r) * strideCost;
|
||
}
|
||
}
|
||
|
||
return cost + additionalCost(fnodeA, fnodeB);
|
||
}
|
||
|
||
/**
|
||
* Return the spatial distance between 2 fnodes or elements, assuming a
|
||
* rendered page.
|
||
*
|
||
* Specifically, return the distance in pixels between the centers of
|
||
* ``fnodeA.element.getBoundingClientRect()`` and
|
||
* ``fnodeB.element.getBoundingClientRect()``.
|
||
*/
|
||
function euclidean(fnodeA, fnodeB) {
|
||
/**
|
||
* Return the horizontal distance from the left edge of the viewport to the
|
||
* center of an element, given a DOMRect object for it. It doesn't matter
|
||
* that the distance is affected by the page's scroll offset, since the 2
|
||
* elements have the same offset.
|
||
*/
|
||
function xCenter(domRect) {
|
||
return domRect.left + domRect.width / 2;
|
||
}
|
||
function yCenter(domRect) {
|
||
return domRect.top + domRect.height / 2;
|
||
}
|
||
|
||
const elementA = toDomElement(fnodeA);
|
||
const elementB = toDomElement(fnodeB);
|
||
const aRect = elementA.getBoundingClientRect();
|
||
const bRect = elementB.getBoundingClientRect();
|
||
return Math.sqrt((xCenter(aRect) - xCenter(bRect)) ** 2 +
|
||
(yCenter(aRect) - yCenter(bRect)) ** 2);
|
||
}
|
||
|
||
/** A lower-triangular matrix of inter-cluster distances */
|
||
class DistanceMatrix {
|
||
/**
|
||
* @arg distance {function} Some notion of distance between 2 given nodes
|
||
*/
|
||
constructor(elements, distance) {
|
||
// A sparse adjacency matrix:
|
||
// {A => {},
|
||
// B => {A => 4},
|
||
// C => {A => 4, B => 4},
|
||
// D => {A => 4, B => 4, C => 4}
|
||
// E => {A => 4, B => 4, C => 4, D => 4}}
|
||
//
|
||
// A, B, etc. are arrays of [arrays of arrays of...] nodes, each
|
||
// array being a cluster. In this way, they not only accumulate a
|
||
// cluster but retain the steps along the way.
|
||
//
|
||
// This is an efficient data structure in terms of CPU and memory, in
|
||
// that we don't have to slide a lot of memory around when we delete a
|
||
// row or column from the middle of the matrix while merging. Of
|
||
// course, we lose some practical efficiency by using hash tables, and
|
||
// maps in particular are slow in their early implementations.
|
||
this._matrix = new Map();
|
||
|
||
// Convert elements to clusters:
|
||
const clusters = elements.map(el => [el]);
|
||
|
||
// Init matrix:
|
||
for (let outerCluster of clusters) {
|
||
const innerMap = new Map();
|
||
for (let innerCluster of this._matrix.keys()) {
|
||
innerMap.set(innerCluster, distance(outerCluster[0],
|
||
innerCluster[0]));
|
||
}
|
||
this._matrix.set(outerCluster, innerMap);
|
||
}
|
||
this._numClusters = clusters.length;
|
||
}
|
||
|
||
// Return (distance, a: clusterA, b: clusterB) of closest-together clusters.
|
||
// Replace this to change linkage criterion.
|
||
closest() {
|
||
const self = this;
|
||
|
||
if (this._numClusters < 2) {
|
||
throw new Error('There must be at least 2 clusters in order to return the closest() ones.');
|
||
}
|
||
|
||
// Return the distances between every pair of clusters.
|
||
function clustersAndDistances() {
|
||
const ret = [];
|
||
for (let [outerKey, row] of self._matrix.entries()) {
|
||
for (let [innerKey, storedDistance] of row.entries()) {
|
||
ret.push({a: outerKey, b: innerKey, distance: storedDistance});
|
||
}
|
||
}
|
||
return ret;
|
||
}
|
||
// Optimizing this by inlining the loop and writing it less
|
||
// functionally doesn't help:
|
||
return min(clustersAndDistances(), x => x.distance);
|
||
}
|
||
|
||
// Look up the distance between 2 clusters in me. Try the lookup in the
|
||
// other direction if the first one falls in the nonexistent half of the
|
||
// triangle.
|
||
_cachedDistance(clusterA, clusterB) {
|
||
let ret = this._matrix.get(clusterA).get(clusterB);
|
||
if (ret === undefined) {
|
||
ret = this._matrix.get(clusterB).get(clusterA);
|
||
}
|
||
return ret;
|
||
}
|
||
|
||
// Merge two clusters.
|
||
merge(clusterA, clusterB) {
|
||
// An example showing how rows merge:
|
||
// A: {}
|
||
// B: {A: 1}
|
||
// C: {A: 4, B: 4},
|
||
// D: {A: 4, B: 4, C: 4}
|
||
// E: {A: 4, B: 4, C: 2, D: 4}}
|
||
//
|
||
// Step 2:
|
||
// C: {}
|
||
// D: {C: 4}
|
||
// E: {C: 2, D: 4}}
|
||
// AB: {C: 4, D: 4, E: 4}
|
||
//
|
||
// Step 3:
|
||
// D: {}
|
||
// AB: {D: 4}
|
||
// CE: {D: 4, AB: 4}
|
||
|
||
// Construct new row, finding min distances from either subcluster of
|
||
// the new cluster to old clusters.
|
||
//
|
||
// There will be no repetition in the matrix because, after all,
|
||
// nothing pointed to this new cluster before it existed.
|
||
const newRow = new Map();
|
||
for (let outerKey of this._matrix.keys()) {
|
||
if (outerKey !== clusterA && outerKey !== clusterB) {
|
||
newRow.set(outerKey, Math.min(this._cachedDistance(clusterA, outerKey),
|
||
this._cachedDistance(clusterB, outerKey)));
|
||
}
|
||
}
|
||
|
||
// Delete the rows of the clusters we're merging.
|
||
this._matrix.delete(clusterA);
|
||
this._matrix.delete(clusterB);
|
||
|
||
// Remove inner refs to the clusters we're merging.
|
||
for (let inner of this._matrix.values()) {
|
||
inner.delete(clusterA);
|
||
inner.delete(clusterB);
|
||
}
|
||
|
||
// Attach new row.
|
||
this._matrix.set([clusterA, clusterB], newRow);
|
||
|
||
// There is a net decrease of 1 cluster:
|
||
this._numClusters -= 1;
|
||
}
|
||
|
||
numClusters() {
|
||
return this._numClusters;
|
||
}
|
||
|
||
// Return an Array of nodes for each cluster in me.
|
||
clusters() {
|
||
// TODO: Can't get map to work here. Don't know why.
|
||
return Array.from(this._matrix.keys()).map(e => Array.from(flatten(e)));
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Partition the given nodes into one or more clusters by position in the DOM
|
||
* tree.
|
||
*
|
||
* This implements an agglomerative clustering. It uses single linkage, since
|
||
* we're talking about adjacency here more than Euclidean proximity: the
|
||
* clusters we're talking about in the DOM will tend to be adjacent, not
|
||
* overlapping. We haven't tried other linkage criteria yet.
|
||
*
|
||
* In a later release, we may consider score or notes.
|
||
*
|
||
* @arg {Fnode[]|Node[]} fnodes :term:`fnodes<fnode>` or DOM nodes to group
|
||
* into clusters
|
||
* @arg {number} splittingDistance The closest-nodes :func:`distance` beyond
|
||
* which we will not attempt to unify 2 clusters. Make this larger to make
|
||
* larger clusters.
|
||
* @arg getDistance {function} A function that returns some notion of numerical
|
||
* distance between 2 nodes. Default: :func:`distance`
|
||
* @return {Array} An Array of Arrays, with each Array containing all the
|
||
* nodes in one cluster. Note that neither the clusters nor the nodes are
|
||
* in any particular order. You may find :func:`domSort` helpful to remedy
|
||
* the latter.
|
||
*/
|
||
function clusters(fnodes, splittingDistance, getDistance = distance) {
|
||
const matrix = new DistanceMatrix(fnodes, getDistance);
|
||
let closest;
|
||
|
||
while (matrix.numClusters() > 1 && (closest = matrix.closest()).distance < splittingDistance) {
|
||
matrix.merge(closest.a, closest.b);
|
||
}
|
||
|
||
return matrix.clusters();
|
||
}
|
||
|
||
var clusters$1 = /*#__PURE__*/Object.freeze({
|
||
__proto__: null,
|
||
distance: distance,
|
||
euclidean: euclidean,
|
||
clusters: clusters
|
||
});
|
||
|
||
// The left-hand side of a rule
|
||
|
||
|
||
/**
|
||
* Take nodes that match a given DOM selector. Example:
|
||
* ``dom('meta[property="og:title"]')``
|
||
*
|
||
* Every ruleset has at least one ``dom`` or :func:`element` rule, as that is
|
||
* where nodes begin to flow into the system. If run against a subtree of a
|
||
* document, the root of the subtree is not considered as a possible match.
|
||
*/
|
||
function dom(selector) {
|
||
return new DomLhs(selector);
|
||
}
|
||
|
||
/**
|
||
* Take a single given node if it matches a given DOM selector, without looking
|
||
* through its descendents or ancestors. Otherwise, take no nodes. Example:
|
||
* ``element('input')``
|
||
*
|
||
* This is useful for applications in which you want Fathom to classify an
|
||
* element the user has selected, rather than scanning the whole page for
|
||
* candidates.
|
||
*/
|
||
function element(selector) {
|
||
return new ElementLhs(selector);
|
||
}
|
||
|
||
/**
|
||
* Rules and the LHSs and RHSs that comprise them have no mutable state. This
|
||
* lets us make BoundRulesets from Rulesets without duplicating the rules. It
|
||
* also lets us share a common cache among rules: multiple ones might care
|
||
* about a cached type(), for instance; there isn't a one-to-one relationship
|
||
* of storing with caring. There would also, because of the interdependencies
|
||
* of rules in a ruleset, be little use in segmenting the caches: if you do
|
||
* something that causes one to need to be cleared, you'll need to clear many
|
||
* more as well.
|
||
*
|
||
* Lhses are responsible for maintaining ruleset.maxCache.
|
||
*
|
||
* Lhs and its subclasses are private to the Fathom framework.
|
||
*/
|
||
class Lhs {
|
||
constructor() {
|
||
this._predicate = () => true;
|
||
}
|
||
|
||
/** Return a new Lhs of the appropriate kind, given its first call. */
|
||
static fromFirstCall(firstCall) {
|
||
// firstCall is never 'dom', because dom() directly returns a DomLhs.
|
||
if (firstCall.method === 'type') {
|
||
return new TypeLhs(...firstCall.args);
|
||
} else if (firstCall.method === 'and') {
|
||
return new AndLhs(firstCall.args);
|
||
} else if (firstCall.method === 'nearest') {
|
||
return new NearestLhs(firstCall.args);
|
||
} else {
|
||
throw new Error('The left-hand side of a rule() must start with dom(), type(), and(), or nearest().');
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Prune nodes from consideration early in run execution, before scoring is
|
||
* done.
|
||
*
|
||
* Reserve this for where you are sure it is always correct or when
|
||
* performance demands it. It is generally preferable to use :func:`score`
|
||
* and let the :doc:`trainer<training>` determine the relative significance
|
||
* of each rule. Human intuition as to what is important is often wrong:
|
||
* for example, one might assume that a music player website would include
|
||
* the word "play", but this does not hold once you include sites in other
|
||
* languages.
|
||
*
|
||
* Can be chained after :func:`type` or :func:`dom`.
|
||
*
|
||
* Example: ``dom('p').when(isVisible)``
|
||
*
|
||
* @arg {function} predicate Accepts a fnode and returns a boolean
|
||
*/
|
||
when(predicate) {
|
||
let lhs = this.clone();
|
||
lhs._predicate = predicate;
|
||
return lhs;
|
||
}
|
||
|
||
/**
|
||
* Of all the dom nodes selected by type() or dom(), return only
|
||
* the fnodes that satisfy all the predicates imposed by calls to
|
||
* when()
|
||
*/
|
||
fnodesSatisfyingWhen(fnodes) {
|
||
return Array.from(fnodes).filter(this._predicate);
|
||
}
|
||
|
||
/**
|
||
* Return an iterable of output fnodes selected by this left-hand-side
|
||
* expression.
|
||
*
|
||
* Pre: The rules I depend on have already been run, and their results are
|
||
* in ruleset.typeCache.
|
||
*
|
||
* @arg ruleset {BoundRuleset}
|
||
*/
|
||
// fnodes (ruleset) {}
|
||
|
||
/**
|
||
* Check that a RHS-emitted fact is legal for this kind of LHS, and throw
|
||
* an error if it isn't.
|
||
*/
|
||
checkFact(fact) {}
|
||
|
||
/**
|
||
* Return the single type the output of the LHS is guaranteed to have.
|
||
* Return undefined if there is no such single type we can ascertain.
|
||
*/
|
||
guaranteedType() {}
|
||
|
||
/**
|
||
* Return the type I aggregate if I am an aggregate LHS; return undefined
|
||
* otherwise.
|
||
*/
|
||
aggregatedType() {}
|
||
|
||
/**
|
||
* Return each combination of types my selected nodes could be locally (that
|
||
* is, by this rule only) constrained to have.
|
||
*
|
||
* For example, type(A) would return [A]. and(A, or(B, C)) would return
|
||
* [AB, AC, ABC]. More examples:
|
||
*
|
||
* or(A, B) → typeIn(A, B, C) # Finalizes A, B. combos A, B, AB: finalizes AB. Optimization: there's no point in returning the last combo in ors. Compilation into 2 rules with identical RHSs will inherently implement this optimization.
|
||
* or(A, B) → typeIn(A, B) # Finalizes A, B
|
||
* or(A, B) → A # Finalizes B
|
||
* and(A) -> A # Finalizes nothing
|
||
* and(A, B) -> A # Finalizes nothing. AB: Ø
|
||
* and(A) -> typeIn(A, B) # Finalizes A. A
|
||
* and(A, B) -> typeIn(A, B) # Finalizes nothing. AB
|
||
* and(A, B) -> typeIn(A, B, C) # Finalizes A, B. AB
|
||
* and(A, or(B, C)) -> D # Finalizes A, B, C. AB, AC, ABC: ABC
|
||
* and(A, or(B, C)) -> B # Finalizes A, C. AB, AC, ABC: AC
|
||
* type(A).not(and(A, B)) ->
|
||
*
|
||
* @return {NiceSet[]}
|
||
*/
|
||
// possibleTypeCombinations() {}
|
||
|
||
/**
|
||
* Types mentioned in this LHS.
|
||
*
|
||
* In other words, the types I need to know the assignment status of before
|
||
* I can make my selections
|
||
*
|
||
* @return NiceSet of strings
|
||
*/
|
||
// typesMentioned() {}
|
||
}
|
||
|
||
class DomLhs extends Lhs {
|
||
constructor(selector) {
|
||
super();
|
||
if (selector === undefined) {
|
||
throw new Error('A querySelector()-style selector is required as the argument to ' + this._callName() + '().');
|
||
}
|
||
this.selector = selector;
|
||
}
|
||
|
||
/**
|
||
* Return the name of this kind of LHS, for use in error messages.
|
||
*/
|
||
_callName() {
|
||
return 'dom';
|
||
}
|
||
|
||
clone() {
|
||
return new this.constructor(this.selector);
|
||
}
|
||
|
||
fnodes(ruleset) {
|
||
return this._domNodesToFilteredFnodes(
|
||
ruleset,
|
||
ruleset.doc.querySelectorAll(this.selector));
|
||
}
|
||
|
||
/**
|
||
* Turn a NodeList of DOM nodes into an array of fnodes, and filter out
|
||
* those that don't match the :func:`when()` clause.
|
||
*/
|
||
_domNodesToFilteredFnodes(ruleset, domNodes) {
|
||
let ret = [];
|
||
for (let i = 0; i < domNodes.length; i++) {
|
||
ret.push(ruleset.fnodeForElement(domNodes[i]));
|
||
}
|
||
return this.fnodesSatisfyingWhen(ret);
|
||
}
|
||
|
||
checkFact(fact) {
|
||
if (fact.type === undefined) {
|
||
throw new Error(`The right-hand side of a ${this._callName()}() rule failed to specify a type. This means there is no way for its output to be used by later rules. All it specified was ${fact}.`);
|
||
}
|
||
}
|
||
|
||
asLhs() {
|
||
return this;
|
||
}
|
||
|
||
possibleTypeCombinations() {
|
||
return [];
|
||
}
|
||
|
||
typesMentioned() {
|
||
return new NiceSet();
|
||
}
|
||
}
|
||
|
||
class ElementLhs extends DomLhs {
|
||
_callName() {
|
||
return 'element';
|
||
}
|
||
|
||
fnodes(ruleset) {
|
||
return this._domNodesToFilteredFnodes(
|
||
ruleset,
|
||
ruleset.doc.matches(this.selector) ? [ruleset.doc] : []);
|
||
}
|
||
}
|
||
|
||
/** Internal representation of a LHS constrained by type but not by max() */
|
||
class TypeLhs extends Lhs {
|
||
constructor(type) {
|
||
super();
|
||
if (type === undefined) {
|
||
throw new Error('A type name is required when calling type().');
|
||
}
|
||
this._type = type; // the input type
|
||
}
|
||
|
||
clone() {
|
||
return new this.constructor(this._type);
|
||
}
|
||
|
||
fnodes(ruleset) {
|
||
const cached = getDefault(ruleset.typeCache, this._type, () => []);
|
||
return this.fnodesSatisfyingWhen(cached);
|
||
}
|
||
|
||
/** Override the type previously specified by this constraint. */
|
||
type(inputType) {
|
||
// Preserve the class in case this is a TypeMaxLhs.
|
||
return new this.constructor(inputType);
|
||
}
|
||
|
||
/**
|
||
* Of the nodes selected by a ``type`` call to the left, constrain the LHS
|
||
* to return only the max-scoring one. If there is a tie, more than 1 node
|
||
* will be returned. Example: ``type('titley').max()``
|
||
*/
|
||
max() {
|
||
return new TypeMaxLhs(this._type);
|
||
}
|
||
|
||
/**
|
||
* Take the nodes selected by a ``type`` call to the left, group them into
|
||
* clusters, and return the nodes in the cluster that has the highest total
|
||
* score (on the relevant type).
|
||
*
|
||
* Nodes come out in arbitrary order, so, if you plan to emit them,
|
||
* consider using ``.out('whatever').allThrough(domSort)``. See
|
||
* :func:`domSort`.
|
||
*
|
||
* If multiple clusters have equally high scores, return an arbitrary one,
|
||
* because Fathom has no way to represent arrays of arrays in rulesets.
|
||
*
|
||
* @arg options {Object} The same depth costs taken by :func:`distance`,
|
||
* plus ``splittingDistance``, which is the distance beyond which 2
|
||
* clusters will be considered separate. ``splittingDistance``, if
|
||
* omitted, defaults to 3.
|
||
*/
|
||
bestCluster(options) {
|
||
return new BestClusterLhs(this._type, options);
|
||
}
|
||
|
||
// Other clustering calls could be called biggestCluster() (having the most
|
||
// nodes) and bestAverageCluster() (having the highest average score).
|
||
|
||
guaranteedType() {
|
||
return this._type;
|
||
}
|
||
|
||
possibleTypeCombinations() {
|
||
return [this.typesMentioned()];
|
||
}
|
||
|
||
typesMentioned() {
|
||
return new NiceSet([this._type]);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Abstract LHS that is an aggregate function taken across all fnodes of a type
|
||
*
|
||
* The main point here is that any aggregate function over a (typed) set of
|
||
* nodes depends on first computing all the rules that could emit those nodes
|
||
* (nodes of that type).
|
||
*/
|
||
class AggregateTypeLhs extends TypeLhs {
|
||
aggregatedType() {
|
||
return this._type;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Internal representation of a LHS that has both type and max([NUMBER])
|
||
* constraints. max(NUMBER != 1) support is not yet implemented.
|
||
*/
|
||
class TypeMaxLhs extends AggregateTypeLhs {
|
||
/**
|
||
* Return the max-scoring node (or nodes if there is a tie) of the given
|
||
* type.
|
||
*/
|
||
fnodes(ruleset) {
|
||
// TODO: Optimize better. Walk the dependency tree, and run only the
|
||
// rules that could possibly lead to a max result. As part of this,
|
||
// make RHSs expose their max potential scores.
|
||
const self = this;
|
||
// Work around V8 bug:
|
||
// https://stackoverflow.com/questions/32943776/using-super-within-an-
|
||
// arrow-function-within-an-arrow-function-within-a-method
|
||
const getSuperFnodes = () => super.fnodes(ruleset);
|
||
return setDefault(
|
||
ruleset.maxCache,
|
||
this._type,
|
||
function maxFnodesOfType() {
|
||
return maxes(getSuperFnodes(), fnode => ruleset.weightedScore(fnode.scoresSoFarFor(self._type)));
|
||
});
|
||
}
|
||
}
|
||
|
||
class BestClusterLhs extends AggregateTypeLhs {
|
||
constructor(type, options) {
|
||
super(type);
|
||
this._options = options || {splittingDistance: 3};
|
||
}
|
||
|
||
/**
|
||
* Group the nodes of my type into clusters, and return the cluster with
|
||
* the highest total score for that type.
|
||
*/
|
||
fnodes(ruleset) {
|
||
// Get the nodes of the type:
|
||
const fnodesOfType = Array.from(super.fnodes(ruleset));
|
||
if (fnodesOfType.length === 0) {
|
||
return [];
|
||
}
|
||
// Cluster them:
|
||
const clusts = clusters(
|
||
fnodesOfType,
|
||
this._options.splittingDistance,
|
||
(a, b) => distance(a, b, this._options));
|
||
// Tag each cluster with the total of its nodes' scores:
|
||
const clustsAndSums = clusts.map(
|
||
clust => [clust,
|
||
sum(clust.map(fnode => fnode.scoreFor(this._type)))]);
|
||
// Return the highest-scoring cluster:
|
||
return max(clustsAndSums, clustAndSum => clustAndSum[1])[0];
|
||
}
|
||
}
|
||
|
||
class AndLhs extends Lhs {
|
||
constructor(lhss) {
|
||
super();
|
||
|
||
// For the moment, we accept only type()s as args. TODO: Generalize to
|
||
// type().max() and such later.
|
||
this._args = lhss.map(sideToTypeLhs);
|
||
}
|
||
|
||
*fnodes(ruleset) {
|
||
// Take an arbitrary one for starters. Optimization: we could always
|
||
// choose the pickiest one to start with.
|
||
const fnodes = this._args[0].fnodes(ruleset);
|
||
// Then keep only the fnodes that have the type of every other arg:
|
||
fnodeLoop: for (let fnode of fnodes) {
|
||
for (let otherLhs of this._args.slice(1)) {
|
||
// Optimization: could use a .hasTypeSoFar() below
|
||
if (!fnode.hasType(otherLhs.guaranteedType())) {
|
||
// TODO: This is n^2. Why is there no set intersection in JS?!
|
||
continue fnodeLoop;
|
||
}
|
||
}
|
||
yield fnode;
|
||
}
|
||
}
|
||
|
||
possibleTypeCombinations() {
|
||
return [this.typesMentioned()];
|
||
}
|
||
|
||
typesMentioned() {
|
||
return new NiceSet(this._args.map(arg => arg.guaranteedType()));
|
||
}
|
||
}
|
||
|
||
function sideToTypeLhs(side) {
|
||
const lhs = side.asLhs();
|
||
if (!(lhs.constructor === TypeLhs)) {
|
||
throw new Error('and() and nearest() support only simple type() calls as arguments for now.');
|
||
// TODO: Though we could solve this with a compilation step: and(type(A), type(B).max()) is equivalent to type(B).max() -> type(Bmax); and(type(A), type(Bmax)).
|
||
// In fact, we should be able to compile most (any?) arbitrary and()s, including nested ands and and(type(...).max(), ...) constructions into several and(type(A), type(B), ...) rules.
|
||
}
|
||
return lhs;
|
||
}
|
||
|
||
class NearestLhs extends Lhs {
|
||
constructor([a, b, distance]) {
|
||
super();
|
||
this._a = sideToTypeLhs(a);
|
||
this._b = sideToTypeLhs(b);
|
||
this._distance = distance;
|
||
}
|
||
|
||
/**
|
||
* Return an iterable of {fnodes, transformer} pairs.
|
||
*/
|
||
*fnodes(ruleset) {
|
||
// Go through all the left arg's nodes. For each one, find the closest
|
||
// right-arg's node. O(a * b). Once a right-arg's node is used, we
|
||
// don't eliminate it from consideration, because then order of left-
|
||
// args' nodes would matter.
|
||
|
||
// TODO: Still not sure how to get the distance to factor into the
|
||
// score unless we hard-code nearest() to do that. It's a
|
||
// matter of not being able to bind on the RHS to the output of the
|
||
// distance function on the LHS. Perhaps we could at least make
|
||
// distance part of the note and read it in a props() callback.
|
||
|
||
// We're assuming here that simple type() calls return just plain
|
||
// fnodes, not {fnode, rhsTransformer} pairs:
|
||
const as_ = this._a.fnodes(ruleset);
|
||
const bs = Array.from(this._b.fnodes(ruleset));
|
||
if (bs.length > 0) {
|
||
// If bs is empty, there can be no nearest nodes, so don't emit any.
|
||
for (const a of as_) {
|
||
const nearest = min(bs, b => this._distance(a, b));
|
||
yield {fnode: a,
|
||
rhsTransformer: function setNoteIfEmpty(fact) {
|
||
// If note is explicitly set by the RHS, let it take
|
||
// precedence, even though that makes this entire LHS
|
||
// pointless.
|
||
if (fact.note === undefined) {
|
||
fact.note = nearest; // TODO: Wrap this in an object to make room to return distance later.
|
||
}
|
||
return fact;
|
||
}};
|
||
}
|
||
}
|
||
}
|
||
|
||
checkFact(fact) {
|
||
// Barf if the fact doesn't set a type at least. It should be a *new* type or at least one that doesn't result in cycles, but we can't deduce that.
|
||
}
|
||
|
||
possibleTypeCombinations() {
|
||
return [new NiceSet([this._a.guaranteedType()])];
|
||
}
|
||
|
||
typesMentioned() {
|
||
return new NiceSet([this._a.guaranteedType(),
|
||
this._b.guaranteedType()]);
|
||
}
|
||
|
||
guaranteedType() {
|
||
return this._a.guaranteedType();
|
||
}
|
||
}
|
||
|
||
// The right-hand side of a rule
|
||
|
||
|
||
const TYPE = 1;
|
||
const NOTE = 2;
|
||
const SCORE = 4;
|
||
const ELEMENT = 8;
|
||
const SUBFACTS = {
|
||
type: TYPE,
|
||
note: NOTE,
|
||
score: SCORE,
|
||
element: ELEMENT
|
||
};
|
||
|
||
/**
|
||
* Expose the output of this rule's LHS as a "final result" to the surrounding
|
||
* program. It will be available by calling :func:`~BoundRuleset.get` on the
|
||
* ruleset and passing the key. You can run each node through a callback
|
||
* function first by adding :func:`through()`, or you can run the entire set of
|
||
* nodes through a callback function by adding :func:`allThrough()`.
|
||
*/
|
||
function out(key) {
|
||
return new OutwardRhs(key);
|
||
}
|
||
|
||
class InwardRhs {
|
||
constructor(calls = [], max = Infinity, types) {
|
||
this._calls = calls.slice();
|
||
this._max = max; // max score
|
||
this._types = new NiceSet(types); // empty set if unconstrained
|
||
}
|
||
|
||
/**
|
||
* Declare that the maximum returned subscore is such and such,
|
||
* which helps the optimizer plan efficiently. This doesn't force it to be
|
||
* true; it merely throws an error at runtime if it isn't. To lift an
|
||
* ``atMost`` constraint, call ``atMost()`` (with no args). The reason
|
||
* ``atMost`` and ``typeIn`` apply until explicitly cleared is so that, if
|
||
* someone used them for safety reasons on a lexically distant rule you are
|
||
* extending, you won't stomp on their constraint and break their
|
||
* invariants accidentally.
|
||
*/
|
||
atMost(score) {
|
||
return new this.constructor(this._calls, score, this._types);
|
||
}
|
||
|
||
_checkAtMost(fact) {
|
||
if (fact.score !== undefined && fact.score > this._max) {
|
||
throw new Error(`Score of ${fact.score} exceeds the declared atMost(${this._max}).`);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Determine any of type, note, score, and element using a callback. This
|
||
* overrides any previous call to `props` and, depending on what
|
||
* properties of the callback's return value are filled out, may override
|
||
* the effects of other previous calls as well.
|
||
*
|
||
* The callback should return...
|
||
*
|
||
* * An optional :term:`subscore`
|
||
* * A type (required on ``dom(...)`` rules, defaulting to the input one on
|
||
* ``type(...)`` rules)
|
||
* * Optional notes
|
||
* * An element, defaulting to the input one. Overriding the default
|
||
* enables a callback to walk around the tree and say things about nodes
|
||
* other than the input one.
|
||
*/
|
||
props(callback) {
|
||
function getSubfacts(fnode) {
|
||
const subfacts = callback(fnode);
|
||
// Filter the raw result down to okayed properties so callbacks
|
||
// can't insert arbitrary keys (like conserveScore, which might
|
||
// mess up the optimizer).
|
||
for (let subfact in subfacts) {
|
||
if (!SUBFACTS.hasOwnProperty(subfact) || !(SUBFACTS[subfact] & getSubfacts.possibleSubfacts)) {
|
||
// The ES5.1 spec says in 12.6.4 that it's fine to delete
|
||
// as we iterate.
|
||
delete subfacts[subfact];
|
||
}
|
||
}
|
||
return subfacts;
|
||
}
|
||
// Thse are the subfacts this call could affect:
|
||
getSubfacts.possibleSubfacts = TYPE | NOTE | SCORE | ELEMENT;
|
||
getSubfacts.kind = 'props';
|
||
return new this.constructor(this._calls.concat(getSubfacts),
|
||
this._max,
|
||
this._types);
|
||
}
|
||
|
||
/**
|
||
* Set the type applied to fnodes processed by this RHS.
|
||
*/
|
||
type(theType) {
|
||
// In the future, we might also support providing a callback that receives
|
||
// the fnode and returns a type. We couldn't reason based on these, but the
|
||
// use would be rather a consise way to to override part of what a previous
|
||
// .props() call provides.
|
||
|
||
// Actually emit a given type.
|
||
function getSubfacts() {
|
||
return {type: theType};
|
||
}
|
||
getSubfacts.possibleSubfacts = TYPE;
|
||
getSubfacts.type = theType;
|
||
getSubfacts.kind = 'type';
|
||
return new this.constructor(this._calls.concat(getSubfacts),
|
||
this._max,
|
||
this._types);
|
||
}
|
||
|
||
/**
|
||
* Constrain this rule to emit 1 of a set of given types. Pass no args to lift
|
||
* a previous ``typeIn`` constraint, as you might do when basing a LHS on a
|
||
* common value to factor out repetition.
|
||
*
|
||
* ``typeIn`` is mostly a hint for the query planner when you're emitting types
|
||
* dynamically from ``props`` calls—in fact, an error will be raised if
|
||
* ``props`` is used without a ``typeIn`` or ``type`` to constrain it—but it
|
||
* also checks conformance at runtime to ensure validity.
|
||
*/
|
||
typeIn(...types) {
|
||
// Rationale: If we used the spelling "type('a', 'b', ...)" instead of
|
||
// this, one might expect type('a', 'b').type(fn) to have the latter
|
||
// call override, while expecting type(fn).type('a', 'b') to keep both
|
||
// in effect. Then different calls to type() don't consistently
|
||
// override each other, and the rules get complicated. Plus you can't
|
||
// inherit a type constraint and then sub in another type-returning
|
||
// function that still gets the constraint applied.
|
||
return new this.constructor(this._calls,
|
||
this._max,
|
||
types);
|
||
}
|
||
|
||
/**
|
||
* Check a fact for conformance with any typeIn() call.
|
||
*
|
||
* @arg leftType the type of the LHS, which becomes my emitted type if the
|
||
* fact doesn't specify one
|
||
*/
|
||
_checkTypeIn(result, leftType) {
|
||
if (this._types.size > 0) {
|
||
if (result.type === undefined) {
|
||
if (!this._types.has(leftType)) {
|
||
throw new Error(`A right-hand side claimed, via typeIn(...) to emit one of the types ${this._types} but actually inherited ${leftType} from the left-hand side.`);
|
||
}
|
||
} else if (!this._types.has(result.type)) {
|
||
throw new Error(`A right-hand side claimed, via typeIn(...) to emit one of the types ${this._types} but actually emitted ${result.type}.`);
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Whatever the callback returns (even ``undefined``) becomes the note of
|
||
* the fact. This overrides any previous call to ``note``.
|
||
*/
|
||
note(callback) {
|
||
function getSubfacts(fnode) {
|
||
return {note: callback(fnode)};
|
||
}
|
||
getSubfacts.possibleSubfacts = NOTE;
|
||
getSubfacts.kind = 'note';
|
||
return new this.constructor(this._calls.concat(getSubfacts),
|
||
this._max,
|
||
this._types);
|
||
}
|
||
|
||
/**
|
||
* Affect the confidence with which the input node should be considered a
|
||
* member of a type.
|
||
*
|
||
* The parameter is generally between 0 and 1 (inclusive), with 0 meaning
|
||
* the node does not have the "smell" this rule checks for and 1 meaning it
|
||
* does. The range between 0 and 1 is available to represent "fuzzy"
|
||
* confidences. If you have an unbounded range to compress down to [0, 1],
|
||
* consider using :func:`sigmoid` or a scaling thereof.
|
||
*
|
||
* Since every node can have multiple, independent scores (one for each
|
||
* type), this applies to the type explicitly set by the RHS or, if none,
|
||
* to the type named by the ``type`` call on the LHS. If the LHS has none
|
||
* because it's a ``dom(...)`` LHS, an error is raised.
|
||
*
|
||
* @arg {number|function} scoreOrCallback Can either be a static number,
|
||
* generally 0 to 1 inclusive, or else a callback which takes the fnode
|
||
* and returns such a number. If the callback returns a boolean, it is
|
||
* cast to a number.
|
||
*/
|
||
score(scoreOrCallback) {
|
||
let getSubfacts;
|
||
|
||
function getSubfactsFromNumber(fnode) {
|
||
return {score: scoreOrCallback};
|
||
}
|
||
|
||
function getSubfactsFromFunction(fnode) {
|
||
let result = scoreOrCallback(fnode);
|
||
if (typeof result === 'boolean') {
|
||
// Case bools to numbers for convenience. Boolean features are
|
||
// common. Don't cast other things, as it frustrates ruleset
|
||
// debugging.
|
||
result = Number(result);
|
||
}
|
||
return {score: result};
|
||
}
|
||
|
||
if (typeof scoreOrCallback === 'number') {
|
||
getSubfacts = getSubfactsFromNumber;
|
||
} else {
|
||
getSubfacts = getSubfactsFromFunction;
|
||
}
|
||
getSubfacts.possibleSubfacts = SCORE;
|
||
getSubfacts.kind = 'score';
|
||
|
||
return new this.constructor(this._calls.concat(getSubfacts),
|
||
this._max,
|
||
this._types);
|
||
}
|
||
|
||
// Future: why not have an .element() method for completeness?
|
||
|
||
// -------- Methods below this point are private to the framework. --------
|
||
|
||
/**
|
||
* Run all my props().type().note().score() stuff across a given fnode,
|
||
* enforce my max() stuff, and return a fact ({element, type, score,
|
||
* notes}) for incorporation into that fnode (or a different one, if
|
||
* element is specified). Any of the 4 fact properties can be missing;
|
||
* filling in defaults is a job for the caller.
|
||
*
|
||
* @arg leftType The type the LHS takes in
|
||
*/
|
||
fact(fnode, leftType) {
|
||
const doneKinds = new Set();
|
||
const result = {};
|
||
let haveSubfacts = 0;
|
||
for (let call of reversed(this._calls)) {
|
||
// If we've already called a call of this kind, then forget it.
|
||
if (!doneKinds.has(call.kind)) {
|
||
doneKinds.add(call.kind);
|
||
|
||
if (~haveSubfacts & call.possibleSubfacts) {
|
||
// This call might provide a subfact we are missing.
|
||
const newSubfacts = call(fnode);
|
||
|
||
// We start with an empty object, so we're okay here.
|
||
// eslint-disable-next-line guard-for-in
|
||
for (let subfact in newSubfacts) {
|
||
// A props() callback could insert arbitrary keys into
|
||
// the result, but it shouldn't matter, because nothing
|
||
// pays any attention to them.
|
||
if (!result.hasOwnProperty(subfact)) {
|
||
result[subfact] = newSubfacts[subfact];
|
||
}
|
||
haveSubfacts |= SUBFACTS[subfact];
|
||
}
|
||
}
|
||
}
|
||
}
|
||
this._checkAtMost(result);
|
||
this._checkTypeIn(result, leftType);
|
||
return result;
|
||
}
|
||
|
||
/**
|
||
* Return a record describing the types I might emit (which means either to
|
||
* add a type to a fnode or to output a fnode that already has that type).
|
||
* {couldChangeType: whether I might add a type to the fnode,
|
||
* possibleTypes: If couldChangeType, the types I might emit; empty set if
|
||
* we cannot infer it. If not couldChangeType, undefined.}
|
||
*/
|
||
possibleEmissions() {
|
||
// If there is a typeIn() constraint or there is a type() call to the
|
||
// right of all props() calls, we have a constraint. We hunt for the
|
||
// tightest constraint we can find, favoring a type() call because it
|
||
// gives us a single type but then falling back to a typeIn().
|
||
let couldChangeType = false;
|
||
for (let call of reversed(this._calls)) {
|
||
if (call.kind === 'props') {
|
||
couldChangeType = true;
|
||
break;
|
||
} else if (call.kind === 'type') {
|
||
return {couldChangeType: true,
|
||
possibleTypes: new Set([call.type])};
|
||
}
|
||
}
|
||
return {couldChangeType,
|
||
possibleTypes: this._types};
|
||
}
|
||
}
|
||
|
||
class OutwardRhs {
|
||
constructor(key, through = x => x, allThrough = x => x) {
|
||
this.key = key;
|
||
this.callback = through;
|
||
this.allCallback = allThrough;
|
||
}
|
||
|
||
/**
|
||
* Append ``.through`` to :func:`out` to run each :term:`fnode` emitted
|
||
* from the LHS through an arbitrary function before returning it to the
|
||
* containing program. Example::
|
||
*
|
||
* out('titleLengths').through(fnode => fnode.noteFor('title').length)
|
||
*/
|
||
through(callback) {
|
||
return new this.constructor(this.key, callback, this.allCallback);
|
||
}
|
||
|
||
/**
|
||
* Append ``.allThrough`` to :func:`out` to run the entire iterable of
|
||
* emitted :term:`fnodes<fnode>` through an arbitrary function before
|
||
* returning them to the containing program. Example::
|
||
*
|
||
* out('sortedTitles').allThrough(domSort)
|
||
*/
|
||
allThrough(callback) {
|
||
return new this.constructor(this.key, this.callback, callback);
|
||
}
|
||
|
||
asRhs() {
|
||
return this;
|
||
}
|
||
}
|
||
|
||
function props(callback) {
|
||
return new Side({method: 'props', args: [callback]});
|
||
}
|
||
|
||
/** Constrain to an input type on the LHS, or apply a type on the RHS. */
|
||
function type(theType) {
|
||
return new Side({method: 'type', args: [theType]});
|
||
}
|
||
|
||
function note(callback) {
|
||
return new Side({method: 'note', args: [callback]});
|
||
}
|
||
|
||
function score(scoreOrCallback) {
|
||
return new Side({method: 'score', args: [scoreOrCallback]});
|
||
}
|
||
|
||
function atMost(score) {
|
||
return new Side({method: 'atMost', args: [score]});
|
||
}
|
||
|
||
function typeIn(...types) {
|
||
return new Side({method: 'typeIn', args: types});
|
||
}
|
||
|
||
/**
|
||
* Pull nodes that conform to multiple conditions at once.
|
||
*
|
||
* For example: ``and(type('title'), type('english'))``
|
||
*
|
||
* Caveats: ``and`` supports only simple ``type`` calls as arguments for now,
|
||
* and it may fire off more rules as prerequisites than strictly necessary.
|
||
* ``not`` and ``or`` don't exist yet, but you can express ``or`` the long way
|
||
* around by having 2 rules with identical RHSs.
|
||
*/
|
||
function and(...lhss) {
|
||
return new Side({method: 'and', args: lhss});
|
||
}
|
||
|
||
/**
|
||
* Experimental. For each :term:`fnode` from ``typeCallA``, find the closest
|
||
* node from ``typeCallB``, and attach it as a note. The note is attached to
|
||
* the type specified by the RHS, defaulting to the type of ``typeCallA``. If
|
||
* no nodes are emitted from ``typeCallB``, do nothing.
|
||
*
|
||
* For example... ::
|
||
*
|
||
* nearest(type('image'), type('price'))
|
||
*
|
||
* The score of the ``typeCallA`` can be added to the new type's score by using
|
||
* :func:`conserveScore` (though this routine has since been removed)::
|
||
*
|
||
* rule(nearest(type('image'), type('price')),
|
||
* type('imageWithPrice').score(2).conserveScore())
|
||
*
|
||
* Caveats: ``nearest`` supports only simple ``type`` calls as arguments ``a``
|
||
* and ``b`` for now.
|
||
*
|
||
* @arg distance {function} A function that takes 2 fnodes and returns a
|
||
* numerical distance between them. Included options are :func:`distance`,
|
||
* which is a weighted topological distance, and :func:`euclidean`, which
|
||
* is a spatial distance.
|
||
*/
|
||
function nearest(typeCallA, typeCallB, distance = euclidean) {
|
||
return new Side({method: 'nearest', args: [typeCallA, typeCallB, distance]});
|
||
}
|
||
|
||
/**
|
||
* A chain of calls that can be compiled into a Rhs or Lhs, depending on its
|
||
* position in a Rule. This lets us use type() as a leading call for both RHSs
|
||
* and LHSs. I would prefer to do this dynamically, but that wouldn't compile
|
||
* down to old versions of ES.
|
||
*/
|
||
class Side {
|
||
constructor(...calls) {
|
||
// A "call" is like {method: 'dom', args: ['p.smoo']}.
|
||
this._calls = calls;
|
||
}
|
||
|
||
max() {
|
||
return this._and('max');
|
||
}
|
||
|
||
bestCluster(options) {
|
||
return this._and('bestCluster', options);
|
||
}
|
||
|
||
props(callback) {
|
||
return this._and('props', callback);
|
||
}
|
||
|
||
type(...types) {
|
||
return this._and('type', ...types);
|
||
}
|
||
|
||
note(callback) {
|
||
return this._and('note', callback);
|
||
}
|
||
|
||
score(scoreOrCallback) {
|
||
return this._and('score', scoreOrCallback);
|
||
}
|
||
|
||
atMost(score) {
|
||
return this._and('atMost', score);
|
||
}
|
||
|
||
typeIn(...types) {
|
||
return this._and('typeIn', ...types);
|
||
}
|
||
|
||
and(...lhss) {
|
||
return this._and('and', lhss);
|
||
}
|
||
|
||
_and(method, ...args) {
|
||
return new this.constructor(...this._calls.concat({method, args}));
|
||
}
|
||
|
||
asLhs() {
|
||
return this._asSide(Lhs.fromFirstCall(this._calls[0]), this._calls.slice(1));
|
||
}
|
||
|
||
asRhs() {
|
||
return this._asSide(new InwardRhs(), this._calls);
|
||
}
|
||
|
||
_asSide(side, calls) {
|
||
for (let call of calls) {
|
||
side = side[call.method](...call.args);
|
||
}
|
||
return side;
|
||
}
|
||
|
||
when(pred) {
|
||
return this._and('when', pred);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A wrapper around a DOM node, storing :term:`types<type>`,
|
||
* :term:`scores<score>`, and :term:`notes<note>` that apply to it
|
||
*/
|
||
class Fnode {
|
||
/**
|
||
* @arg element The DOM element described by the fnode.
|
||
* @arg ruleset The ruleset which created the fnode.
|
||
*/
|
||
constructor(element, ruleset) {
|
||
if (element === undefined) {
|
||
throw new Error("Someone tried to make a fnode without specifying the element they're talking about.");
|
||
}
|
||
/**
|
||
* The raw DOM element this fnode describes
|
||
*/
|
||
this.element = element;
|
||
this._ruleset = ruleset;
|
||
|
||
// A map of type => {score: number, note: anything}. `score` is always
|
||
// present and defaults to 1. A note is set iff `note` is present and
|
||
// not undefined.
|
||
this._types = new Map();
|
||
|
||
// Note: conserveScore() is temporarily absent in 3.0.
|
||
//
|
||
// By default, a fnode has an independent score for each of its types.
|
||
// However, a RHS can opt to conserve the score of an upstream type,
|
||
// carrying it forward into another type. To avoid runaway scores in
|
||
// the case that multiple rules choose to do this, we limit the
|
||
// contribution of an upstream type's score to being multiplied in a
|
||
// single time. In this set, we keep track of which upstream types'
|
||
// scores have already been multiplied into each type. LHS fnode => Set
|
||
// of types whose score for that node have been multiplied into this
|
||
// node's score.
|
||
this._conservedScores = new Map();
|
||
}
|
||
|
||
/**
|
||
* Return whether the given type is one of the ones attached to the wrapped
|
||
* HTML node.
|
||
*/
|
||
hasType(type) {
|
||
// Run type(theType) against the ruleset to make sure this doesn't
|
||
// return false just because we haven't lazily run certain rules yet.
|
||
this._computeType(type);
|
||
return this._types.has(type);
|
||
}
|
||
|
||
/**
|
||
* Return the confidence, in the range (0, 1), that the fnode belongs to the
|
||
* given type, 0 by default.
|
||
*/
|
||
scoreFor(type) {
|
||
this._computeType(type);
|
||
return sigmoid(this._ruleset.weightedScore(this.scoresSoFarFor(type)) +
|
||
getDefault(this._ruleset.biases, type, () => 0));
|
||
}
|
||
|
||
/**
|
||
* Return the fnode's note for the given type, ``undefined`` if none.
|
||
*/
|
||
noteFor(type) {
|
||
this._computeType(type);
|
||
return this._noteSoFarFor(type);
|
||
}
|
||
|
||
/**
|
||
* Return whether this fnode has a note for the given type.
|
||
*
|
||
* ``undefined`` is not considered a note and may be overwritten with
|
||
* impunity.
|
||
*/
|
||
hasNoteFor(type) {
|
||
this._computeType(type);
|
||
return this._hasNoteSoFarFor(type);
|
||
}
|
||
|
||
// -------- Methods below this point are private to the framework. --------
|
||
|
||
/**
|
||
* Return an iterable of the types tagged onto me by rules that have
|
||
* already executed.
|
||
*/
|
||
typesSoFar() {
|
||
return this._types.keys();
|
||
}
|
||
|
||
_noteSoFarFor(type) {
|
||
return this._typeRecordForGetting(type).note;
|
||
}
|
||
|
||
_hasNoteSoFarFor(type) {
|
||
return this._noteSoFarFor(type) !== undefined;
|
||
}
|
||
|
||
/**
|
||
* Return the score thus far computed on me for a certain type. Doesn't
|
||
* implicitly run any rules. If no score has yet been determined for the
|
||
* given type, return undefined.
|
||
*/
|
||
scoresSoFarFor(type) {
|
||
return this._typeRecordForGetting(type).score;
|
||
}
|
||
|
||
/**
|
||
* Add a given number to one of our per-type scores. Implicitly assign us
|
||
* the given type. Keep track of which rule it resulted from so we can
|
||
* later mess with the coeffs.
|
||
*/
|
||
addScoreFor(type, score, ruleName) {
|
||
this._typeRecordForSetting(type).score.set(ruleName, score);
|
||
}
|
||
|
||
/**
|
||
* Set the note attached to one of our types. Implicitly assign us that
|
||
* type if we don't have it already.
|
||
*/
|
||
setNoteFor(type, note) {
|
||
if (this._hasNoteSoFarFor(type)) {
|
||
if (note !== undefined) {
|
||
throw new Error(`Someone (likely the right-hand side of a rule) tried to add a note of type ${type} to an element, but one of that type already exists. Overwriting notes is not allowed, since it would make the order of rules matter.`);
|
||
}
|
||
// else the incoming note is undefined and we already have the
|
||
// type, so it's a no-op
|
||
} else {
|
||
// Apply either a type and note or just a type (which means a note
|
||
// that is undefined):
|
||
this._typeRecordForSetting(type).note = note;
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return a score/note record for a type, creating it if it doesn't exist.
|
||
*/
|
||
_typeRecordForSetting(type) {
|
||
return setDefault(this._types, type, () => ({score: new Map()}));
|
||
}
|
||
|
||
/**
|
||
* Manifest a temporary type record for reading, working around the lack of
|
||
* a .? operator in JS.
|
||
*/
|
||
_typeRecordForGetting(type) {
|
||
return getDefault(this._types, type, () => ({score: new Map()}));
|
||
}
|
||
|
||
/**
|
||
* Make sure any scores, notes, and type-tagging for the given type are
|
||
* computed for my element.
|
||
*/
|
||
_computeType(theType) {
|
||
if (!this._types.has(theType)) { // Prevent infinite recursion when an A->A rule looks at A's note in a callback.
|
||
this._ruleset.get(type(theType));
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Construct and return the proper type of rule class based on the
|
||
* inwardness/outwardness of the RHS.
|
||
*
|
||
* @arg lhs {Lhs} The left-hand side of the rule
|
||
* @arg rhs {Rhs} The right-hand side of the rule
|
||
* @arg options {object} Other, optional information about the rule.
|
||
* Currently, the only recognized option is ``name``, which points to a
|
||
* string that uniquely identifies this rule in a ruleset. The name
|
||
* correlates this rule with one of the coefficients passed into
|
||
* :func:`ruleset`. If no name is given, an identifier is assigned based on
|
||
* the index of this rule in the ruleset, but that is, of course, brittle.
|
||
*/
|
||
function rule(lhs, rhs, options) {
|
||
// Since out() is a valid call only on the RHS (unlike type()), we can take
|
||
// a shortcut here: any outward RHS will already be an OutwardRhs; we don't
|
||
// need to sidetrack it through being a Side. And OutwardRhs has an asRhs()
|
||
// that just returns itself.
|
||
if (typeof rhs === 'string') {
|
||
rhs = out(rhs);
|
||
}
|
||
return new ((rhs instanceof OutwardRhs) ? OutwardRule : InwardRule)(lhs, rhs, options);
|
||
}
|
||
|
||
let nextRuleNumber = 0;
|
||
function newInternalRuleName() {
|
||
return '_' + nextRuleNumber++;
|
||
}
|
||
|
||
/**
|
||
* We place the in/out distinction in Rules because it determines whether the
|
||
* RHS result is cached, and Rules are responsible for maintaining the rulewise
|
||
* cache ruleset.ruleCache.
|
||
*/
|
||
class Rule { // abstract
|
||
constructor(lhs, rhs, options) {
|
||
this.lhs = lhs.asLhs();
|
||
this.rhs = rhs.asRhs();
|
||
// TODO: Make auto-generated rule names be based on the out types of
|
||
// the rules, e.g. _priceish_4. That way, adding rules for one type
|
||
// won't make the coeffs misalign for another.
|
||
this.name = (options ? options.name : undefined) || newInternalRuleName();
|
||
}
|
||
|
||
/**
|
||
* Return a NiceSet of the rules that this one shallowly depends on in the
|
||
* given ruleset. In a BoundRuleset, this may include rules that have
|
||
* already been executed.
|
||
*
|
||
* Depend on emitters of any LHS type this rule finalizes. (See
|
||
* _typesFinalized for a definition.) Depend on adders of any other LHS
|
||
* types (because, after all, we need to know what nodes have that type in
|
||
* order to find the set of LHS nodes). This works for simple rules and
|
||
* complex ones like and().
|
||
*
|
||
* Specific examples (where A is a type):
|
||
* * A.max->* depends on anything emitting A.
|
||
* * Even A.max->A depends on A emitters, because we have to have all the
|
||
* scores factored in first. For example, what if we did
|
||
* max(A)->score(.5)?
|
||
* * A->A depends on anything adding A.
|
||
* * A->(something other than A) depends on anything emitting A. (For
|
||
* example, we need the A score finalized before we could transfer it to
|
||
* B using conserveScore().)
|
||
* * A->out() also depends on anything emitting A. Fnode methods aren't
|
||
* smart enough to lazily run emitter rules as needed. We could make them
|
||
* so if it was shown to be an advantage.
|
||
*/
|
||
prerequisites(ruleset) {
|
||
// Optimization: we could cache the result of this when in a compiled (immutable) ruleset.
|
||
|
||
// Extend prereqs with rules derived from each of the given types. If
|
||
// no rules are found, raise an exception, as that indicates a
|
||
// malformed ruleset.
|
||
function extendOrThrow(prereqs, types, ruleGetter, verb) {
|
||
for (let type of types) {
|
||
const rules = ruleGetter(type);
|
||
if (rules.length > 0) {
|
||
prereqs.extend(rules);
|
||
} else {
|
||
throw new Error(`No rule ${verb} the "${type}" type, but another rule needs it as input.`);
|
||
}
|
||
}
|
||
}
|
||
|
||
const prereqs = new NiceSet();
|
||
|
||
// Add finalized types:
|
||
extendOrThrow(prereqs, this._typesFinalized(), type => ruleset.inwardRulesThatCouldEmit(type), 'emits');
|
||
|
||
// Add mentioned types:
|
||
// We could say this.lhs.typesMentioned().minus(typesFinalized) as an
|
||
// optimization. But since types mentioned are a superset of types
|
||
// finalized and rules adding are a subset of rules emitting, we get
|
||
// the same result without.
|
||
extendOrThrow(prereqs, this.lhs.typesMentioned(), type => ruleset.inwardRulesThatCouldAdd(type), 'adds');
|
||
|
||
return prereqs;
|
||
}
|
||
|
||
/**
|
||
* Return the types that this rule finalizes.
|
||
*
|
||
* To "finalize" a type means to make sure we're finished running all
|
||
* possible rules that might change a node's score or notes w.r.t. a given
|
||
* type. This is generally done because we're about to use those data for
|
||
* something, like computing a new type's score or or an aggregate
|
||
* function. Exhaustively, we're about to...
|
||
* * change the type of the nodes or
|
||
* * aggregate all nodes of a type
|
||
*
|
||
* This base-class implementation just returns what aggregate functions
|
||
* need, since that need spans inward and outward rules.
|
||
*
|
||
* @return Set of types
|
||
*/
|
||
_typesFinalized() {
|
||
// Get the types that are fed to aggregate functions. Aggregate
|
||
// functions are more demanding than a simple type() LHS. A type() LHS
|
||
// itself does not finalize its nodes because the things it could do to
|
||
// them without changing their type (adding notes, adding to score)
|
||
// are immutable or commutative (respectively). Thus, we require a RHS
|
||
// type change in order to require finalization of a simple type()
|
||
// mention. A max(B), OTOH, is not commutative with other B->B rules
|
||
// (imagine type(B).max()->score(.5)), so it must depend on B emitters
|
||
// and thus finalize B. (This will have to be relaxed or rethought when
|
||
// we do the max()/atMost() optimization. Perhaps we can delegate to
|
||
// aggregate functions up in Rule.prerequisites() to ask what their
|
||
// prereqs are. If they implement such an optimization, they can reply.
|
||
// Otherwise, we can assume they are all the nodes of their type.)
|
||
//
|
||
// TODO: Could arbitrary predicates (once we implement those) matter
|
||
// too? Maybe it's not just aggregations.
|
||
const type = this.lhs.aggregatedType();
|
||
return (type === undefined) ? new NiceSet() : new NiceSet([type]);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A normal rule, whose results head back into the Fathom knowledgebase, to be
|
||
* operated on by further rules.
|
||
*/
|
||
class InwardRule extends Rule {
|
||
// TODO: On construct, complain about useless rules, like a dom() rule that
|
||
// doesn't assign a type.
|
||
|
||
/**
|
||
* Return an iterable of the fnodes emitted by the RHS of this rule.
|
||
* Side effect: update ruleset's store of fnodes, its accounting of which
|
||
* rules are done executing, and its cache of results per type.
|
||
*/
|
||
results(ruleset) {
|
||
if (ruleset.doneRules.has(this)) { // shouldn't happen
|
||
throw new Error('A bug in Fathom caused results() to be called on an inward rule twice. That could cause redundant score contributions, etc.');
|
||
}
|
||
const self = this;
|
||
// For now, we consider most of what a LHS computes to be cheap, aside
|
||
// from type() and type().max(), which are cached by their specialized
|
||
// LHS subclasses.
|
||
const leftResults = this.lhs.fnodes(ruleset);
|
||
// Avoid returning a single fnode more than once. LHSs uniquify
|
||
// themselves, but the RHS can change the element it's talking
|
||
// about and thus end up with dupes.
|
||
const returnedFnodes = new Set();
|
||
|
||
// Merge facts into fnodes:
|
||
forEach(
|
||
// leftResult can be either a fnode or a {fnode, rhsTransformer} pair.
|
||
function updateFnode(leftResult) {
|
||
const leftType = self.lhs.guaranteedType();
|
||
// Get a fnode and a RHS transformer, whether a plain fnode is
|
||
// returned or a {fnode, rhsTransformer} pair:
|
||
const {fnode: leftFnode = leftResult, rhsTransformer = identity} = leftResult;
|
||
// Grab the fact from the RHS, and run the LHS's optional
|
||
// transformer over it to pick up anything special it wants to
|
||
// do:
|
||
const fact = rhsTransformer(self.rhs.fact(leftFnode, leftType));
|
||
self.lhs.checkFact(fact);
|
||
const rightFnode = ruleset.fnodeForElement(fact.element || leftFnode.element);
|
||
// If the RHS doesn't specify a type, default to the
|
||
// type of the LHS, if any:
|
||
const rightType = fact.type || self.lhs.guaranteedType();
|
||
if (fact.score !== undefined) {
|
||
if (rightType !== undefined) {
|
||
rightFnode.addScoreFor(rightType, fact.score, self.name);
|
||
} else {
|
||
throw new Error(`The right-hand side of a rule specified a score (${fact.score}) with neither an explicit type nor one we could infer from the left-hand side.`);
|
||
}
|
||
}
|
||
if (fact.type !== undefined || fact.note !== undefined) {
|
||
// There's a reason to call setNoteFor.
|
||
if (rightType === undefined) {
|
||
throw new Error(`The right-hand side of a rule specified a note (${fact.note}) with neither an explicit type nor one we could infer from the left-hand side. Notes are per-type, per-node, so that's a problem.`);
|
||
} else {
|
||
rightFnode.setNoteFor(rightType, fact.note);
|
||
}
|
||
}
|
||
returnedFnodes.add(rightFnode);
|
||
},
|
||
leftResults);
|
||
|
||
// Update ruleset lookup tables.
|
||
// First, mark this rule as done:
|
||
ruleset.doneRules.add(this);
|
||
// Then, stick each fnode in typeCache under all applicable types.
|
||
// Optimization: we really only need to loop over the types
|
||
// this rule can possibly add.
|
||
for (let fnode of returnedFnodes) {
|
||
for (let type of fnode.typesSoFar()) {
|
||
setDefault(ruleset.typeCache, type, () => new Set()).add(fnode);
|
||
}
|
||
}
|
||
return returnedFnodes.values();
|
||
}
|
||
|
||
/**
|
||
* Return a Set of the types that could be emitted back into the system.
|
||
* To emit a type means to either to add it to a fnode emitted from the RHS
|
||
* or to leave it on such a fnode where it already exists.
|
||
*/
|
||
typesItCouldEmit() {
|
||
const rhs = this.rhs.possibleEmissions();
|
||
if (!rhs.couldChangeType && this.lhs.guaranteedType() !== undefined) {
|
||
// It's a b -> b rule.
|
||
return new Set([this.lhs.guaranteedType()]);
|
||
} else if (rhs.possibleTypes.size > 0) {
|
||
// We can prove the type emission from the RHS alone.
|
||
return rhs.possibleTypes;
|
||
} else {
|
||
throw new Error('Could not determine the emitted type of a rule because its right-hand side calls props() without calling typeIn().');
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return a Set of types I could add to fnodes I output (where the fnodes
|
||
* did not already have them).
|
||
*/
|
||
typesItCouldAdd() {
|
||
const ret = new Set(this.typesItCouldEmit());
|
||
ret.delete(this.lhs.guaranteedType());
|
||
return ret;
|
||
}
|
||
|
||
/**
|
||
* Add the types we could change to the superclass's result.
|
||
*/
|
||
_typesFinalized() {
|
||
const self = this;
|
||
function typesThatCouldChange() {
|
||
const ret = new NiceSet();
|
||
|
||
// Get types that could change:
|
||
const emissions = self.rhs.possibleEmissions();
|
||
if (emissions.couldChangeType) {
|
||
// Get the possible guaranteed combinations of types on the LHS
|
||
// (taking just this LHS into account). For each combo, if the RHS
|
||
// adds a type that's not in the combo, the types in the combo get
|
||
// unioned into ret.
|
||
for (let combo of self.lhs.possibleTypeCombinations()) {
|
||
for (let rhsType of emissions.possibleTypes) {
|
||
if (!combo.has(rhsType)) {
|
||
ret.extend(combo);
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
// Optimization: the possible combos could be later expanded to be
|
||
// informed by earlier rules which add the types mentioned in the LHS.
|
||
// If the only way for something to get B is to have Q first, then we
|
||
// can add Q to each combo and end up with fewer types finalized. Would
|
||
// this imply the existence of a Q->B->Q cycle and thus be impossible?
|
||
// Think about it. If we do this, we can centralize that logic here,
|
||
// rather than repeating it in all the Lhs subclasses).
|
||
return ret;
|
||
}
|
||
|
||
return typesThatCouldChange().extend(super._typesFinalized());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A rule whose RHS is an out(). This represents a final goal of a ruleset.
|
||
* Its results go out into the world, not inward back into the Fathom
|
||
* knowledgebase.
|
||
*/
|
||
class OutwardRule extends Rule {
|
||
/**
|
||
* Compute the whole thing, including any .through() and .allThrough().
|
||
* Do not mark me done in ruleset.doneRules; out rules are never marked as
|
||
* done so they can be requested many times without having to cache their
|
||
* (potentially big, since they aren't necessarily fnodes?) results. (We
|
||
* can add caching later if it proves beneficial.)
|
||
*/
|
||
results(ruleset) {
|
||
/**
|
||
* From a LHS's ``{fnode, rhsTransform}`` object or plain fnode, pick off just
|
||
* the fnode and return it.
|
||
*/
|
||
function justFnode(fnodeOrStruct) {
|
||
return (fnodeOrStruct instanceof Fnode) ? fnodeOrStruct : fnodeOrStruct.fnode;
|
||
}
|
||
|
||
return this.rhs.allCallback(map(this.rhs.callback, map(justFnode, this.lhs.fnodes(ruleset))));
|
||
}
|
||
|
||
/**
|
||
* @return the key under which the output of this rule will be available
|
||
*/
|
||
key() {
|
||
return this.rhs.key;
|
||
}
|
||
|
||
/**
|
||
* OutwardRules finalize all types mentioned.
|
||
*/
|
||
_typesFinalized() {
|
||
return this.lhs.typesMentioned().extend(super._typesFinalized());
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A shortcut for creating a new :class:`Ruleset`, for symmetry with
|
||
* :func:`rule`
|
||
*/
|
||
function ruleset(rules, coeffs = [], biases = []) {
|
||
return new Ruleset(rules, coeffs, biases);
|
||
}
|
||
|
||
/**
|
||
* An unbound ruleset. When you bind it by calling :func:`~Ruleset.against()`,
|
||
* the resulting :class:`BoundRuleset` will be immutable.
|
||
*/
|
||
class Ruleset {
|
||
/**
|
||
* @arg rules {Array} Rules returned from :func:`rule`
|
||
* @arg coeffs {Map} A map of rule names to numerical weights, typically
|
||
* returned by the :doc:`trainer<training>`. Example:
|
||
* ``[['someRuleName', 5.04], ...]``. If not given, coefficients
|
||
* default to 1.
|
||
* @arg biases {object} A map of type names to neural-net biases. These
|
||
* enable accurate confidence estimates. Example: ``[['someType',
|
||
* -2.08], ...]``. If absent, biases default to 0.
|
||
*/
|
||
constructor(rules, coeffs = [], biases = []) {
|
||
this._inRules = [];
|
||
this._outRules = new Map(); // key -> rule
|
||
this._rulesThatCouldEmit = new Map(); // type -> [rules]
|
||
this._rulesThatCouldAdd = new Map(); // type -> [rules]
|
||
// Private to the framework:
|
||
this._coeffs = new Map(coeffs); // rule name => coefficient
|
||
this.biases = new Map(biases); // type name => bias
|
||
|
||
// Separate rules into out ones and in ones, and sock them away. We do
|
||
// this here so mistakes raise errors early.
|
||
for (let rule of rules) {
|
||
if (rule instanceof InwardRule) {
|
||
this._inRules.push(rule);
|
||
|
||
// Keep track of what inward rules can emit or add:
|
||
// TODO: Combine these hashes for space efficiency:
|
||
const emittedTypes = rule.typesItCouldEmit();
|
||
for (let type of emittedTypes) {
|
||
setDefault(this._rulesThatCouldEmit, type, () => []).push(rule);
|
||
}
|
||
for (let type of rule.typesItCouldAdd()) {
|
||
setDefault(this._rulesThatCouldAdd, type, () => []).push(rule);
|
||
}
|
||
} else if (rule instanceof OutwardRule) {
|
||
this._outRules.set(rule.key(), rule);
|
||
} else {
|
||
throw new Error(`This element of ruleset()'s first param wasn't a rule: ${rule}`);
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Commit this ruleset to running against a specific DOM tree or subtree.
|
||
*
|
||
* When run against a subtree, the root of the subtree is not considered as
|
||
* a possible match.
|
||
*
|
||
* This doesn't actually modify the Ruleset but rather returns a fresh
|
||
* :class:`BoundRuleset`, which contains caches and other stateful, per-DOM
|
||
* bric-a-brac.
|
||
*/
|
||
against(doc) {
|
||
return new BoundRuleset(doc,
|
||
this._inRules,
|
||
this._outRules,
|
||
this._rulesThatCouldEmit,
|
||
this._rulesThatCouldAdd,
|
||
this._coeffs,
|
||
this.biases);
|
||
}
|
||
|
||
/**
|
||
* Return all the rules (both inward and outward) that make up this ruleset.
|
||
*
|
||
* From this, you can construct another ruleset like this one but with your
|
||
* own rules added.
|
||
*/
|
||
rules() {
|
||
return Array.from([...this._inRules, ...this._outRules.values()]);
|
||
}
|
||
}
|
||
|
||
/**
|
||
* A ruleset that is earmarked to analyze a certain DOM
|
||
*
|
||
* Carries a cache of rule results on that DOM. Typically comes from
|
||
* :meth:`~Ruleset.against`.
|
||
*/
|
||
class BoundRuleset {
|
||
/**
|
||
* @arg inRules {Array} Non-out() rules
|
||
* @arg outRules {Map} Output key -> out() rule
|
||
*/
|
||
constructor(doc, inRules, outRules, rulesThatCouldEmit, rulesThatCouldAdd, coeffs, biases) {
|
||
this.doc = doc;
|
||
this._inRules = inRules;
|
||
this._outRules = outRules;
|
||
this._rulesThatCouldEmit = rulesThatCouldEmit;
|
||
this._rulesThatCouldAdd = rulesThatCouldAdd;
|
||
this._coeffs = coeffs;
|
||
|
||
// Private, for the use of only helper classes:
|
||
this.biases = biases;
|
||
this._clearCaches();
|
||
this.elementCache = new WeakMap(); // DOM element => fnode about it
|
||
this.doneRules = new Set(); // InwardRules that have been executed. OutwardRules can be executed more than once because they don't change any fnodes and are thus idempotent.
|
||
}
|
||
|
||
/**
|
||
* Change my coefficients and biases after construction.
|
||
*
|
||
* @arg coeffs See the :class:`Ruleset` constructor.
|
||
* @arg biases See the :class:`Ruleset` constructor.
|
||
*/
|
||
setCoeffsAndBiases(coeffs, biases = []) {
|
||
// Destructuring assignment doesn't make it through rollup properly
|
||
// (https://github.com/rollup/rollup-plugin-commonjs/issues/358):
|
||
this._coeffs = new Map(coeffs);
|
||
this.biases = new Map(biases);
|
||
this._clearCaches();
|
||
}
|
||
|
||
/**
|
||
* Clear the typeCache and maxCache, usually in the wake of changing
|
||
* ``this._coeffs``, because both of thise depend on weighted scores.
|
||
*/
|
||
_clearCaches() {
|
||
this.maxCache = new Map(); // type => Array of max fnode (or fnodes, if tied) of this type
|
||
this.typeCache = new Map(); // type => Set of all fnodes of this type found so far. (The dependency resolution during execution ensures that individual types will be comprehensive just in time.)
|
||
}
|
||
|
||
/**
|
||
* Return an array of zero or more fnodes.
|
||
* @arg thing {string|Lhs|Node} Can be
|
||
*
|
||
* (1) A string which matches up with an "out" rule in the ruleset.
|
||
* If the out rule uses through(), the results of through's
|
||
* callback (which might not be fnodes) will be returned.
|
||
* (2) An arbitrary LHS which we calculate and return the results of.
|
||
* (3) A DOM node, for which we will return the corresponding fnode.
|
||
*
|
||
* Results are cached for cases (1) and (3).
|
||
*/
|
||
get(thing) {
|
||
if (typeof thing === 'string') {
|
||
if (this._outRules.has(thing)) {
|
||
return Array.from(this._execute(this._outRules.get(thing)));
|
||
} else {
|
||
throw new Error(`There is no out() rule with key "${thing}".`);
|
||
}
|
||
} else if (isDomElement(thing)) {
|
||
// Return the fnode and let it run type(foo) on demand, as people
|
||
// ask it things like scoreFor(foo).
|
||
return this.fnodeForElement(thing);
|
||
} else if (thing.asLhs !== undefined) {
|
||
// Make a temporary out rule, and run it. This may add things to
|
||
// the ruleset's cache, but that's fine: it doesn't change any
|
||
// future results; it just might make them faster. For example, if
|
||
// you ask for .get(type('smoo')) twice, the second time will be a
|
||
// cache hit.
|
||
const outRule = rule(thing, out(Symbol('outKey')));
|
||
return Array.from(this._execute(outRule));
|
||
} else {
|
||
throw new Error('ruleset.get() expects a string, an expression like on the left-hand side of a rule, or a DOM node.');
|
||
}
|
||
}
|
||
|
||
/**
|
||
* Return the weighted sum of the per-rule, per-type scores from a fnode.
|
||
*
|
||
* @arg mapOfScores a Map of rule name to the [0, 1] score it computed for
|
||
* the type in question
|
||
*/
|
||
weightedScore(mapOfScores) {
|
||
let total = 0;
|
||
for (const [name, score] of mapOfScores) {
|
||
total += score * getDefault(this._coeffs, name, () => 1);
|
||
}
|
||
return total;
|
||
}
|
||
|
||
// Provide an opaque context object to be made available to all ranker
|
||
// functions.
|
||
// context (object) {
|
||
// self.context = object;
|
||
// }
|
||
|
||
// -------- Methods below this point are private to the framework. --------
|
||
|
||
/**
|
||
* Return all the thus-far-unexecuted rules that will have to run to run
|
||
* the requested rule, in the form of Map(prereq: [rulesItIsNeededBy]).
|
||
*/
|
||
_prerequisitesTo(rule, undonePrereqs = new Map()) {
|
||
for (let prereq of rule.prerequisites(this)) {
|
||
if (!this.doneRules.has(prereq)) {
|
||
// prereq is not already run. (If it were, we wouldn't care
|
||
// about adding it to the graph.)
|
||
const alreadyAdded = undonePrereqs.has(prereq);
|
||
setDefault(undonePrereqs, prereq, () => []).push(rule);
|
||
|
||
// alreadyAdded means we've already computed the prereqs of
|
||
// this prereq and added them to undonePrereqs. So, now
|
||
// that we've hooked up the rule to this prereq in the
|
||
// graph, we can stop. But, if we haven't, then...
|
||
if (!alreadyAdded) {
|
||
this._prerequisitesTo(prereq, undonePrereqs);
|
||
}
|
||
}
|
||
}
|
||
return undonePrereqs;
|
||
}
|
||
|
||
/**
|
||
* Run the given rule (and its dependencies, in the proper order), and
|
||
* return its results.
|
||
*
|
||
* The caller is responsible for ensuring that _execute() is not called
|
||
* more than once for a given InwardRule, lest non-idempotent
|
||
* transformations, like score contributions, be applied to fnodes more
|
||
* than once.
|
||
*
|
||
* The basic idea is to sort rules in topological order (according to input
|
||
* and output types) and then run them. On top of that, we do some
|
||
* optimizations. We keep a cache of results by type (whether partial or
|
||
* comprehensive--either way, the topology ensures that any
|
||
* non-comprehensive typeCache entry is made comprehensive before another
|
||
* rule needs it). And we prune our search for prerequisite rules at the
|
||
* first encountered already-executed rule.
|
||
*/
|
||
_execute(rule) {
|
||
const prereqs = this._prerequisitesTo(rule);
|
||
let sorted;
|
||
try {
|
||
sorted = [rule].concat(toposort(prereqs.keys(),
|
||
prereq => prereqs.get(prereq)));
|
||
} catch (exc) {
|
||
if (exc instanceof CycleError) {
|
||
throw new CycleError('There is a cyclic dependency in the ruleset.');
|
||
} else {
|
||
throw exc;
|
||
}
|
||
}
|
||
let fnodes;
|
||
for (let eachRule of reversed(sorted)) {
|
||
// Sock each set of results away in this.typeCache:
|
||
fnodes = eachRule.results(this);
|
||
}
|
||
return Array.from(fnodes);
|
||
}
|
||
|
||
/** @return {Rule[]} */
|
||
inwardRulesThatCouldEmit(type) {
|
||
return getDefault(this._rulesThatCouldEmit, type, () => []);
|
||
}
|
||
|
||
/** @return {Rule[]} */
|
||
inwardRulesThatCouldAdd(type) {
|
||
return getDefault(this._rulesThatCouldAdd, type, () => []);
|
||
}
|
||
|
||
/**
|
||
* @return the Fathom node that describes the given DOM element. This does
|
||
* not trigger any execution, so the result may be incomplete.
|
||
*/
|
||
fnodeForElement(element) {
|
||
return setDefault(this.elementCache,
|
||
element,
|
||
() => new Fnode(element, this));
|
||
}
|
||
}
|
||
|
||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||
|
||
const version = '3.7.3';
|
||
|
||
export { and, atMost, clusters$1 as clusters, dom, element, exceptions, nearest, note, out, props, rule, ruleset, score, type, typeIn, utilsForFrontend as utils, version };
|