1
0
Fork 0
firefox/toolkit/modules/third_party/fathom/fathom.mjs
Daniel Baumann 5e9a113729
Adding upstream version 140.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
2025-06-25 09:37:52 +02:00

2765 lines
96 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
DO NOT TOUCH fathom.mjs DIRECTLY. See the README for instructions.
*/
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* A :func:`rule` depends on another rule which itself depends on the first
* rule again, either directly or indirectly.
*/
class CycleError extends Error {
}
/**
* An examined element was not contained in a browser ``window`` object, but
* something needed it to be.
*/
class NoWindowError extends Error {
}
var exceptions = /*#__PURE__*/Object.freeze({
__proto__: null,
CycleError: CycleError,
NoWindowError: NoWindowError
});
/**
* Return the passed-in arg. Useful as a default.
*/
function identity(x) {
return x;
}
/*eslint-env browser*/
/**
* From an iterable return the best item, according to an arbitrary comparator
* function. In case of a tie, the first item wins.
*
* @arg by {function} Given an item of the iterable, return a value to compare
* @arg isBetter {function} Return whether its first arg is better than its
* second
*/
function best(iterable, by, isBetter) {
let bestSoFar, bestKeySoFar;
let isFirst = true;
forEach(
function (item) {
const key = by(item);
if (isBetter(key, bestKeySoFar) || isFirst) {
bestSoFar = item;
bestKeySoFar = key;
isFirst = false;
}
},
iterable);
if (isFirst) {
throw new Error('Tried to call best() on empty iterable');
}
return bestSoFar;
}
/**
* Return the maximum item from an iterable, as defined by >.
*
* Works with any type that works with >. If multiple items are equally great,
* return the first.
*
* @arg by {function} Given an item of the iterable, returns a value to
* compare
*/
function max(iterable, by = identity) {
return best(iterable, by, (a, b) => a > b);
}
/**
* Return an Array of maximum items from an iterable, as defined by > and ===.
*
* If an empty iterable is passed in, return [].
*/
function maxes(iterable, by = identity) {
let bests = [];
let bestKeySoFar;
let isFirst = true;
forEach(
function (item) {
const key = by(item);
if (key > bestKeySoFar || isFirst) {
bests = [item];
bestKeySoFar = key;
isFirst = false;
} else if (key === bestKeySoFar) {
bests.push(item);
}
},
iterable);
return bests;
}
/**
* Return the minimum item from an iterable, as defined by <.
*
* If multiple items are equally great, return the first.
*/
function min(iterable, by = identity) {
return best(iterable, by, (a, b) => a < b);
}
/**
* Return the sum of an iterable, as defined by the + operator.
*/
function sum(iterable) {
let total;
let isFirst = true;
forEach(
function assignOrAdd(addend) {
if (isFirst) {
total = addend;
isFirst = false;
} else {
total += addend;
}
},
iterable);
return total;
}
/**
* Return the number of items in an iterable, consuming it as a side effect.
*/
function length(iterable) {
let num = 0;
// eslint-disable-next-line no-unused-vars
for (let item of iterable) {
num++;
}
return num;
}
/**
* Iterate, depth first, over a DOM node. Return the original node first.
*
* @arg shouldTraverse {function} Given a node, say whether we should
* include it and its children. Default: always true.
*/
function *walk(element, shouldTraverse = element => true) {
yield element;
for (let child of element.childNodes) {
if (shouldTraverse(child)) {
for (let w of walk(child, shouldTraverse)) {
yield w;
}
}
}
}
const blockTags = new Set(
['ADDRESS', 'BLOCKQUOTE', 'BODY', 'CENTER', 'DIR', 'DIV', 'DL',
'FIELDSET', 'FORM', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'HR',
'ISINDEX', 'MENU', 'NOFRAMES', 'NOSCRIPT', 'OL', 'P', 'PRE',
'TABLE', 'UL', 'DD', 'DT', 'FRAMESET', 'LI', 'TBODY', 'TD',
'TFOOT', 'TH', 'THEAD', 'TR', 'HTML']);
/**
* Return whether a DOM element is a block element by default (rather than by
* styling).
*/
function isBlock(element) {
return blockTags.has(element.tagName);
}
/**
* Yield strings of text nodes within a normalized DOM node and its children,
* without venturing into any contained block elements.
*
* @arg shouldTraverse {function} Specify additional elements to exclude by
* returning false
*/
function *inlineTexts(element, shouldTraverse = element => true) {
// TODO: Could we just use querySelectorAll() with a really long
// selector rather than walk(), for speed?
for (let child of walk(element,
element => !(isBlock(element) ||
element.tagName === 'SCRIPT' &&
element.tagName === 'STYLE')
&& shouldTraverse(element))) {
if (child.nodeType === child.TEXT_NODE) {
// wholeText() is not implemented by jsdom, so we use
// textContent(). The result should be the same, since
// we're calling it on only text nodes, but it may be
// slower. On the positive side, it means we don't need to
// normalize the DOM tree first.
yield child.textContent;
}
}
}
/**
* Return the total length of the inline text within an element, with
* whitespace collapsed.
*
* @arg shouldTraverse {function} Specify additional elements to exclude by
* returning false
*/
function inlineTextLength(element, shouldTraverse = element => true) {
return sum(map(text => collapseWhitespace(text).length,
inlineTexts(element, shouldTraverse)));
}
/**
* Return a string with each run of whitespace collapsed to a single space.
*/
function collapseWhitespace(str) {
return str.replace(/\s{2,}/g, ' ');
}
/**
* Return the ratio of the inline text length of the links in an element to the
* inline text length of the entire element.
*
* @arg inlineLength {number} Optionally, the precalculated inline
* length of the fnode. If omitted, we will calculate it ourselves.
*/
function linkDensity(fnode, inlineLength) {
if (inlineLength === undefined) {
inlineLength = inlineTextLength(fnode.element);
}
const lengthWithoutLinks = inlineTextLength(fnode.element,
element => element.tagName !== 'A');
return (inlineLength - lengthWithoutLinks) / inlineLength;
}
/**
* Return whether an element is a text node that consist wholly of whitespace.
*/
function isWhitespace(element) {
return (element.nodeType === element.TEXT_NODE &&
element.textContent.trim().length === 0);
}
/**
* Get a key of a map, first setting it to a default value if it's missing.
*/
function setDefault(map, key, defaultMaker) {
if (map.has(key)) {
return map.get(key);
}
const defaultValue = defaultMaker();
map.set(key, defaultValue);
return defaultValue;
}
/**
* Get a key of a map or, if it's missing, a default value.
*/
function getDefault(map, key, defaultMaker) {
if (map.has(key)) {
return map.get(key);
}
return defaultMaker();
}
/**
* Return an Array, the reverse topological sort of the given nodes.
*
* @arg nodes An iterable of arbitrary things
* @arg nodesThatNeed {function} Take a node and returns an Array of nodes
* that depend on it
*/
function toposort(nodes, nodesThatNeed) {
const ret = [];
const todo = new Set(nodes);
const inProgress = new Set();
function visit(node) {
if (inProgress.has(node)) {
throw new CycleError('The graph has a cycle.');
}
if (todo.has(node)) {
inProgress.add(node);
for (let needer of nodesThatNeed(node)) {
visit(needer);
}
inProgress.delete(node);
todo.delete(node);
ret.push(node);
}
}
while (todo.size > 0) {
visit(first(todo));
}
return ret;
}
/**
* A Set with the additional methods it ought to have had
*/
class NiceSet extends Set {
/**
* Remove and return an arbitrary item. Throw an Error if I am empty.
*/
pop() {
for (let v of this.values()) {
this.delete(v);
return v;
}
throw new Error('Tried to pop from an empty NiceSet.');
}
/**
* Union another set or other iterable into myself.
*
* @return myself, for chaining
*/
extend(otherSet) {
for (let item of otherSet) {
this.add(item);
}
return this;
}
/**
* Subtract another set from a copy of me.
*
* @return a copy of myself excluding the elements in ``otherSet``.
*/
minus(otherSet) {
const ret = new NiceSet(this);
for (const item of otherSet) {
ret.delete(item);
}
return ret;
}
/**
* Actually show the items in me.
*/
toString() {
return '{' + Array.from(this).join(', ') + '}';
}
}
/**
* Return the first item of an iterable.
*/
function first(iterable) {
for (let i of iterable) {
return i;
}
}
/**
* Given any node in a DOM tree, return the root element of the tree, generally
* an HTML element.
*/
function rootElement(element) {
return element.ownerDocument.documentElement;
}
/**
* Return the number of times a regex occurs within the string `haystack`.
*
* Caller must make sure `regex` has the 'g' option set.
*/
function numberOfMatches(regex, haystack) {
return (haystack.match(regex) || []).length;
}
/**
* Wrap a scoring callback, and set its element to the page root iff a score is
* returned.
*
* This is used to build rulesets which classify entire pages rather than
* picking out specific elements.
*
* For example, these rules might classify a page as a "login page", influenced
* by whether they have login buttons or username fields:
*
* ``rule(type('loginPage'), score(page(pageContainsLoginButton))),``
* ``rule(type('loginPage'), score(page(pageContainsUsernameField)))``
*/
function page(scoringFunction) {
function wrapper(fnode) {
const scoreAndTypeAndNote = scoringFunction(fnode);
if (scoreAndTypeAndNote.score !== undefined) {
scoreAndTypeAndNote.element = rootElement(fnode.element);
}
return scoreAndTypeAndNote;
}
return wrapper;
}
/**
* Sort the elements by their position in the DOM.
*
* @arg fnodes {iterable} fnodes to sort
* @return {Array} sorted fnodes
*/
function domSort(fnodes) {
function compare(a, b) {
const element = a.element;
const position = element.compareDocumentPosition(b.element);
if (position & element.DOCUMENT_POSITION_FOLLOWING) {
return -1;
} else if (position & element.DOCUMENT_POSITION_PRECEDING) {
return 1;
} else {
return 0;
}
}
return Array.from(fnodes).sort(compare);
}
/* istanbul ignore next */
/**
* Return the DOM element contained in a passed-in fnode. Return passed-in DOM
* elements verbatim.
*
* @arg fnodeOrElement {Node|Fnode}
*/
function toDomElement(fnodeOrElement) {
return isDomElement(fnodeOrElement) ? fnodeOrElement : fnodeOrElement.element;
}
/**
* Checks whether any of the element's attribute values satisfy some condition.
*
* Example::
*
* rule(type('foo'),
* score(attributesMatch(element,
* attr => attr.includes('good'),
* ['id', 'alt']) ? 2 : 1))
*
* @arg element {Node} Element whose attributes you want to search
* @arg predicate {function} A condition to check. Take a string and
* return a boolean. If an attribute has multiple values (e.g. the class
* attribute), attributesMatch will check each one.
* @arg attrs {string[]} An Array of attributes you want to search. If none are
* provided, search all.
* @return Whether any of the attribute values satisfy the predicate function
*/
function attributesMatch(element, predicate, attrs = []) {
const attributes = attrs.length === 0 ? Array.from(element.attributes).map(a => a.name) : attrs;
for (let i = 0; i < attributes.length; i++) {
const attr = element.getAttribute(attributes[i]);
// If the attribute is an array, apply the scoring function to each element
if (attr && ((Array.isArray(attr) && attr.some(predicate)) || predicate(attr))) {
return true;
}
}
return false;
}
/* istanbul ignore next */
/**
* Yield an element and each of its ancestors.
*/
function *ancestors(element) {
yield element;
let parent;
while ((parent = element.parentNode) !== null && parent.nodeType === parent.ELEMENT_NODE) {
yield parent;
element = parent;
}
}
/**
* Return the sigmoid of the argument: 1 / (1 + exp(-x)). This is useful for
* crunching a feature value that may have a wide range into the range (0, 1)
* without a hard ceiling: the sigmoid of even a very large number will be a
* little larger than that of a slightly smaller one.
*
* @arg x {Number} a number to be compressed into the range (0, 1)
*/
function sigmoid(x) {
return 1 / (1 + Math.exp(-x));
}
/* istanbul ignore next */
/**
* Return whether an element is practically visible, considering things like 0
* size or opacity, ``visibility: hidden`` and ``overflow: hidden``.
*
* Merely being scrolled off the page in either horizontally or vertically
* doesn't count as invisible; the result of this function is meant to be
* independent of viewport size.
*
* @throws {NoWindowError} The element (or perhaps one of its ancestors) is not
* in a window, so we can't find the `getComputedStyle()` routine to call.
* That routine is the source of most of the information we use, so you
* should pick a different strategy for non-window contexts.
*/
function isVisible(fnodeOrElement) {
// This could be 5x more efficient if https://github.com/w3c/csswg-drafts/issues/4122 happens.
const element = toDomElement(fnodeOrElement);
const elementWindow = windowForElement(element);
const elementRect = element.getBoundingClientRect();
const elementStyle = elementWindow.getComputedStyle(element);
// Alternative to reading ``display: none`` due to Bug 1381071.
if (elementRect.width === 0 && elementRect.height === 0 && elementStyle.overflow !== 'hidden') {
return false;
}
if (elementStyle.visibility === 'hidden') {
return false;
}
// Check if the element is irrevocably off-screen:
if (elementRect.x + elementRect.width < 0 ||
elementRect.y + elementRect.height < 0
) {
return false;
}
for (const ancestor of ancestors(element)) {
const isElement = ancestor === element;
const style = isElement ? elementStyle : elementWindow.getComputedStyle(ancestor);
if (style.opacity === '0') {
return false;
}
if (style.display === 'contents') {
// ``display: contents`` elements have no box themselves, but children are
// still rendered.
continue;
}
const rect = isElement ? elementRect : ancestor.getBoundingClientRect();
if ((rect.width === 0 || rect.height === 0) && elementStyle.overflow === 'hidden') {
// Zero-sized ancestors dont make descendants hidden unless the descendant
// has ``overflow: hidden``.
return false;
}
}
return true;
}
/**
* Return the extracted [r, g, b, a] values from a string like "rgba(0, 5, 255, 0.8)",
* and scale them to 0..1. If no alpha is specified, return undefined for it.
*/
function rgbaFromString(str) {
const m = str.match(/^rgba?\s*\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*(?:,\s*(\d+(?:\.\d+)?)\s*)?\)$/i);
if (m) {
return [m[1] / 255, m[2] / 255, m[3] / 255, m[4] === undefined ? undefined : parseFloat(m[4])];
} else {
throw new Error('Color ' + str + ' did not match pattern rgb[a](r, g, b[, a]).');
}
}
/**
* Return the saturation 0..1 of a color defined by RGB values 0..1.
*/
function saturation(r, g, b) {
const cMax = Math.max(r, g, b);
const cMin = Math.min(r, g, b);
const delta = cMax - cMin;
const lightness = (cMax + cMin) / 2;
const denom = (1 - (Math.abs(2 * lightness - 1)));
// Return 0 if it's black (R, G, and B all 0).
return (denom === 0) ? 0 : delta / denom;
}
/**
* Scale a number to the range [0, 1] using a linear slope.
*
* For a rising line, the result is 0 until the input reaches zeroAt, then
* increases linearly until oneAt, at which it becomes 1. To make a falling
* line, where the result is 1 to the left and 0 to the right, use a zeroAt
* greater than oneAt.
*/
function linearScale(number, zeroAt, oneAt) {
const isRising = zeroAt < oneAt;
if (isRising) {
if (number <= zeroAt) {
return 0;
} else if (number >= oneAt) {
return 1;
}
} else {
if (number >= zeroAt) {
return 0;
} else if (number <= oneAt) {
return 1;
}
}
const slope = 1 / (oneAt - zeroAt);
return slope * (number - zeroAt);
}
// -------- Routines below this point are private to the framework. --------
/**
* Flatten out an iterable of iterables into a single iterable of non-
* iterables. Does not consider strings to be iterable.
*/
function *flatten(iterable) {
for (const i of iterable) {
if (typeof i !== 'string' && isIterable(i)) {
yield *(flatten(i));
} else {
yield i;
}
}
}
/**
* A lazy, top-level ``Array.map()`` workalike that works on anything iterable
*/
function *map(fn, iterable) {
for (const i of iterable) {
yield fn(i);
}
}
/**
* A lazy, top-level ``Array.forEach()`` workalike that works on anything
* iterable
*/
function forEach(fn, iterable) {
for (const i of iterable) {
fn(i);
}
}
/* istanbul ignore next */
/**
* @return whether a thing appears to be a DOM element.
*/
function isDomElement(thing) {
return thing.nodeName !== undefined;
}
function isIterable(thing) {
return thing && typeof thing[Symbol.iterator] === 'function';
}
/**
* Return an backward iterator over an Array without reversing it in place.
*/
function *reversed(array) {
for (let i = array.length - 1; i >= 0; i--) {
yield array[i];
}
}
/* istanbul ignore next */
/*
* Return the window an element is in.
*
* @throws {NoWindowError} There isn't such a window.
*/
function windowForElement(element) {
let doc = element.ownerDocument;
if (doc === null) {
// The element itself was a document.
doc = element;
}
const win = doc.defaultView;
if (win === null) {
throw new NoWindowError();
}
return win;
}
var utilsForFrontend = /*#__PURE__*/Object.freeze({
__proto__: null,
identity: identity,
best: best,
max: max,
maxes: maxes,
min: min,
sum: sum,
length: length,
walk: walk,
isBlock: isBlock,
inlineTexts: inlineTexts,
inlineTextLength: inlineTextLength,
collapseWhitespace: collapseWhitespace,
linkDensity: linkDensity,
isWhitespace: isWhitespace,
setDefault: setDefault,
getDefault: getDefault,
toposort: toposort,
NiceSet: NiceSet,
first: first,
rootElement: rootElement,
numberOfMatches: numberOfMatches,
page: page,
domSort: domSort,
toDomElement: toDomElement,
attributesMatch: attributesMatch,
ancestors: ancestors,
sigmoid: sigmoid,
isVisible: isVisible,
rgbaFromString: rgbaFromString,
saturation: saturation,
linearScale: linearScale,
flatten: flatten,
map: map,
forEach: forEach,
isDomElement: isDomElement,
reversed: reversed,
windowForElement: windowForElement
});
/**
* Return the number of stride nodes between 2 DOM nodes *at the same
* level of the tree*, without going up or down the tree.
*
* ``left`` xor ``right`` may also be undefined.
*/
function numStrides(left, right) {
let num = 0;
// Walk right from left node until we hit the right node or run out:
let sibling = left;
let shouldContinue = sibling && sibling !== right;
while (shouldContinue) {
sibling = sibling.nextSibling;
if ((shouldContinue = sibling && sibling !== right) &&
!isWhitespace(sibling)) {
num += 1;
}
}
if (sibling !== right) { // Don't double-punish if left and right are siblings.
// Walk left from right node:
sibling = right;
while (sibling) {
sibling = sibling.previousSibling;
if (sibling && !isWhitespace(sibling)) {
num += 1;
}
}
}
return num;
}
/**
* Return a topological distance between 2 DOM nodes or :term:`fnodes<fnode>`
* weighted according to the similarity of their ancestry in the DOM. For
* instance, if one node is situated inside ``<div><span><b><theNode>`` and the
* other node is at ``<differentDiv><span><b><otherNode>``, they are considered
* close to each other for clustering purposes. This is useful for picking out
* nodes which have similar purposes.
*
* Return ``Number.MAX_VALUE`` if one of the nodes contains the other.
*
* This is largely an implementation detail of :func:`clusters`, but you can
* call it yourself if you wish to implement your own clustering. Takes O(n log
* n) time.
*
* Note that the default costs may change; pass them in explicitly if they are
* important to you.
*
* @arg fnodeA {Node|Fnode}
* @arg fnodeB {Node|Fnode}
* @arg differentDepthCost {number} Cost for each level deeper one node is than
* the other below their common ancestor
* @arg differentTagCost {number} Cost for a level below the common ancestor
* where tagNames differ
* @arg sameTagCost {number} Cost for a level below the common ancestor where
* tagNames are the same
* @arg strideCost {number} Cost for each stride node between A and B. Stride
* nodes are siblings or siblings-of-ancestors that lie between the 2
* nodes. These interposed nodes make it less likely that the 2 nodes
* should be together in a cluster.
* @arg additionalCost {function} Return an additional cost, given 2 fnodes or
* nodes.
*
*/
function distance(fnodeA,
fnodeB,
{differentDepthCost = 2,
differentTagCost = 2,
sameTagCost = 1,
strideCost = 1,
additionalCost = (fnodeA, fnodeB) => 0} = {}) {
// I was thinking of something that adds little cost for siblings. Up
// should probably be more expensive than down (see middle example in the
// Nokia paper).
// TODO: Test and tune default costs. They're off the cuff at the moment.
if (fnodeA === fnodeB) {
return 0;
}
const elementA = isDomElement(fnodeA) ? fnodeA : fnodeA.element;
const elementB = isDomElement(fnodeB) ? fnodeB : fnodeB.element;
// Stacks that go from the common ancestor all the way to A and B:
const aAncestors = [elementA];
const bAncestors = [elementB];
let aAncestor = elementA;
let bAncestor = elementB;
// Ascend to common parent, stacking them up for later reference:
while (!aAncestor.contains(elementB)) { // Note: an element does contain() itself.
aAncestor = aAncestor.parentNode;
aAncestors.push(aAncestor); //aAncestors = [a, b]. aAncestor = b // if a is outer: no loop here; aAncestors = [a]. aAncestor = a.
}
// In compareDocumentPosition()'s opinion, inside implies after. Basically,
// before and after pertain to opening tags.
const comparison = elementA.compareDocumentPosition(elementB);
// If either contains the other, abort. We'd either return a misleading
// number or else walk upward right out of the document while trying to
// make the ancestor stack.
if (comparison & (elementA.DOCUMENT_POSITION_CONTAINS | elementA.DOCUMENT_POSITION_CONTAINED_BY)) {
return Number.MAX_VALUE;
}
// Make an ancestor stack for the right node too so we can walk
// efficiently down to it:
do {
bAncestor = bAncestor.parentNode; // Assumes we've early-returned above if A === B. This walks upward from the outer node and up out of the tree. It STARTS OUT with aAncestor === bAncestor!
bAncestors.push(bAncestor);
} while (bAncestor !== aAncestor);
// Figure out which node is left and which is right, so we can follow
// sibling links in the appropriate directions when looking for stride
// nodes:
let left = aAncestors;
let right = bAncestors;
let cost = 0;
if (comparison & elementA.DOCUMENT_POSITION_FOLLOWING) {
// A is before, so it could contain the other node. What did I mean to do if one contained the other?
left = aAncestors;
right = bAncestors;
} else if (comparison & elementA.DOCUMENT_POSITION_PRECEDING) {
// A is after, so it might be contained by the other node.
left = bAncestors;
right = aAncestors;
}
// Descend to both nodes in parallel, discounting the traversal
// cost iff the nodes we hit look similar, implying the nodes dwell
// within similar structures.
while (left.length || right.length) {
const l = left.pop();
const r = right.pop();
if (l === undefined || r === undefined) {
// Punishment for being at different depths: same as ordinary
// dissimilarity punishment for now
cost += differentDepthCost;
} else {
// TODO: Consider similarity of classList.
cost += l.tagName === r.tagName ? sameTagCost : differentTagCost;
}
// Optimization: strides might be a good dimension to eliminate.
if (strideCost !== 0) {
cost += numStrides(l, r) * strideCost;
}
}
return cost + additionalCost(fnodeA, fnodeB);
}
/**
* Return the spatial distance between 2 fnodes or elements, assuming a
* rendered page.
*
* Specifically, return the distance in pixels between the centers of
* ``fnodeA.element.getBoundingClientRect()`` and
* ``fnodeB.element.getBoundingClientRect()``.
*/
function euclidean(fnodeA, fnodeB) {
/**
* Return the horizontal distance from the left edge of the viewport to the
* center of an element, given a DOMRect object for it. It doesn't matter
* that the distance is affected by the page's scroll offset, since the 2
* elements have the same offset.
*/
function xCenter(domRect) {
return domRect.left + domRect.width / 2;
}
function yCenter(domRect) {
return domRect.top + domRect.height / 2;
}
const elementA = toDomElement(fnodeA);
const elementB = toDomElement(fnodeB);
const aRect = elementA.getBoundingClientRect();
const bRect = elementB.getBoundingClientRect();
return Math.sqrt((xCenter(aRect) - xCenter(bRect)) ** 2 +
(yCenter(aRect) - yCenter(bRect)) ** 2);
}
/** A lower-triangular matrix of inter-cluster distances */
class DistanceMatrix {
/**
* @arg distance {function} Some notion of distance between 2 given nodes
*/
constructor(elements, distance) {
// A sparse adjacency matrix:
// {A => {},
// B => {A => 4},
// C => {A => 4, B => 4},
// D => {A => 4, B => 4, C => 4}
// E => {A => 4, B => 4, C => 4, D => 4}}
//
// A, B, etc. are arrays of [arrays of arrays of...] nodes, each
// array being a cluster. In this way, they not only accumulate a
// cluster but retain the steps along the way.
//
// This is an efficient data structure in terms of CPU and memory, in
// that we don't have to slide a lot of memory around when we delete a
// row or column from the middle of the matrix while merging. Of
// course, we lose some practical efficiency by using hash tables, and
// maps in particular are slow in their early implementations.
this._matrix = new Map();
// Convert elements to clusters:
const clusters = elements.map(el => [el]);
// Init matrix:
for (let outerCluster of clusters) {
const innerMap = new Map();
for (let innerCluster of this._matrix.keys()) {
innerMap.set(innerCluster, distance(outerCluster[0],
innerCluster[0]));
}
this._matrix.set(outerCluster, innerMap);
}
this._numClusters = clusters.length;
}
// Return (distance, a: clusterA, b: clusterB) of closest-together clusters.
// Replace this to change linkage criterion.
closest() {
const self = this;
if (this._numClusters < 2) {
throw new Error('There must be at least 2 clusters in order to return the closest() ones.');
}
// Return the distances between every pair of clusters.
function clustersAndDistances() {
const ret = [];
for (let [outerKey, row] of self._matrix.entries()) {
for (let [innerKey, storedDistance] of row.entries()) {
ret.push({a: outerKey, b: innerKey, distance: storedDistance});
}
}
return ret;
}
// Optimizing this by inlining the loop and writing it less
// functionally doesn't help:
return min(clustersAndDistances(), x => x.distance);
}
// Look up the distance between 2 clusters in me. Try the lookup in the
// other direction if the first one falls in the nonexistent half of the
// triangle.
_cachedDistance(clusterA, clusterB) {
let ret = this._matrix.get(clusterA).get(clusterB);
if (ret === undefined) {
ret = this._matrix.get(clusterB).get(clusterA);
}
return ret;
}
// Merge two clusters.
merge(clusterA, clusterB) {
// An example showing how rows merge:
// A: {}
// B: {A: 1}
// C: {A: 4, B: 4},
// D: {A: 4, B: 4, C: 4}
// E: {A: 4, B: 4, C: 2, D: 4}}
//
// Step 2:
// C: {}
// D: {C: 4}
// E: {C: 2, D: 4}}
// AB: {C: 4, D: 4, E: 4}
//
// Step 3:
// D: {}
// AB: {D: 4}
// CE: {D: 4, AB: 4}
// Construct new row, finding min distances from either subcluster of
// the new cluster to old clusters.
//
// There will be no repetition in the matrix because, after all,
// nothing pointed to this new cluster before it existed.
const newRow = new Map();
for (let outerKey of this._matrix.keys()) {
if (outerKey !== clusterA && outerKey !== clusterB) {
newRow.set(outerKey, Math.min(this._cachedDistance(clusterA, outerKey),
this._cachedDistance(clusterB, outerKey)));
}
}
// Delete the rows of the clusters we're merging.
this._matrix.delete(clusterA);
this._matrix.delete(clusterB);
// Remove inner refs to the clusters we're merging.
for (let inner of this._matrix.values()) {
inner.delete(clusterA);
inner.delete(clusterB);
}
// Attach new row.
this._matrix.set([clusterA, clusterB], newRow);
// There is a net decrease of 1 cluster:
this._numClusters -= 1;
}
numClusters() {
return this._numClusters;
}
// Return an Array of nodes for each cluster in me.
clusters() {
// TODO: Can't get map to work here. Don't know why.
return Array.from(this._matrix.keys()).map(e => Array.from(flatten(e)));
}
}
/**
* Partition the given nodes into one or more clusters by position in the DOM
* tree.
*
* This implements an agglomerative clustering. It uses single linkage, since
* we're talking about adjacency here more than Euclidean proximity: the
* clusters we're talking about in the DOM will tend to be adjacent, not
* overlapping. We haven't tried other linkage criteria yet.
*
* In a later release, we may consider score or notes.
*
* @arg {Fnode[]|Node[]} fnodes :term:`fnodes<fnode>` or DOM nodes to group
* into clusters
* @arg {number} splittingDistance The closest-nodes :func:`distance` beyond
* which we will not attempt to unify 2 clusters. Make this larger to make
* larger clusters.
* @arg getDistance {function} A function that returns some notion of numerical
* distance between 2 nodes. Default: :func:`distance`
* @return {Array} An Array of Arrays, with each Array containing all the
* nodes in one cluster. Note that neither the clusters nor the nodes are
* in any particular order. You may find :func:`domSort` helpful to remedy
* the latter.
*/
function clusters(fnodes, splittingDistance, getDistance = distance) {
const matrix = new DistanceMatrix(fnodes, getDistance);
let closest;
while (matrix.numClusters() > 1 && (closest = matrix.closest()).distance < splittingDistance) {
matrix.merge(closest.a, closest.b);
}
return matrix.clusters();
}
var clusters$1 = /*#__PURE__*/Object.freeze({
__proto__: null,
distance: distance,
euclidean: euclidean,
clusters: clusters
});
// The left-hand side of a rule
/**
* Take nodes that match a given DOM selector. Example:
* ``dom('meta[property="og:title"]')``
*
* Every ruleset has at least one ``dom`` or :func:`element` rule, as that is
* where nodes begin to flow into the system. If run against a subtree of a
* document, the root of the subtree is not considered as a possible match.
*/
function dom(selector) {
return new DomLhs(selector);
}
/**
* Take a single given node if it matches a given DOM selector, without looking
* through its descendents or ancestors. Otherwise, take no nodes. Example:
* ``element('input')``
*
* This is useful for applications in which you want Fathom to classify an
* element the user has selected, rather than scanning the whole page for
* candidates.
*/
function element(selector) {
return new ElementLhs(selector);
}
/**
* Rules and the LHSs and RHSs that comprise them have no mutable state. This
* lets us make BoundRulesets from Rulesets without duplicating the rules. It
* also lets us share a common cache among rules: multiple ones might care
* about a cached type(), for instance; there isn't a one-to-one relationship
* of storing with caring. There would also, because of the interdependencies
* of rules in a ruleset, be little use in segmenting the caches: if you do
* something that causes one to need to be cleared, you'll need to clear many
* more as well.
*
* Lhses are responsible for maintaining ruleset.maxCache.
*
* Lhs and its subclasses are private to the Fathom framework.
*/
class Lhs {
constructor() {
this._predicate = () => true;
}
/** Return a new Lhs of the appropriate kind, given its first call. */
static fromFirstCall(firstCall) {
// firstCall is never 'dom', because dom() directly returns a DomLhs.
if (firstCall.method === 'type') {
return new TypeLhs(...firstCall.args);
} else if (firstCall.method === 'and') {
return new AndLhs(firstCall.args);
} else if (firstCall.method === 'nearest') {
return new NearestLhs(firstCall.args);
} else {
throw new Error('The left-hand side of a rule() must start with dom(), type(), and(), or nearest().');
}
}
/**
* Prune nodes from consideration early in run execution, before scoring is
* done.
*
* Reserve this for where you are sure it is always correct or when
* performance demands it. It is generally preferable to use :func:`score`
* and let the :doc:`trainer<training>` determine the relative significance
* of each rule. Human intuition as to what is important is often wrong:
* for example, one might assume that a music player website would include
* the word "play", but this does not hold once you include sites in other
* languages.
*
* Can be chained after :func:`type` or :func:`dom`.
*
* Example: ``dom('p').when(isVisible)``
*
* @arg {function} predicate Accepts a fnode and returns a boolean
*/
when(predicate) {
let lhs = this.clone();
lhs._predicate = predicate;
return lhs;
}
/**
* Of all the dom nodes selected by type() or dom(), return only
* the fnodes that satisfy all the predicates imposed by calls to
* when()
*/
fnodesSatisfyingWhen(fnodes) {
return Array.from(fnodes).filter(this._predicate);
}
/**
* Return an iterable of output fnodes selected by this left-hand-side
* expression.
*
* Pre: The rules I depend on have already been run, and their results are
* in ruleset.typeCache.
*
* @arg ruleset {BoundRuleset}
*/
// fnodes (ruleset) {}
/**
* Check that a RHS-emitted fact is legal for this kind of LHS, and throw
* an error if it isn't.
*/
checkFact(fact) {}
/**
* Return the single type the output of the LHS is guaranteed to have.
* Return undefined if there is no such single type we can ascertain.
*/
guaranteedType() {}
/**
* Return the type I aggregate if I am an aggregate LHS; return undefined
* otherwise.
*/
aggregatedType() {}
/**
* Return each combination of types my selected nodes could be locally (that
* is, by this rule only) constrained to have.
*
* For example, type(A) would return [A]. and(A, or(B, C)) would return
* [AB, AC, ABC]. More examples:
*
* or(A, B) → typeIn(A, B, C) # Finalizes A, B. combos A, B, AB: finalizes AB. Optimization: there's no point in returning the last combo in ors. Compilation into 2 rules with identical RHSs will inherently implement this optimization.
* or(A, B) → typeIn(A, B) # Finalizes A, B
* or(A, B) → A # Finalizes B
* and(A) -> A # Finalizes nothing
* and(A, B) -> A # Finalizes nothing. AB: Ø
* and(A) -> typeIn(A, B) # Finalizes A. A
* and(A, B) -> typeIn(A, B) # Finalizes nothing. AB
* and(A, B) -> typeIn(A, B, C) # Finalizes A, B. AB
* and(A, or(B, C)) -> D # Finalizes A, B, C. AB, AC, ABC: ABC
* and(A, or(B, C)) -> B # Finalizes A, C. AB, AC, ABC: AC
* type(A).not(and(A, B)) ->
*
* @return {NiceSet[]}
*/
// possibleTypeCombinations() {}
/**
* Types mentioned in this LHS.
*
* In other words, the types I need to know the assignment status of before
* I can make my selections
*
* @return NiceSet of strings
*/
// typesMentioned() {}
}
class DomLhs extends Lhs {
constructor(selector) {
super();
if (selector === undefined) {
throw new Error('A querySelector()-style selector is required as the argument to ' + this._callName() + '().');
}
this.selector = selector;
}
/**
* Return the name of this kind of LHS, for use in error messages.
*/
_callName() {
return 'dom';
}
clone() {
return new this.constructor(this.selector);
}
fnodes(ruleset) {
return this._domNodesToFilteredFnodes(
ruleset,
ruleset.doc.querySelectorAll(this.selector));
}
/**
* Turn a NodeList of DOM nodes into an array of fnodes, and filter out
* those that don't match the :func:`when()` clause.
*/
_domNodesToFilteredFnodes(ruleset, domNodes) {
let ret = [];
for (let i = 0; i < domNodes.length; i++) {
ret.push(ruleset.fnodeForElement(domNodes[i]));
}
return this.fnodesSatisfyingWhen(ret);
}
checkFact(fact) {
if (fact.type === undefined) {
throw new Error(`The right-hand side of a ${this._callName()}() rule failed to specify a type. This means there is no way for its output to be used by later rules. All it specified was ${fact}.`);
}
}
asLhs() {
return this;
}
possibleTypeCombinations() {
return [];
}
typesMentioned() {
return new NiceSet();
}
}
class ElementLhs extends DomLhs {
_callName() {
return 'element';
}
fnodes(ruleset) {
return this._domNodesToFilteredFnodes(
ruleset,
ruleset.doc.matches(this.selector) ? [ruleset.doc] : []);
}
}
/** Internal representation of a LHS constrained by type but not by max() */
class TypeLhs extends Lhs {
constructor(type) {
super();
if (type === undefined) {
throw new Error('A type name is required when calling type().');
}
this._type = type; // the input type
}
clone() {
return new this.constructor(this._type);
}
fnodes(ruleset) {
const cached = getDefault(ruleset.typeCache, this._type, () => []);
return this.fnodesSatisfyingWhen(cached);
}
/** Override the type previously specified by this constraint. */
type(inputType) {
// Preserve the class in case this is a TypeMaxLhs.
return new this.constructor(inputType);
}
/**
* Of the nodes selected by a ``type`` call to the left, constrain the LHS
* to return only the max-scoring one. If there is a tie, more than 1 node
* will be returned. Example: ``type('titley').max()``
*/
max() {
return new TypeMaxLhs(this._type);
}
/**
* Take the nodes selected by a ``type`` call to the left, group them into
* clusters, and return the nodes in the cluster that has the highest total
* score (on the relevant type).
*
* Nodes come out in arbitrary order, so, if you plan to emit them,
* consider using ``.out('whatever').allThrough(domSort)``. See
* :func:`domSort`.
*
* If multiple clusters have equally high scores, return an arbitrary one,
* because Fathom has no way to represent arrays of arrays in rulesets.
*
* @arg options {Object} The same depth costs taken by :func:`distance`,
* plus ``splittingDistance``, which is the distance beyond which 2
* clusters will be considered separate. ``splittingDistance``, if
* omitted, defaults to 3.
*/
bestCluster(options) {
return new BestClusterLhs(this._type, options);
}
// Other clustering calls could be called biggestCluster() (having the most
// nodes) and bestAverageCluster() (having the highest average score).
guaranteedType() {
return this._type;
}
possibleTypeCombinations() {
return [this.typesMentioned()];
}
typesMentioned() {
return new NiceSet([this._type]);
}
}
/**
* Abstract LHS that is an aggregate function taken across all fnodes of a type
*
* The main point here is that any aggregate function over a (typed) set of
* nodes depends on first computing all the rules that could emit those nodes
* (nodes of that type).
*/
class AggregateTypeLhs extends TypeLhs {
aggregatedType() {
return this._type;
}
}
/**
* Internal representation of a LHS that has both type and max([NUMBER])
* constraints. max(NUMBER != 1) support is not yet implemented.
*/
class TypeMaxLhs extends AggregateTypeLhs {
/**
* Return the max-scoring node (or nodes if there is a tie) of the given
* type.
*/
fnodes(ruleset) {
// TODO: Optimize better. Walk the dependency tree, and run only the
// rules that could possibly lead to a max result. As part of this,
// make RHSs expose their max potential scores.
const self = this;
// Work around V8 bug:
// https://stackoverflow.com/questions/32943776/using-super-within-an-
// arrow-function-within-an-arrow-function-within-a-method
const getSuperFnodes = () => super.fnodes(ruleset);
return setDefault(
ruleset.maxCache,
this._type,
function maxFnodesOfType() {
return maxes(getSuperFnodes(), fnode => ruleset.weightedScore(fnode.scoresSoFarFor(self._type)));
});
}
}
class BestClusterLhs extends AggregateTypeLhs {
constructor(type, options) {
super(type);
this._options = options || {splittingDistance: 3};
}
/**
* Group the nodes of my type into clusters, and return the cluster with
* the highest total score for that type.
*/
fnodes(ruleset) {
// Get the nodes of the type:
const fnodesOfType = Array.from(super.fnodes(ruleset));
if (fnodesOfType.length === 0) {
return [];
}
// Cluster them:
const clusts = clusters(
fnodesOfType,
this._options.splittingDistance,
(a, b) => distance(a, b, this._options));
// Tag each cluster with the total of its nodes' scores:
const clustsAndSums = clusts.map(
clust => [clust,
sum(clust.map(fnode => fnode.scoreFor(this._type)))]);
// Return the highest-scoring cluster:
return max(clustsAndSums, clustAndSum => clustAndSum[1])[0];
}
}
class AndLhs extends Lhs {
constructor(lhss) {
super();
// For the moment, we accept only type()s as args. TODO: Generalize to
// type().max() and such later.
this._args = lhss.map(sideToTypeLhs);
}
*fnodes(ruleset) {
// Take an arbitrary one for starters. Optimization: we could always
// choose the pickiest one to start with.
const fnodes = this._args[0].fnodes(ruleset);
// Then keep only the fnodes that have the type of every other arg:
fnodeLoop: for (let fnode of fnodes) {
for (let otherLhs of this._args.slice(1)) {
// Optimization: could use a .hasTypeSoFar() below
if (!fnode.hasType(otherLhs.guaranteedType())) {
// TODO: This is n^2. Why is there no set intersection in JS?!
continue fnodeLoop;
}
}
yield fnode;
}
}
possibleTypeCombinations() {
return [this.typesMentioned()];
}
typesMentioned() {
return new NiceSet(this._args.map(arg => arg.guaranteedType()));
}
}
function sideToTypeLhs(side) {
const lhs = side.asLhs();
if (!(lhs.constructor === TypeLhs)) {
throw new Error('and() and nearest() support only simple type() calls as arguments for now.');
// TODO: Though we could solve this with a compilation step: and(type(A), type(B).max()) is equivalent to type(B).max() -> type(Bmax); and(type(A), type(Bmax)).
// In fact, we should be able to compile most (any?) arbitrary and()s, including nested ands and and(type(...).max(), ...) constructions into several and(type(A), type(B), ...) rules.
}
return lhs;
}
class NearestLhs extends Lhs {
constructor([a, b, distance]) {
super();
this._a = sideToTypeLhs(a);
this._b = sideToTypeLhs(b);
this._distance = distance;
}
/**
* Return an iterable of {fnodes, transformer} pairs.
*/
*fnodes(ruleset) {
// Go through all the left arg's nodes. For each one, find the closest
// right-arg's node. O(a * b). Once a right-arg's node is used, we
// don't eliminate it from consideration, because then order of left-
// args' nodes would matter.
// TODO: Still not sure how to get the distance to factor into the
// score unless we hard-code nearest() to do that. It's a
// matter of not being able to bind on the RHS to the output of the
// distance function on the LHS. Perhaps we could at least make
// distance part of the note and read it in a props() callback.
// We're assuming here that simple type() calls return just plain
// fnodes, not {fnode, rhsTransformer} pairs:
const as_ = this._a.fnodes(ruleset);
const bs = Array.from(this._b.fnodes(ruleset));
if (bs.length > 0) {
// If bs is empty, there can be no nearest nodes, so don't emit any.
for (const a of as_) {
const nearest = min(bs, b => this._distance(a, b));
yield {fnode: a,
rhsTransformer: function setNoteIfEmpty(fact) {
// If note is explicitly set by the RHS, let it take
// precedence, even though that makes this entire LHS
// pointless.
if (fact.note === undefined) {
fact.note = nearest; // TODO: Wrap this in an object to make room to return distance later.
}
return fact;
}};
}
}
}
checkFact(fact) {
// Barf if the fact doesn't set a type at least. It should be a *new* type or at least one that doesn't result in cycles, but we can't deduce that.
}
possibleTypeCombinations() {
return [new NiceSet([this._a.guaranteedType()])];
}
typesMentioned() {
return new NiceSet([this._a.guaranteedType(),
this._b.guaranteedType()]);
}
guaranteedType() {
return this._a.guaranteedType();
}
}
// The right-hand side of a rule
const TYPE = 1;
const NOTE = 2;
const SCORE = 4;
const ELEMENT = 8;
const SUBFACTS = {
type: TYPE,
note: NOTE,
score: SCORE,
element: ELEMENT
};
/**
* Expose the output of this rule's LHS as a "final result" to the surrounding
* program. It will be available by calling :func:`~BoundRuleset.get` on the
* ruleset and passing the key. You can run each node through a callback
* function first by adding :func:`through()`, or you can run the entire set of
* nodes through a callback function by adding :func:`allThrough()`.
*/
function out(key) {
return new OutwardRhs(key);
}
class InwardRhs {
constructor(calls = [], max = Infinity, types) {
this._calls = calls.slice();
this._max = max; // max score
this._types = new NiceSet(types); // empty set if unconstrained
}
/**
* Declare that the maximum returned subscore is such and such,
* which helps the optimizer plan efficiently. This doesn't force it to be
* true; it merely throws an error at runtime if it isn't. To lift an
* ``atMost`` constraint, call ``atMost()`` (with no args). The reason
* ``atMost`` and ``typeIn`` apply until explicitly cleared is so that, if
* someone used them for safety reasons on a lexically distant rule you are
* extending, you won't stomp on their constraint and break their
* invariants accidentally.
*/
atMost(score) {
return new this.constructor(this._calls, score, this._types);
}
_checkAtMost(fact) {
if (fact.score !== undefined && fact.score > this._max) {
throw new Error(`Score of ${fact.score} exceeds the declared atMost(${this._max}).`);
}
}
/**
* Determine any of type, note, score, and element using a callback. This
* overrides any previous call to `props` and, depending on what
* properties of the callback's return value are filled out, may override
* the effects of other previous calls as well.
*
* The callback should return...
*
* * An optional :term:`subscore`
* * A type (required on ``dom(...)`` rules, defaulting to the input one on
* ``type(...)`` rules)
* * Optional notes
* * An element, defaulting to the input one. Overriding the default
* enables a callback to walk around the tree and say things about nodes
* other than the input one.
*/
props(callback) {
function getSubfacts(fnode) {
const subfacts = callback(fnode);
// Filter the raw result down to okayed properties so callbacks
// can't insert arbitrary keys (like conserveScore, which might
// mess up the optimizer).
for (let subfact in subfacts) {
if (!SUBFACTS.hasOwnProperty(subfact) || !(SUBFACTS[subfact] & getSubfacts.possibleSubfacts)) {
// The ES5.1 spec says in 12.6.4 that it's fine to delete
// as we iterate.
delete subfacts[subfact];
}
}
return subfacts;
}
// Thse are the subfacts this call could affect:
getSubfacts.possibleSubfacts = TYPE | NOTE | SCORE | ELEMENT;
getSubfacts.kind = 'props';
return new this.constructor(this._calls.concat(getSubfacts),
this._max,
this._types);
}
/**
* Set the type applied to fnodes processed by this RHS.
*/
type(theType) {
// In the future, we might also support providing a callback that receives
// the fnode and returns a type. We couldn't reason based on these, but the
// use would be rather a consise way to to override part of what a previous
// .props() call provides.
// Actually emit a given type.
function getSubfacts() {
return {type: theType};
}
getSubfacts.possibleSubfacts = TYPE;
getSubfacts.type = theType;
getSubfacts.kind = 'type';
return new this.constructor(this._calls.concat(getSubfacts),
this._max,
this._types);
}
/**
* Constrain this rule to emit 1 of a set of given types. Pass no args to lift
* a previous ``typeIn`` constraint, as you might do when basing a LHS on a
* common value to factor out repetition.
*
* ``typeIn`` is mostly a hint for the query planner when you're emitting types
* dynamically from ``props`` calls—in fact, an error will be raised if
* ``props`` is used without a ``typeIn`` or ``type`` to constrain it—but it
* also checks conformance at runtime to ensure validity.
*/
typeIn(...types) {
// Rationale: If we used the spelling "type('a', 'b', ...)" instead of
// this, one might expect type('a', 'b').type(fn) to have the latter
// call override, while expecting type(fn).type('a', 'b') to keep both
// in effect. Then different calls to type() don't consistently
// override each other, and the rules get complicated. Plus you can't
// inherit a type constraint and then sub in another type-returning
// function that still gets the constraint applied.
return new this.constructor(this._calls,
this._max,
types);
}
/**
* Check a fact for conformance with any typeIn() call.
*
* @arg leftType the type of the LHS, which becomes my emitted type if the
* fact doesn't specify one
*/
_checkTypeIn(result, leftType) {
if (this._types.size > 0) {
if (result.type === undefined) {
if (!this._types.has(leftType)) {
throw new Error(`A right-hand side claimed, via typeIn(...) to emit one of the types ${this._types} but actually inherited ${leftType} from the left-hand side.`);
}
} else if (!this._types.has(result.type)) {
throw new Error(`A right-hand side claimed, via typeIn(...) to emit one of the types ${this._types} but actually emitted ${result.type}.`);
}
}
}
/**
* Whatever the callback returns (even ``undefined``) becomes the note of
* the fact. This overrides any previous call to ``note``.
*/
note(callback) {
function getSubfacts(fnode) {
return {note: callback(fnode)};
}
getSubfacts.possibleSubfacts = NOTE;
getSubfacts.kind = 'note';
return new this.constructor(this._calls.concat(getSubfacts),
this._max,
this._types);
}
/**
* Affect the confidence with which the input node should be considered a
* member of a type.
*
* The parameter is generally between 0 and 1 (inclusive), with 0 meaning
* the node does not have the "smell" this rule checks for and 1 meaning it
* does. The range between 0 and 1 is available to represent "fuzzy"
* confidences. If you have an unbounded range to compress down to [0, 1],
* consider using :func:`sigmoid` or a scaling thereof.
*
* Since every node can have multiple, independent scores (one for each
* type), this applies to the type explicitly set by the RHS or, if none,
* to the type named by the ``type`` call on the LHS. If the LHS has none
* because it's a ``dom(...)`` LHS, an error is raised.
*
* @arg {number|function} scoreOrCallback Can either be a static number,
* generally 0 to 1 inclusive, or else a callback which takes the fnode
* and returns such a number. If the callback returns a boolean, it is
* cast to a number.
*/
score(scoreOrCallback) {
let getSubfacts;
function getSubfactsFromNumber(fnode) {
return {score: scoreOrCallback};
}
function getSubfactsFromFunction(fnode) {
let result = scoreOrCallback(fnode);
if (typeof result === 'boolean') {
// Case bools to numbers for convenience. Boolean features are
// common. Don't cast other things, as it frustrates ruleset
// debugging.
result = Number(result);
}
return {score: result};
}
if (typeof scoreOrCallback === 'number') {
getSubfacts = getSubfactsFromNumber;
} else {
getSubfacts = getSubfactsFromFunction;
}
getSubfacts.possibleSubfacts = SCORE;
getSubfacts.kind = 'score';
return new this.constructor(this._calls.concat(getSubfacts),
this._max,
this._types);
}
// Future: why not have an .element() method for completeness?
// -------- Methods below this point are private to the framework. --------
/**
* Run all my props().type().note().score() stuff across a given fnode,
* enforce my max() stuff, and return a fact ({element, type, score,
* notes}) for incorporation into that fnode (or a different one, if
* element is specified). Any of the 4 fact properties can be missing;
* filling in defaults is a job for the caller.
*
* @arg leftType The type the LHS takes in
*/
fact(fnode, leftType) {
const doneKinds = new Set();
const result = {};
let haveSubfacts = 0;
for (let call of reversed(this._calls)) {
// If we've already called a call of this kind, then forget it.
if (!doneKinds.has(call.kind)) {
doneKinds.add(call.kind);
if (~haveSubfacts & call.possibleSubfacts) {
// This call might provide a subfact we are missing.
const newSubfacts = call(fnode);
// We start with an empty object, so we're okay here.
// eslint-disable-next-line guard-for-in
for (let subfact in newSubfacts) {
// A props() callback could insert arbitrary keys into
// the result, but it shouldn't matter, because nothing
// pays any attention to them.
if (!result.hasOwnProperty(subfact)) {
result[subfact] = newSubfacts[subfact];
}
haveSubfacts |= SUBFACTS[subfact];
}
}
}
}
this._checkAtMost(result);
this._checkTypeIn(result, leftType);
return result;
}
/**
* Return a record describing the types I might emit (which means either to
* add a type to a fnode or to output a fnode that already has that type).
* {couldChangeType: whether I might add a type to the fnode,
* possibleTypes: If couldChangeType, the types I might emit; empty set if
* we cannot infer it. If not couldChangeType, undefined.}
*/
possibleEmissions() {
// If there is a typeIn() constraint or there is a type() call to the
// right of all props() calls, we have a constraint. We hunt for the
// tightest constraint we can find, favoring a type() call because it
// gives us a single type but then falling back to a typeIn().
let couldChangeType = false;
for (let call of reversed(this._calls)) {
if (call.kind === 'props') {
couldChangeType = true;
break;
} else if (call.kind === 'type') {
return {couldChangeType: true,
possibleTypes: new Set([call.type])};
}
}
return {couldChangeType,
possibleTypes: this._types};
}
}
class OutwardRhs {
constructor(key, through = x => x, allThrough = x => x) {
this.key = key;
this.callback = through;
this.allCallback = allThrough;
}
/**
* Append ``.through`` to :func:`out` to run each :term:`fnode` emitted
* from the LHS through an arbitrary function before returning it to the
* containing program. Example::
*
* out('titleLengths').through(fnode => fnode.noteFor('title').length)
*/
through(callback) {
return new this.constructor(this.key, callback, this.allCallback);
}
/**
* Append ``.allThrough`` to :func:`out` to run the entire iterable of
* emitted :term:`fnodes<fnode>` through an arbitrary function before
* returning them to the containing program. Example::
*
* out('sortedTitles').allThrough(domSort)
*/
allThrough(callback) {
return new this.constructor(this.key, this.callback, callback);
}
asRhs() {
return this;
}
}
function props(callback) {
return new Side({method: 'props', args: [callback]});
}
/** Constrain to an input type on the LHS, or apply a type on the RHS. */
function type(theType) {
return new Side({method: 'type', args: [theType]});
}
function note(callback) {
return new Side({method: 'note', args: [callback]});
}
function score(scoreOrCallback) {
return new Side({method: 'score', args: [scoreOrCallback]});
}
function atMost(score) {
return new Side({method: 'atMost', args: [score]});
}
function typeIn(...types) {
return new Side({method: 'typeIn', args: types});
}
/**
* Pull nodes that conform to multiple conditions at once.
*
* For example: ``and(type('title'), type('english'))``
*
* Caveats: ``and`` supports only simple ``type`` calls as arguments for now,
* and it may fire off more rules as prerequisites than strictly necessary.
* ``not`` and ``or`` don't exist yet, but you can express ``or`` the long way
* around by having 2 rules with identical RHSs.
*/
function and(...lhss) {
return new Side({method: 'and', args: lhss});
}
/**
* Experimental. For each :term:`fnode` from ``typeCallA``, find the closest
* node from ``typeCallB``, and attach it as a note. The note is attached to
* the type specified by the RHS, defaulting to the type of ``typeCallA``. If
* no nodes are emitted from ``typeCallB``, do nothing.
*
* For example... ::
*
* nearest(type('image'), type('price'))
*
* The score of the ``typeCallA`` can be added to the new type's score by using
* :func:`conserveScore` (though this routine has since been removed)::
*
* rule(nearest(type('image'), type('price')),
* type('imageWithPrice').score(2).conserveScore())
*
* Caveats: ``nearest`` supports only simple ``type`` calls as arguments ``a``
* and ``b`` for now.
*
* @arg distance {function} A function that takes 2 fnodes and returns a
* numerical distance between them. Included options are :func:`distance`,
* which is a weighted topological distance, and :func:`euclidean`, which
* is a spatial distance.
*/
function nearest(typeCallA, typeCallB, distance = euclidean) {
return new Side({method: 'nearest', args: [typeCallA, typeCallB, distance]});
}
/**
* A chain of calls that can be compiled into a Rhs or Lhs, depending on its
* position in a Rule. This lets us use type() as a leading call for both RHSs
* and LHSs. I would prefer to do this dynamically, but that wouldn't compile
* down to old versions of ES.
*/
class Side {
constructor(...calls) {
// A "call" is like {method: 'dom', args: ['p.smoo']}.
this._calls = calls;
}
max() {
return this._and('max');
}
bestCluster(options) {
return this._and('bestCluster', options);
}
props(callback) {
return this._and('props', callback);
}
type(...types) {
return this._and('type', ...types);
}
note(callback) {
return this._and('note', callback);
}
score(scoreOrCallback) {
return this._and('score', scoreOrCallback);
}
atMost(score) {
return this._and('atMost', score);
}
typeIn(...types) {
return this._and('typeIn', ...types);
}
and(...lhss) {
return this._and('and', lhss);
}
_and(method, ...args) {
return new this.constructor(...this._calls.concat({method, args}));
}
asLhs() {
return this._asSide(Lhs.fromFirstCall(this._calls[0]), this._calls.slice(1));
}
asRhs() {
return this._asSide(new InwardRhs(), this._calls);
}
_asSide(side, calls) {
for (let call of calls) {
side = side[call.method](...call.args);
}
return side;
}
when(pred) {
return this._and('when', pred);
}
}
/**
* A wrapper around a DOM node, storing :term:`types<type>`,
* :term:`scores<score>`, and :term:`notes<note>` that apply to it
*/
class Fnode {
/**
* @arg element The DOM element described by the fnode.
* @arg ruleset The ruleset which created the fnode.
*/
constructor(element, ruleset) {
if (element === undefined) {
throw new Error("Someone tried to make a fnode without specifying the element they're talking about.");
}
/**
* The raw DOM element this fnode describes
*/
this.element = element;
this._ruleset = ruleset;
// A map of type => {score: number, note: anything}. `score` is always
// present and defaults to 1. A note is set iff `note` is present and
// not undefined.
this._types = new Map();
// Note: conserveScore() is temporarily absent in 3.0.
//
// By default, a fnode has an independent score for each of its types.
// However, a RHS can opt to conserve the score of an upstream type,
// carrying it forward into another type. To avoid runaway scores in
// the case that multiple rules choose to do this, we limit the
// contribution of an upstream type's score to being multiplied in a
// single time. In this set, we keep track of which upstream types'
// scores have already been multiplied into each type. LHS fnode => Set
// of types whose score for that node have been multiplied into this
// node's score.
this._conservedScores = new Map();
}
/**
* Return whether the given type is one of the ones attached to the wrapped
* HTML node.
*/
hasType(type) {
// Run type(theType) against the ruleset to make sure this doesn't
// return false just because we haven't lazily run certain rules yet.
this._computeType(type);
return this._types.has(type);
}
/**
* Return the confidence, in the range (0, 1), that the fnode belongs to the
* given type, 0 by default.
*/
scoreFor(type) {
this._computeType(type);
return sigmoid(this._ruleset.weightedScore(this.scoresSoFarFor(type)) +
getDefault(this._ruleset.biases, type, () => 0));
}
/**
* Return the fnode's note for the given type, ``undefined`` if none.
*/
noteFor(type) {
this._computeType(type);
return this._noteSoFarFor(type);
}
/**
* Return whether this fnode has a note for the given type.
*
* ``undefined`` is not considered a note and may be overwritten with
* impunity.
*/
hasNoteFor(type) {
this._computeType(type);
return this._hasNoteSoFarFor(type);
}
// -------- Methods below this point are private to the framework. --------
/**
* Return an iterable of the types tagged onto me by rules that have
* already executed.
*/
typesSoFar() {
return this._types.keys();
}
_noteSoFarFor(type) {
return this._typeRecordForGetting(type).note;
}
_hasNoteSoFarFor(type) {
return this._noteSoFarFor(type) !== undefined;
}
/**
* Return the score thus far computed on me for a certain type. Doesn't
* implicitly run any rules. If no score has yet been determined for the
* given type, return undefined.
*/
scoresSoFarFor(type) {
return this._typeRecordForGetting(type).score;
}
/**
* Add a given number to one of our per-type scores. Implicitly assign us
* the given type. Keep track of which rule it resulted from so we can
* later mess with the coeffs.
*/
addScoreFor(type, score, ruleName) {
this._typeRecordForSetting(type).score.set(ruleName, score);
}
/**
* Set the note attached to one of our types. Implicitly assign us that
* type if we don't have it already.
*/
setNoteFor(type, note) {
if (this._hasNoteSoFarFor(type)) {
if (note !== undefined) {
throw new Error(`Someone (likely the right-hand side of a rule) tried to add a note of type ${type} to an element, but one of that type already exists. Overwriting notes is not allowed, since it would make the order of rules matter.`);
}
// else the incoming note is undefined and we already have the
// type, so it's a no-op
} else {
// Apply either a type and note or just a type (which means a note
// that is undefined):
this._typeRecordForSetting(type).note = note;
}
}
/**
* Return a score/note record for a type, creating it if it doesn't exist.
*/
_typeRecordForSetting(type) {
return setDefault(this._types, type, () => ({score: new Map()}));
}
/**
* Manifest a temporary type record for reading, working around the lack of
* a .? operator in JS.
*/
_typeRecordForGetting(type) {
return getDefault(this._types, type, () => ({score: new Map()}));
}
/**
* Make sure any scores, notes, and type-tagging for the given type are
* computed for my element.
*/
_computeType(theType) {
if (!this._types.has(theType)) { // Prevent infinite recursion when an A->A rule looks at A's note in a callback.
this._ruleset.get(type(theType));
}
}
}
/**
* Construct and return the proper type of rule class based on the
* inwardness/outwardness of the RHS.
*
* @arg lhs {Lhs} The left-hand side of the rule
* @arg rhs {Rhs} The right-hand side of the rule
* @arg options {object} Other, optional information about the rule.
* Currently, the only recognized option is ``name``, which points to a
* string that uniquely identifies this rule in a ruleset. The name
* correlates this rule with one of the coefficients passed into
* :func:`ruleset`. If no name is given, an identifier is assigned based on
* the index of this rule in the ruleset, but that is, of course, brittle.
*/
function rule(lhs, rhs, options) {
// Since out() is a valid call only on the RHS (unlike type()), we can take
// a shortcut here: any outward RHS will already be an OutwardRhs; we don't
// need to sidetrack it through being a Side. And OutwardRhs has an asRhs()
// that just returns itself.
if (typeof rhs === 'string') {
rhs = out(rhs);
}
return new ((rhs instanceof OutwardRhs) ? OutwardRule : InwardRule)(lhs, rhs, options);
}
let nextRuleNumber = 0;
function newInternalRuleName() {
return '_' + nextRuleNumber++;
}
/**
* We place the in/out distinction in Rules because it determines whether the
* RHS result is cached, and Rules are responsible for maintaining the rulewise
* cache ruleset.ruleCache.
*/
class Rule { // abstract
constructor(lhs, rhs, options) {
this.lhs = lhs.asLhs();
this.rhs = rhs.asRhs();
// TODO: Make auto-generated rule names be based on the out types of
// the rules, e.g. _priceish_4. That way, adding rules for one type
// won't make the coeffs misalign for another.
this.name = (options ? options.name : undefined) || newInternalRuleName();
}
/**
* Return a NiceSet of the rules that this one shallowly depends on in the
* given ruleset. In a BoundRuleset, this may include rules that have
* already been executed.
*
* Depend on emitters of any LHS type this rule finalizes. (See
* _typesFinalized for a definition.) Depend on adders of any other LHS
* types (because, after all, we need to know what nodes have that type in
* order to find the set of LHS nodes). This works for simple rules and
* complex ones like and().
*
* Specific examples (where A is a type):
* * A.max->* depends on anything emitting A.
* * Even A.max->A depends on A emitters, because we have to have all the
* scores factored in first. For example, what if we did
* max(A)->score(.5)?
* * A->A depends on anything adding A.
* * A->(something other than A) depends on anything emitting A. (For
* example, we need the A score finalized before we could transfer it to
* B using conserveScore().)
* * A->out() also depends on anything emitting A. Fnode methods aren't
* smart enough to lazily run emitter rules as needed. We could make them
* so if it was shown to be an advantage.
*/
prerequisites(ruleset) {
// Optimization: we could cache the result of this when in a compiled (immutable) ruleset.
// Extend prereqs with rules derived from each of the given types. If
// no rules are found, raise an exception, as that indicates a
// malformed ruleset.
function extendOrThrow(prereqs, types, ruleGetter, verb) {
for (let type of types) {
const rules = ruleGetter(type);
if (rules.length > 0) {
prereqs.extend(rules);
} else {
throw new Error(`No rule ${verb} the "${type}" type, but another rule needs it as input.`);
}
}
}
const prereqs = new NiceSet();
// Add finalized types:
extendOrThrow(prereqs, this._typesFinalized(), type => ruleset.inwardRulesThatCouldEmit(type), 'emits');
// Add mentioned types:
// We could say this.lhs.typesMentioned().minus(typesFinalized) as an
// optimization. But since types mentioned are a superset of types
// finalized and rules adding are a subset of rules emitting, we get
// the same result without.
extendOrThrow(prereqs, this.lhs.typesMentioned(), type => ruleset.inwardRulesThatCouldAdd(type), 'adds');
return prereqs;
}
/**
* Return the types that this rule finalizes.
*
* To "finalize" a type means to make sure we're finished running all
* possible rules that might change a node's score or notes w.r.t. a given
* type. This is generally done because we're about to use those data for
* something, like computing a new type's score or or an aggregate
* function. Exhaustively, we're about to...
* * change the type of the nodes or
* * aggregate all nodes of a type
*
* This base-class implementation just returns what aggregate functions
* need, since that need spans inward and outward rules.
*
* @return Set of types
*/
_typesFinalized() {
// Get the types that are fed to aggregate functions. Aggregate
// functions are more demanding than a simple type() LHS. A type() LHS
// itself does not finalize its nodes because the things it could do to
// them without changing their type (adding notes, adding to score)
// are immutable or commutative (respectively). Thus, we require a RHS
// type change in order to require finalization of a simple type()
// mention. A max(B), OTOH, is not commutative with other B->B rules
// (imagine type(B).max()->score(.5)), so it must depend on B emitters
// and thus finalize B. (This will have to be relaxed or rethought when
// we do the max()/atMost() optimization. Perhaps we can delegate to
// aggregate functions up in Rule.prerequisites() to ask what their
// prereqs are. If they implement such an optimization, they can reply.
// Otherwise, we can assume they are all the nodes of their type.)
//
// TODO: Could arbitrary predicates (once we implement those) matter
// too? Maybe it's not just aggregations.
const type = this.lhs.aggregatedType();
return (type === undefined) ? new NiceSet() : new NiceSet([type]);
}
}
/**
* A normal rule, whose results head back into the Fathom knowledgebase, to be
* operated on by further rules.
*/
class InwardRule extends Rule {
// TODO: On construct, complain about useless rules, like a dom() rule that
// doesn't assign a type.
/**
* Return an iterable of the fnodes emitted by the RHS of this rule.
* Side effect: update ruleset's store of fnodes, its accounting of which
* rules are done executing, and its cache of results per type.
*/
results(ruleset) {
if (ruleset.doneRules.has(this)) { // shouldn't happen
throw new Error('A bug in Fathom caused results() to be called on an inward rule twice. That could cause redundant score contributions, etc.');
}
const self = this;
// For now, we consider most of what a LHS computes to be cheap, aside
// from type() and type().max(), which are cached by their specialized
// LHS subclasses.
const leftResults = this.lhs.fnodes(ruleset);
// Avoid returning a single fnode more than once. LHSs uniquify
// themselves, but the RHS can change the element it's talking
// about and thus end up with dupes.
const returnedFnodes = new Set();
// Merge facts into fnodes:
forEach(
// leftResult can be either a fnode or a {fnode, rhsTransformer} pair.
function updateFnode(leftResult) {
const leftType = self.lhs.guaranteedType();
// Get a fnode and a RHS transformer, whether a plain fnode is
// returned or a {fnode, rhsTransformer} pair:
const {fnode: leftFnode = leftResult, rhsTransformer = identity} = leftResult;
// Grab the fact from the RHS, and run the LHS's optional
// transformer over it to pick up anything special it wants to
// do:
const fact = rhsTransformer(self.rhs.fact(leftFnode, leftType));
self.lhs.checkFact(fact);
const rightFnode = ruleset.fnodeForElement(fact.element || leftFnode.element);
// If the RHS doesn't specify a type, default to the
// type of the LHS, if any:
const rightType = fact.type || self.lhs.guaranteedType();
if (fact.score !== undefined) {
if (rightType !== undefined) {
rightFnode.addScoreFor(rightType, fact.score, self.name);
} else {
throw new Error(`The right-hand side of a rule specified a score (${fact.score}) with neither an explicit type nor one we could infer from the left-hand side.`);
}
}
if (fact.type !== undefined || fact.note !== undefined) {
// There's a reason to call setNoteFor.
if (rightType === undefined) {
throw new Error(`The right-hand side of a rule specified a note (${fact.note}) with neither an explicit type nor one we could infer from the left-hand side. Notes are per-type, per-node, so that's a problem.`);
} else {
rightFnode.setNoteFor(rightType, fact.note);
}
}
returnedFnodes.add(rightFnode);
},
leftResults);
// Update ruleset lookup tables.
// First, mark this rule as done:
ruleset.doneRules.add(this);
// Then, stick each fnode in typeCache under all applicable types.
// Optimization: we really only need to loop over the types
// this rule can possibly add.
for (let fnode of returnedFnodes) {
for (let type of fnode.typesSoFar()) {
setDefault(ruleset.typeCache, type, () => new Set()).add(fnode);
}
}
return returnedFnodes.values();
}
/**
* Return a Set of the types that could be emitted back into the system.
* To emit a type means to either to add it to a fnode emitted from the RHS
* or to leave it on such a fnode where it already exists.
*/
typesItCouldEmit() {
const rhs = this.rhs.possibleEmissions();
if (!rhs.couldChangeType && this.lhs.guaranteedType() !== undefined) {
// It's a b -> b rule.
return new Set([this.lhs.guaranteedType()]);
} else if (rhs.possibleTypes.size > 0) {
// We can prove the type emission from the RHS alone.
return rhs.possibleTypes;
} else {
throw new Error('Could not determine the emitted type of a rule because its right-hand side calls props() without calling typeIn().');
}
}
/**
* Return a Set of types I could add to fnodes I output (where the fnodes
* did not already have them).
*/
typesItCouldAdd() {
const ret = new Set(this.typesItCouldEmit());
ret.delete(this.lhs.guaranteedType());
return ret;
}
/**
* Add the types we could change to the superclass's result.
*/
_typesFinalized() {
const self = this;
function typesThatCouldChange() {
const ret = new NiceSet();
// Get types that could change:
const emissions = self.rhs.possibleEmissions();
if (emissions.couldChangeType) {
// Get the possible guaranteed combinations of types on the LHS
// (taking just this LHS into account). For each combo, if the RHS
// adds a type that's not in the combo, the types in the combo get
// unioned into ret.
for (let combo of self.lhs.possibleTypeCombinations()) {
for (let rhsType of emissions.possibleTypes) {
if (!combo.has(rhsType)) {
ret.extend(combo);
break;
}
}
}
}
// Optimization: the possible combos could be later expanded to be
// informed by earlier rules which add the types mentioned in the LHS.
// If the only way for something to get B is to have Q first, then we
// can add Q to each combo and end up with fewer types finalized. Would
// this imply the existence of a Q->B->Q cycle and thus be impossible?
// Think about it. If we do this, we can centralize that logic here,
// rather than repeating it in all the Lhs subclasses).
return ret;
}
return typesThatCouldChange().extend(super._typesFinalized());
}
}
/**
* A rule whose RHS is an out(). This represents a final goal of a ruleset.
* Its results go out into the world, not inward back into the Fathom
* knowledgebase.
*/
class OutwardRule extends Rule {
/**
* Compute the whole thing, including any .through() and .allThrough().
* Do not mark me done in ruleset.doneRules; out rules are never marked as
* done so they can be requested many times without having to cache their
* (potentially big, since they aren't necessarily fnodes?) results. (We
* can add caching later if it proves beneficial.)
*/
results(ruleset) {
/**
* From a LHS's ``{fnode, rhsTransform}`` object or plain fnode, pick off just
* the fnode and return it.
*/
function justFnode(fnodeOrStruct) {
return (fnodeOrStruct instanceof Fnode) ? fnodeOrStruct : fnodeOrStruct.fnode;
}
return this.rhs.allCallback(map(this.rhs.callback, map(justFnode, this.lhs.fnodes(ruleset))));
}
/**
* @return the key under which the output of this rule will be available
*/
key() {
return this.rhs.key;
}
/**
* OutwardRules finalize all types mentioned.
*/
_typesFinalized() {
return this.lhs.typesMentioned().extend(super._typesFinalized());
}
}
/**
* A shortcut for creating a new :class:`Ruleset`, for symmetry with
* :func:`rule`
*/
function ruleset(rules, coeffs = [], biases = []) {
return new Ruleset(rules, coeffs, biases);
}
/**
* An unbound ruleset. When you bind it by calling :func:`~Ruleset.against()`,
* the resulting :class:`BoundRuleset` will be immutable.
*/
class Ruleset {
/**
* @arg rules {Array} Rules returned from :func:`rule`
* @arg coeffs {Map} A map of rule names to numerical weights, typically
* returned by the :doc:`trainer<training>`. Example:
* ``[['someRuleName', 5.04], ...]``. If not given, coefficients
* default to 1.
* @arg biases {object} A map of type names to neural-net biases. These
* enable accurate confidence estimates. Example: ``[['someType',
* -2.08], ...]``. If absent, biases default to 0.
*/
constructor(rules, coeffs = [], biases = []) {
this._inRules = [];
this._outRules = new Map(); // key -> rule
this._rulesThatCouldEmit = new Map(); // type -> [rules]
this._rulesThatCouldAdd = new Map(); // type -> [rules]
// Private to the framework:
this._coeffs = new Map(coeffs); // rule name => coefficient
this.biases = new Map(biases); // type name => bias
// Separate rules into out ones and in ones, and sock them away. We do
// this here so mistakes raise errors early.
for (let rule of rules) {
if (rule instanceof InwardRule) {
this._inRules.push(rule);
// Keep track of what inward rules can emit or add:
// TODO: Combine these hashes for space efficiency:
const emittedTypes = rule.typesItCouldEmit();
for (let type of emittedTypes) {
setDefault(this._rulesThatCouldEmit, type, () => []).push(rule);
}
for (let type of rule.typesItCouldAdd()) {
setDefault(this._rulesThatCouldAdd, type, () => []).push(rule);
}
} else if (rule instanceof OutwardRule) {
this._outRules.set(rule.key(), rule);
} else {
throw new Error(`This element of ruleset()'s first param wasn't a rule: ${rule}`);
}
}
}
/**
* Commit this ruleset to running against a specific DOM tree or subtree.
*
* When run against a subtree, the root of the subtree is not considered as
* a possible match.
*
* This doesn't actually modify the Ruleset but rather returns a fresh
* :class:`BoundRuleset`, which contains caches and other stateful, per-DOM
* bric-a-brac.
*/
against(doc) {
return new BoundRuleset(doc,
this._inRules,
this._outRules,
this._rulesThatCouldEmit,
this._rulesThatCouldAdd,
this._coeffs,
this.biases);
}
/**
* Return all the rules (both inward and outward) that make up this ruleset.
*
* From this, you can construct another ruleset like this one but with your
* own rules added.
*/
rules() {
return Array.from([...this._inRules, ...this._outRules.values()]);
}
}
/**
* A ruleset that is earmarked to analyze a certain DOM
*
* Carries a cache of rule results on that DOM. Typically comes from
* :meth:`~Ruleset.against`.
*/
class BoundRuleset {
/**
* @arg inRules {Array} Non-out() rules
* @arg outRules {Map} Output key -> out() rule
*/
constructor(doc, inRules, outRules, rulesThatCouldEmit, rulesThatCouldAdd, coeffs, biases) {
this.doc = doc;
this._inRules = inRules;
this._outRules = outRules;
this._rulesThatCouldEmit = rulesThatCouldEmit;
this._rulesThatCouldAdd = rulesThatCouldAdd;
this._coeffs = coeffs;
// Private, for the use of only helper classes:
this.biases = biases;
this._clearCaches();
this.elementCache = new WeakMap(); // DOM element => fnode about it
this.doneRules = new Set(); // InwardRules that have been executed. OutwardRules can be executed more than once because they don't change any fnodes and are thus idempotent.
}
/**
* Change my coefficients and biases after construction.
*
* @arg coeffs See the :class:`Ruleset` constructor.
* @arg biases See the :class:`Ruleset` constructor.
*/
setCoeffsAndBiases(coeffs, biases = []) {
// Destructuring assignment doesn't make it through rollup properly
// (https://github.com/rollup/rollup-plugin-commonjs/issues/358):
this._coeffs = new Map(coeffs);
this.biases = new Map(biases);
this._clearCaches();
}
/**
* Clear the typeCache and maxCache, usually in the wake of changing
* ``this._coeffs``, because both of thise depend on weighted scores.
*/
_clearCaches() {
this.maxCache = new Map(); // type => Array of max fnode (or fnodes, if tied) of this type
this.typeCache = new Map(); // type => Set of all fnodes of this type found so far. (The dependency resolution during execution ensures that individual types will be comprehensive just in time.)
}
/**
* Return an array of zero or more fnodes.
* @arg thing {string|Lhs|Node} Can be
*
* (1) A string which matches up with an "out" rule in the ruleset.
* If the out rule uses through(), the results of through's
* callback (which might not be fnodes) will be returned.
* (2) An arbitrary LHS which we calculate and return the results of.
* (3) A DOM node, for which we will return the corresponding fnode.
*
* Results are cached for cases (1) and (3).
*/
get(thing) {
if (typeof thing === 'string') {
if (this._outRules.has(thing)) {
return Array.from(this._execute(this._outRules.get(thing)));
} else {
throw new Error(`There is no out() rule with key "${thing}".`);
}
} else if (isDomElement(thing)) {
// Return the fnode and let it run type(foo) on demand, as people
// ask it things like scoreFor(foo).
return this.fnodeForElement(thing);
} else if (thing.asLhs !== undefined) {
// Make a temporary out rule, and run it. This may add things to
// the ruleset's cache, but that's fine: it doesn't change any
// future results; it just might make them faster. For example, if
// you ask for .get(type('smoo')) twice, the second time will be a
// cache hit.
const outRule = rule(thing, out(Symbol('outKey')));
return Array.from(this._execute(outRule));
} else {
throw new Error('ruleset.get() expects a string, an expression like on the left-hand side of a rule, or a DOM node.');
}
}
/**
* Return the weighted sum of the per-rule, per-type scores from a fnode.
*
* @arg mapOfScores a Map of rule name to the [0, 1] score it computed for
* the type in question
*/
weightedScore(mapOfScores) {
let total = 0;
for (const [name, score] of mapOfScores) {
total += score * getDefault(this._coeffs, name, () => 1);
}
return total;
}
// Provide an opaque context object to be made available to all ranker
// functions.
// context (object) {
// self.context = object;
// }
// -------- Methods below this point are private to the framework. --------
/**
* Return all the thus-far-unexecuted rules that will have to run to run
* the requested rule, in the form of Map(prereq: [rulesItIsNeededBy]).
*/
_prerequisitesTo(rule, undonePrereqs = new Map()) {
for (let prereq of rule.prerequisites(this)) {
if (!this.doneRules.has(prereq)) {
// prereq is not already run. (If it were, we wouldn't care
// about adding it to the graph.)
const alreadyAdded = undonePrereqs.has(prereq);
setDefault(undonePrereqs, prereq, () => []).push(rule);
// alreadyAdded means we've already computed the prereqs of
// this prereq and added them to undonePrereqs. So, now
// that we've hooked up the rule to this prereq in the
// graph, we can stop. But, if we haven't, then...
if (!alreadyAdded) {
this._prerequisitesTo(prereq, undonePrereqs);
}
}
}
return undonePrereqs;
}
/**
* Run the given rule (and its dependencies, in the proper order), and
* return its results.
*
* The caller is responsible for ensuring that _execute() is not called
* more than once for a given InwardRule, lest non-idempotent
* transformations, like score contributions, be applied to fnodes more
* than once.
*
* The basic idea is to sort rules in topological order (according to input
* and output types) and then run them. On top of that, we do some
* optimizations. We keep a cache of results by type (whether partial or
* comprehensive--either way, the topology ensures that any
* non-comprehensive typeCache entry is made comprehensive before another
* rule needs it). And we prune our search for prerequisite rules at the
* first encountered already-executed rule.
*/
_execute(rule) {
const prereqs = this._prerequisitesTo(rule);
let sorted;
try {
sorted = [rule].concat(toposort(prereqs.keys(),
prereq => prereqs.get(prereq)));
} catch (exc) {
if (exc instanceof CycleError) {
throw new CycleError('There is a cyclic dependency in the ruleset.');
} else {
throw exc;
}
}
let fnodes;
for (let eachRule of reversed(sorted)) {
// Sock each set of results away in this.typeCache:
fnodes = eachRule.results(this);
}
return Array.from(fnodes);
}
/** @return {Rule[]} */
inwardRulesThatCouldEmit(type) {
return getDefault(this._rulesThatCouldEmit, type, () => []);
}
/** @return {Rule[]} */
inwardRulesThatCouldAdd(type) {
return getDefault(this._rulesThatCouldAdd, type, () => []);
}
/**
* @return the Fathom node that describes the given DOM element. This does
* not trigger any execution, so the result may be incomplete.
*/
fnodeForElement(element) {
return setDefault(this.elementCache,
element,
() => new Fnode(element, this));
}
}
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
const version = '3.7.3';
export { and, atMost, clusters$1 as clusters, dom, element, exceptions, nearest, note, out, props, rule, ruleset, score, type, typeIn, utilsForFrontend as utils, version };