/* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ const EXPORTED_SYMBOLS = [ "CalExtractToken", "CalExtractParseNode", "CalExtractParser", "extendParseRule", "prepareArguments", ]; /** * CalExtractOptions holds configuration options used by CalExtractParser. * * @typedef {object} CalExtractOptions * @property {RegExp} sentenceBoundary - A pattern used to split text at the * sentence boundary. This should capture * the boundary only and not any other * part of the sentence. Use lookaheads * if needed. */ /** * @type {CalExtractOptions} */ const defaultOptions = { sentenceBoundary: /(?<=\w)[.!?]+\s(?=[A-Z0-9])|[.!?]$/, }; const FLAG_OPTIONAL = 1; const FLAG_MULTIPLE = 2; const FLAG_NONEMPTY = 4; const flagBits = new Map([ ["?", FLAG_OPTIONAL], ["+", FLAG_MULTIPLE | FLAG_NONEMPTY], ["*", FLAG_MULTIPLE | FLAG_OPTIONAL], ]); /** * CalExtractToken represents a lexical unit of valid text. These are produced * during the tokenisation stage of CalExtractParser by matching regular * expressions against a text sequence. */ class CalExtractToken { /** * Identifies the token. Should be in uppercase with no spaces for consistency. * * @type {string} */ type = ""; /** * The text captured by this token. * * @type {string[]} */ text = []; /** * Indicates which sentence in the source text the token was found. * * @type {number} */ sentence = -1; /** * Indicates the position with the sentence the token occurs. * * @type {number} */ position = -1; /** * @param {string} type * @param {string[]} text * @param {number} sentence * @param {number} position */ constructor(type, text, sentence, position) { this.type = type; this.text = text; this.sentence = sentence; this.position = position; } } /** * Function used to produce a value when a CalExtractParseRule is matched. * * @callback CallExtractParseRuleAction * @param {any[]} args - An array containing all the values produced from each * pattern in the rule when they are matched or the * CalExtractToken when lexical tokens are used instead. */ /** * CalExtractParseRule specifies a named pattern that is looked for when parsing * the tokenized source text. Patterns are a sequence of one or more CalExtactToken * types or CalExtractParseRule names. Each pattern specified can optionally * have one (and only one) of the following flags: * * 1) "?" - Optional flag, indicates a pattern may be skipped if not matched. * 2) "*" - Multiple flag, indicates a pattern may match 0 or more times. * 3) "+" - Non-empty multiple flag, indicates a pattern may match 1 or more times. * * Flags must be specified as the last character of the pattern name, example: * ["subject", "text?", "MEET", "text*", "time+"] * * @typedef {object} CalExtractParseRule * * @property {string} name - The name of the rule that can * be used in other patterns. * Should be lowercase for * consistency. * @property {string[]} patterns - The pattern that will be * searched for on the tokenized * string. Can contain flags. * * @property {CalExtractParseRuleAction} action - Produces the result of the * rule being satisfied. */ /** * CalExtractExtParseRule is derived from a CalExtractParseRule to include * additional information needed during parsing. * * @typedef {CalExtractParseRule} CalExtractExtParseRule * * @property {string[]} patterns - The patterns here are stripped of * any flags. * @property {number[]} flags - An array containing the flags * specified for each patterns element. * @property {CalExtractParseNode} graph - A graph used to determine what parse * rule can be applied for an encountered * production. */ /** * CalExtractParseNode is used to represent the patterns of a CalExtractParseRule * as a graph. This graph is traversed during stack reduction until one of the * following end conditions are met: * * 1) There are no more descendant nodes. * 2) The only descendant node is the node itself (cyclic). * 3) All of the descendant nodes are optional, there are no more tokens to * shift and we have traversed the entire stack. */ class CalExtractParseNode { /** * @type {string} */ symbol = null; /** * @type {number} */ flags = null; /** * Contains each possible descendant node of this node. * * @type {CalExtractParseNode[]} */ descendants = null; static FLAG_OPTIONAL = FLAG_OPTIONAL; static FLAG_MULTIPLE = FLAG_MULTIPLE; static FLAG_NONEMPTY = FLAG_NONEMPTY; /** * @param {string} symbol - The pattern this node represents. * @param {number} flags - The computed flags assigned to the pattern. * @param {CalExtractParseNode[]} descendants - Descendant nodes of this node. */ constructor(symbol, flags, descendants = []) { this.symbol = symbol; this.flags = flags; this.descendants = descendants; } /** * Indicates this is the last node in its graph. This will always be false * for cyclic nodes. */ get isEnd() { return !this.descendants.length; } /** * Appends a new descendant to this node. * * @param {CalExtractParseNode} node - The node to append. * * @returns {CalExtractParseNode} The appended node. */ append(node) { this.descendants.push(node); return node; } /** * Provides the descendant CalExtractParseNode of this one given its symbol * name. The result depends on the following rules: * 1) If this node has a descendant that matches the name, return that node. * 2) If the node does not have a matching descendant but has descendants * with the optional flag set, delegate to those nodes. This implements * the "?" and optional aspect of "*". * 3) If none of the above produce a node, null is returned which means this * graph cannot be traversed any further. * * @returns {CalExtractParseNode|null} */ getDescendant(name) { // It is important the direct descendants are checked first. let node = this.descendants.find(node => node.symbol == name); if (node) { return node; } // Now try any optional descendants. for (let node of this.descendants) { let hit = node.isOptional() && node != this && node.getDescendant(name); if (hit) { return hit; } } return null; } /** * Indicates this node can terminate the graph if so desired. This is acceptable * if all of the descendants of this node are optional and there is nothing * more to match on the stack. * * @returns {boolean} */ canEnd() { return this.descendants.filter(desc => desc != this).every(desc => desc.isOptional()); } /** * Indicates whether this node has the optional flag set. * * @returns {boolean} */ isOptional() { return Boolean(this.flags & FLAG_OPTIONAL); } /** * Indicates whether this node is cyclic. A cyclic node is one whose only * descendants is itself thus creating a loop. This occurs one a multiple * flagged node is in the tail position of the graph. */ isCyclic() { return !this.isEnd && this.descendants.every(desc => desc == this); } } /** * CalExtractParser provides an API for detecting interesting information within * a text sequence that can be used for event detection and creation. It is a * naive implementation of a shift-reduce parser, naive in the sense that not * too much attention has been paid to optimisation or semantics. * * This parser works by first splitting the source string into sentences, then * tokenizing each using the token rules specified. The boundary for splitting * into sentences can be specified in the options object. * * After tokenisation, the parser uses the parse rules to shift/reduce each * sentence into a final result. The first parse rule is treated as the intended * rule to reduce the tokens of each sentence to. If all of the tokens have been * processed and the result is not the first rule, parsing is considered to have * failed and null is returned for that sentence. For this reason, it is a good * idea to specify parse rules that are robust but not too specific in their * patterns. */ class CalExtractParser { /** * @type {[RegExp,string?][]} */ tokenRules = []; /** * @type {CalExtractParseRule[]} */ parseRules = []; /** * @type {CalExtractOptions} */ options = null; /** * Use the static createInstance() method instead of this constructor directly. * * @param {[RegExp, string?][]} tokenRules * @param {CalExtractExtParseRule[]} parseRules * @param {CalExtractOptions} [options] - Configuration object. * * @private */ constructor(tokenRules, parseRules, options = defaultOptions) { this.tokenRules = tokenRules; this.parseRules = parseRules; this.options = options; } /** * This method creates a new CalExtractParser instance using the simpler * CalExtractParseRule interface instead of the extended one. It takes care * of creating a graph for each rule and normalizing pattern names that may * be using flags. * * @param {[RegExp, string?][]} tokenRules - A list of rules to apply during * tokenisation. The first element of each rule is a regular expression used * to detect lexical tokens and the second element is the type to assign to * the token. Order matters slightly here, in general, more complex but specific * rules should appear before simpler, more general ones. * * When specifying token rules, they should be anchored to the start of the * string via "^" or tokenize() will produce unexpected results. Some regular * expressions should also include a word boundary to prevent matching within * a large string, example: "at" in "attachment". If a string is to be matched, * but no token is desired you can omit the token type from the rule and it * will be omitted completely. * * @param {CalExtractParseRule[]} parseRules - A list of CalExtractParseRules * that will be extended then used during parsing. Multiple parse rules can * share the same name and will all be considered the same when matching patterns. * Use this to specify variations of the same rule. * * @param {CalExtractOptions} [options] - Configuration object. */ static createInstance(tokenRules, parseRules, options = defaultOptions) { return new CalExtractParser(tokenRules, parseRules.map(extendParseRule), options); } /** * Tokenizes a string to make it easier to match against the parse rule * patterns. If text is encountered that cannot be tokenized, the result for * that sentence is null. * * @param {string} str - The string to tokenize. * * @returns {CalExtractToken[][]} For each sentence encountered, a list of * CalExtractTokens. */ tokenize(str) { let allTokens = []; let sentences = str.split(this.options.sentenceBoundary).filter(Boolean); for (let i = 0; i < sentences.length; i++) { let sentence = sentences[i]; let pos = 0; let tokens = []; let buffer = ""; let matched; while (pos < sentence.length) { buffer = sentence.substr(pos); for (let [pattern, type] of this.tokenRules) { matched = pattern.exec(buffer); if (matched) { if (type) { tokens.push(new CalExtractToken(type, matched[0], i, pos)); } pos += matched[0].length; break; } } if (!matched) { // No rules for the encountered text, bail out. tokens = null; break; } } allTokens.push(tokens); } return allTokens; } /** * Parses a string into an array of values representing the final result of * parsing each sentence encountered. The elements of the resulting array * are either the result of applying the action of the first (top) parse rule * or null if we could not successfully parse the sentence. * * @param {string} str * * @returns {any[]} */ parse(str) { return this.tokenize(str).map(tokens => { if (!this.parseRules.length || !tokens) { return null; } let lookahead = null; let stack = []; while (true) { if (tokens.length) { let next = tokens.shift(); stack.push([next.type, next]); lookahead = tokens[0] ? tokens[0].type : null; while (this.reduceStack(stack, lookahead)) { continue; } } else { // Attempt to reduce anything still on the stack now that the // tokens have all been pushed. while (this.reduceStack(stack, lookahead)) { continue; } break; } } return stack.length == 1 && stack[0][0] == this.parseRules[0].name ? stack[0][1] : null; }); } /** * Attempts to reduce the given stack exactly once using the internal parsing * rules. If successful, the stack will be modified to contain the matched * rule at the location it was found. This methods modifies the stack given. * * @returns {boolean} - True if the stack was reduced false if otherwise. */ reduceStack(stack, lookahead) { for (let i = 0; i < stack.length; i++) { for (let rule of this.parseRules) { let node = rule.graph; let n = i; let matchCount = 0; while (n < stack.length && (node = node.getDescendant(stack[n][0]))) { matchCount++; if ( node.isEnd || (n == stack.length - 1 && !lookahead && (node.isCyclic() || node.canEnd())) ) { let result = [rule.name, null]; let matched = stack.splice(i, matchCount, result); result[1] = rule.action(prepareArguments(rule, matched)); return true; } n++; } } } return false; } } /** * Converts a CalExtractParseRule to a CalExtractExtParseRule. * * @param {CalExtractParseRule} rule * * @returns {CalExtractExtParseRule} */ function extendParseRule(rule) { let { name, action } = rule; let flags = []; let patterns = []; let start = new CalExtractParseNode(null, null); let graph = start; for (let pattern of rule.patterns) { let patternFlag = pattern[pattern.length - 1]; let bits = 0; // Compute the flag value. for (let [flag, value] of flagBits) { if (patternFlag == flag) { bits = bits | value; } } // Removes the flag from patterns that have them. pattern = bits ? pattern.substring(0, pattern.length - 1) : pattern; patterns.push(pattern); graph = graph.append(new CalExtractParseNode(pattern, bits)); // Create a loop node if this flag is set. if (bits & FLAG_MULTIPLE) { graph.append(graph); } flags.push(bits); } return { name, action, patterns, flags, graph: start, }; } /** * Normalizes the matched arguments to be passed to an CalExtractParseRuleAction * by ensuring the number is the same as the patterns for the action. This takes * care of converting multi matches into an array and providing "null" when * an optional pattern is unmatched. * * @param {CalExtractExtRule} rule - The rule the action belongs to. * @param {string[]} matched - An sub-array of the stack containing what * was actually matched. This array will be * modified to match the full rule (inclusive * of optional patterns). * * * @returns {Array} Arguments for a CalExtractParseRuleAction. */ function prepareArguments(rule, matched) { return rule.patterns.map((pattern, index) => { if (rule.flags[index] & FLAG_MULTIPLE) { let c = index; let arrayArg = []; while (c < matched.length && matched[c][0] == pattern) { arrayArg.push(matched[c][1]); c++; } if (!arrayArg.length) { // This rule was not matched, make a blank space for it. matched.splice(index, 0, null); } else { // Move all the matches into a single element so we match the pattern. matched.splice(index, arrayArg.length, arrayArg); } return arrayArg; } else if (matched[index] && matched[index][0] == pattern) { return matched[index][1]; } // The pattern was unmatched, it should be optional. matched.splice(index, 0, null); return null; }); }