From 6bf0a5cb5034a7e684dcc3500e841785237ce2dd Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 19:32:43 +0200 Subject: Adding upstream version 1:115.7.0. Signed-off-by: Daniel Baumann --- .../components/im/modules/GlodaIMSearcher.sys.mjs | 352 +++++++++++++++++++++ 1 file changed, 352 insertions(+) create mode 100644 comm/mail/components/im/modules/GlodaIMSearcher.sys.mjs (limited to 'comm/mail/components/im/modules/GlodaIMSearcher.sys.mjs') diff --git a/comm/mail/components/im/modules/GlodaIMSearcher.sys.mjs b/comm/mail/components/im/modules/GlodaIMSearcher.sys.mjs new file mode 100644 index 0000000000..f97519ddea --- /dev/null +++ b/comm/mail/components/im/modules/GlodaIMSearcher.sys.mjs @@ -0,0 +1,352 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +const { Gloda } = ChromeUtils.import( + "resource:///modules/gloda/GlodaPublic.jsm" +); + +/** + * How much time boost should a 'score point' amount to? The authoritative, + * incontrivertible answer, across all time and space, is a week. + * Note that gloda stores conversation timestamps in seconds. + */ +// var FUZZSCORE_TIMESTAMP_FACTOR = 60 * 60 * 24 * 7; + +// var RANK_USAGE = +// "glodaRank(matchinfo(imConversationsText), 1.0, 2.0, 2.0, 1.5, 1.5)"; + +var DASCORE = "imConversations.time"; +// "(((" + RANK_USAGE + ") * " + +// FUZZSCORE_TIMESTAMP_FACTOR + +// ") + imConversations.time)"; + +/** + * A new optimization decision we are making is that we do not want to carry + * around any data in our ephemeral tables that is not used for whittling the + * result set. The idea is that the btree page cache or OS cache is going to + * save us from the disk seeks and carrying around the extra data is just going + * to be CPU/memory churn that slows us down. + * + * Additionally, we try and avoid row lookups that would have their results + * discarded by the LIMIT. Because of limitations in FTS3 (which might + * be addressed in FTS4 by a feature request), we can't avoid the 'imConversations' + * lookup since that has the message's date and static notability but we can + * defer the 'imConversationsText' lookup. + * + * This is the access pattern we are after here: + * 1) Order the matches with minimized lookup and result storage costs. + * - The innermost MATCH does the doclist magic and provides us with + * matchinfo() support which does not require content row retrieval + * from imConversationsText. Unfortunately, this is not enough to whittle anything + * because we still need static interestingness, so... + * - Based on the match we retrieve the date and notability for that row from + * 'imConversations' using this in conjunction with matchinfo() to provide a score + * that we can then use to LIMIT our results. + * 2) We reissue the MATCH query so that we will be able to use offsets(), but + * we intersect the results of this MATCH against our LIMITed results from + * step 1. + * - We use 'docid IN (phase 1 query)' to accomplish this because it results in + * efficient lookup. If we just use a join, we get O(mn) performance because + * a cartesian join ends up being performed where either we end up performing + * the fulltext query M times and table scan intersect with the results from + * phase 1 or we do the fulltext once but traverse the entire result set from + * phase 1 N times. + * - We believe that the re-execution of the MATCH query should have no disk + * costs because it should still be cached by SQLite or the OS. In the case + * where memory is so constrained this is not true our behavior is still + * probably preferable than the old way because that would have caused lots + * of swapping. + * - This part of the query otherwise resembles the basic gloda query but with + * the inclusion of the offsets() invocation. The imConversations table lookup + * should not involve any disk traffic because the pages should still be + * cached (SQLite or OS) from phase 1. The imConversationsText lookup is new, and + * this is the major disk-seek reduction optimization we are making. (Since + * we avoid this lookup for all of the documents that were excluded by the + * LIMIT.) Since offsets() also needs to retrieve the row from imConversationsText + * there is a nice synergy there. + */ +var NUEVO_FULLTEXT_SQL = + "SELECT imConversations.*, imConversationsText.*, offsets(imConversationsText) AS osets " + + "FROM imConversationsText, imConversations " + + "WHERE" + + " imConversationsText MATCH ?1 " + + " AND imConversationsText.docid IN (" + + "SELECT docid " + + "FROM imConversationsText JOIN imConversations ON imConversationsText.docid = imConversations.id " + + "WHERE imConversationsText MATCH ?1 " + + "ORDER BY " + + DASCORE + + " DESC " + + "LIMIT ?2" + + " )" + + " AND imConversations.id = imConversationsText.docid"; + +function identityFunc(x) { + return x; +} + +function oneLessMaxZero(x) { + if (x <= 1) { + return 0; + } + return x - 1; +} + +function reduceSum(accum, curValue) { + return accum + curValue; +} + +/* + * Columns are: body, subject, attachment names, author, recipients + */ + +/** + * Scores if all search terms match in a column. We bias against author + * slightly and recipient a bit more in this case because a search that + * entirely matches just on a person should give a mention of that person + * in the subject or attachment a fighting chance. + * Keep in mind that because of our indexing in the face of address book + * contacts (namely, we index the name used in the e-mail as well as the + * display name on the address book card associated with the e-mail address) + * a contact is going to bias towards matching multiple times. + */ +var COLUMN_ALL_MATCH_SCORES = [4, 20, 20, 16, 12]; +/** + * Score for each distinct term that matches in the column. This is capped + * by COLUMN_ALL_SCORES. + */ +var COLUMN_PARTIAL_PER_MATCH_SCORES = [1, 4, 4, 4, 3]; +/** + * If a term matches multiple times, what is the marginal score for each + * additional match. We count the total number of matches beyond the + * first match for each term. In other words, if we have 3 terms which + * matched 5, 3, and 0 times, then the total from our perspective is + * (5 - 1) + (3 - 1) + 0 = 4 + 2 + 0 = 6. We take the minimum of that value + * and the value in COLUMN_MULTIPLE_MATCH_LIMIT and multiply by the value in + * COLUMN_MULTIPLE_MATCH_SCORES. + */ +var COLUMN_MULTIPLE_MATCH_SCORES = [1, 0, 0, 0, 0]; +var COLUMN_MULTIPLE_MATCH_LIMIT = [10, 0, 0, 0, 0]; + +/** + * Score the message on its offsets (from stashedColumns). + */ +function scoreOffsets(aMessage, aContext) { + let score = 0; + + let termTemplate = aContext.terms.map(_ => 0); + // for each column, a list of the incidence of each term + let columnTermIncidence = [ + termTemplate.concat(), + termTemplate.concat(), + termTemplate.concat(), + termTemplate.concat(), + termTemplate.concat(), + ]; + + // we need a friendlyParseInt because otherwise the radix stuff happens + // because of the extra arguments map parses. curse you, map! + let offsetNums = aContext.stashedColumns[aMessage.id][0] + .split(" ") + .map(x => parseInt(x)); + for (let i = 0; i < offsetNums.length; i += 4) { + let columnIndex = offsetNums[i]; + let termIndex = offsetNums[i + 1]; + columnTermIncidence[columnIndex][termIndex]++; + } + + for (let iColumn = 0; iColumn < COLUMN_ALL_MATCH_SCORES.length; iColumn++) { + let termIncidence = columnTermIncidence[iColumn]; + if (termIncidence.every(identityFunc)) { + // Bestow all match credit. + score += COLUMN_ALL_MATCH_SCORES[iColumn]; + } else if (termIncidence.some(identityFunc)) { + // Bestow partial match credit. + score += Math.min( + COLUMN_ALL_MATCH_SCORES[iColumn], + COLUMN_PARTIAL_PER_MATCH_SCORES[iColumn] * + termIncidence.filter(identityFunc).length + ); + } + // bestow multiple match credit + score += + Math.min( + termIncidence.map(oneLessMaxZero).reduce(reduceSum, 0), + COLUMN_MULTIPLE_MATCH_LIMIT[iColumn] + ) * COLUMN_MULTIPLE_MATCH_SCORES[iColumn]; + } + + return score; +} + +/** + * The searcher basically looks like a query, but is specialized for fulltext + * search against imConversations. Most of the explicit specialization involves + * crafting a SQL query that attempts to order the matches by likelihood that + * the user was looking for it. This is based on full-text matches combined + * with an explicit (generic) interest score value placed on the message at + * indexing time (TODO). This is followed by using the more generic gloda scoring + * mechanism to explicitly score the IM conversations given the search context in + * addition to the more generic score adjusting rules. + */ +export function GlodaIMSearcher(aListener, aSearchString, aAndTerms) { + this.listener = aListener; + + this.searchString = aSearchString; + this.fulltextTerms = this.parseSearchString(aSearchString); + this.andTerms = aAndTerms != null ? aAndTerms : true; + + this.query = null; + this.collection = null; + + this.scores = null; +} + +GlodaIMSearcher.prototype = { + /** + * Number of messages to retrieve initially. + */ + get retrievalLimit() { + return Services.prefs.getIntPref( + "mailnews.database.global.search.im.limit" + ); + }, + + /** + * Parse the string into terms/phrases by finding matching double-quotes. + */ + parseSearchString(aSearchString) { + aSearchString = aSearchString.trim(); + let terms = []; + + /* + * Add the term as long as the trim on the way in didn't obliterate it. + * + * In the future this might have other helper logic; it did once before. + */ + function addTerm(aTerm) { + if (aTerm) { + terms.push(aTerm); + } + } + + while (aSearchString) { + if (aSearchString.startsWith('"')) { + let endIndex = aSearchString.indexOf(aSearchString[0], 1); + // eat the quote if it has no friend + if (endIndex == -1) { + aSearchString = aSearchString.substring(1); + continue; + } + + addTerm(aSearchString.substring(1, endIndex).trim()); + aSearchString = aSearchString.substring(endIndex + 1); + continue; + } + + let spaceIndex = aSearchString.indexOf(" "); + if (spaceIndex == -1) { + addTerm(aSearchString); + break; + } + + addTerm(aSearchString.substring(0, spaceIndex)); + aSearchString = aSearchString.substring(spaceIndex + 1); + } + + return terms; + }, + + buildFulltextQuery() { + let query = Gloda.newQuery(Gloda.lookupNoun("im-conversation"), { + noMagic: true, + explicitSQL: NUEVO_FULLTEXT_SQL, + limitClauseAlreadyIncluded: true, + // osets is 0-based column number 4 (volatile to column changes) + // save the offset column for extra analysis + stashColumns: [6], + }); + + let fulltextQueryString = ""; + + for (let [iTerm, term] of this.fulltextTerms.entries()) { + if (iTerm) { + fulltextQueryString += this.andTerms ? " " : " OR "; + } + + // Put our term in quotes. This is needed for the tokenizer to be able + // to do useful things. The exception is people clever enough to use + // NEAR. + if (/^NEAR(\/\d+)?$/.test(term)) { + fulltextQueryString += term; + } else if (term.length == 1 && term.charCodeAt(0) >= 0x2000) { + // This is a single-character CJK search query, so add a wildcard. + // Our tokenizer treats anything at/above 0x2000 as CJK for now. + fulltextQueryString += term + "*"; + } else if ( + (term.length == 2 && + term.charCodeAt(0) >= 0x2000 && + term.charCodeAt(1) >= 0x2000) || + term.length >= 3 + ) { + fulltextQueryString += '"' + term + '"'; + } + } + + query.fulltextMatches(fulltextQueryString); + query.limit(this.retrievalLimit); + + return query; + }, + + getCollection(aListenerOverride, aData) { + if (aListenerOverride) { + this.listener = aListenerOverride; + } + + this.query = this.buildFulltextQuery(); + this.collection = this.query.getCollection(this, aData); + this.completed = false; + + return this.collection; + }, + + sortBy: "-dascore", + + onItemsAdded(aItems, aCollection) { + let newScores = Gloda.scoreNounItems( + aItems, + { + terms: this.fulltextTerms, + stashedColumns: aCollection.stashedColumns, + }, + [scoreOffsets] + ); + if (this.scores) { + this.scores = this.scores.concat(newScores); + } else { + this.scores = newScores; + } + + if (this.listener) { + this.listener.onItemsAdded(aItems, aCollection); + } + }, + onItemsModified(aItems, aCollection) { + if (this.listener) { + this.listener.onItemsModified(aItems, aCollection); + } + }, + onItemsRemoved(aItems, aCollection) { + if (this.listener) { + this.listener.onItemsRemoved(aItems, aCollection); + } + }, + onQueryCompleted(aCollection) { + this.completed = true; + if (this.listener) { + this.listener.onQueryCompleted(aCollection); + } + }, +}; -- cgit v1.2.3