diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 17:32:43 +0000 |
commit | 6bf0a5cb5034a7e684dcc3500e841785237ce2dd (patch) | |
tree | a68f146d7fa01f0134297619fbe7e33db084e0aa /comm/mailnews/db/gloda/test/unit/test_fts3_tokenizer.js | |
parent | Initial commit. (diff) | |
download | thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.tar.xz thunderbird-6bf0a5cb5034a7e684dcc3500e841785237ce2dd.zip |
Adding upstream version 1:115.7.0. (refs: upstream/1%115.7.0, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | comm/mailnews/db/gloda/test/unit/test_fts3_tokenizer.js | 299 |
1 files changed, 299 insertions, 0 deletions
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * This test file recycles part of test_intl.js. What we do is insert into the
 * fulltext index two messages:
 * - one has tokens 'aa' and 'bbb',
 * - one is from a previous test and has CJK characters in it.
 *
 * We want to test that the behavior of the tokenizer is as expected (namely,
 * that it drops two-letter tokens unless they're CJK bigrams), and that
 * GlodaMsgSearcher.jsm properly drops two-letter tokens (unless CJK) from the
 * search terms to avoid issuing a query that will definitely return no
 * results.
 */

var {
  assertExpectedMessagesIndexed,
  glodaTestHelperInitialize,
  waitForGlodaIndexer,
} = ChromeUtils.import("resource://testing-common/gloda/GlodaTestHelper.jsm");
var { waitForGlodaDBFlush } = ChromeUtils.import(
  "resource://testing-common/gloda/GlodaTestHelperFunctions.jsm"
);
var { queryExpect, sqlExpectCount } = ChromeUtils.import(
  "resource://testing-common/gloda/GlodaQueryHelper.jsm"
);
var { Gloda } = ChromeUtils.import("resource:///modules/gloda/GlodaPublic.jsm");
// test_fulltextsearch reads GlodaConstants.NOUN_MESSAGE, so import it
// explicitly instead of relying on some other script having defined it.
// NOTE(review): module path assumed to sit next to the other gloda modules
// imported here - confirm against the tree. `var` keeps this harmless if a
// head file already declares the same name.
var { GlodaConstants } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaConstants.jsm"
);
var { GlodaDatastore } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaDatastore.jsm"
);
var { GlodaFolder } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaDataModel.jsm"
);
var { GlodaMsgSearcher } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaMsgSearcher.jsm"
);
var { MessageGenerator, SyntheticMessageSet } = ChromeUtils.import(
  "resource://testing-common/mailnews/MessageGenerator.jsm"
);
var { MessageInjection } = ChromeUtils.import(
  "resource://testing-common/mailnews/MessageInjection.jsm"
);

/* ===== Tests ===== */

/**
 * Test phrases. Each entry carries:
 * - name: human-readable label,
 * - actual: the decoded string we expect gloda to expose,
 * - encodings: charset -> [encoded subject header, encoded body],
 * - searchPhrases: fulltext body queries and whether they should match.
 *
 * To make the encoding pairs:
 * - For the subject bit:
 *   import email
 *   h = email.Header.Header(charset=CHARSET)
 *   h.append(STRING)
 *   h.encode()
 * - For the body bit
 *   s.encode(CHARSET)
 */
var intlPhrases = [
  // -- CJK case
  {
    name: "CJK: Vending Machine",
    actual: "\u81ea\u52d5\u552e\u8ca8\u6a5f",
    encodings: {
      "utf-8": [
        "=?utf-8?b?6Ieq5YuV5ZSu6LKo5qmf?=",
        "\xe8\x87\xaa\xe5\x8b\x95\xe5\x94\xae\xe8\xb2\xa8\xe6\xa9\x9f",
      ],
    },
    searchPhrases: [
      // Match bi-gram driven matches starting from the front.
      { body: '"\u81ea\u52d5"', match: true },
    ],
  },
  // -- Regular case. Make sure two-letter tokens do not match, since the
  // tokenizer is supposed to drop them. Also make sure that a three-letter
  // token matches.
  {
    name: "Boring ASCII",
    actual: "aa bbb",
    encodings: {
      "utf-8": ["=?utf-8?q?aa_bbb?=", "aa bbb"],
    },
    searchPhrases: [
      { body: "aa", match: false },
      { body: "bbb", match: true },
    ],
  },
];

var msgGen;
var messageInjection;

add_setup(function () {
  msgGen = new MessageGenerator();
  messageInjection = new MessageInjection({ mode: "local" }, msgGen);
  glodaTestHelperInitialize(messageInjection);
});

add_task(async function test_index_cjk() {
  await indexPhrase(intlPhrases[0]);
});

add_task(async function test_index_regular() {
  await indexPhrase(intlPhrases[1]);
});

/**
 * - Check that the 'aa' token was never emitted (we don't emit two-letter
 *   tokens unless they're CJK).
 * - Check that the '\u81ea\u52d5' token was emitted, because it's CJK.
 * - Check that the 'bbb' token was duly emitted (three letters is more than two
 *   letters so it's tokenized).
 */
add_task(async function test_token_count() {
  // Force a db flush so I can investigate the database if I want.
  await waitForGlodaDBFlush();
  await sqlExpectCount(
    0,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH 'aa'"
  );
  await sqlExpectCount(
    1,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH 'bbb'"
  );
  await sqlExpectCount(
    1,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH '\u81ea\u52d5'"
  );
});

add_task(async function test_fulltextsearch_cjk() {
  await test_fulltextsearch(intlPhrases[0]);
});

add_task(async function test_fulltextsearch_regular() {
  await test_fulltextsearch(intlPhrases[1]);
});

/**
 * We make sure that the Gloda module that builds the query drops two-letter
 * tokens, otherwise this would result in an empty search (no matches for
 * two-letter tokens).
 */
add_task(async function test_query_builder() {
  // aa should be dropped, and we have one message containing the bbb token.
  await msgSearchExpectCount(1, "aa bbb");
  // The CJK part should not be dropped, and match message 1; the bbb token
  // should not be dropped, and match message 2; 0 results returned because no
  // message has the two tokens in it.
  await msgSearchExpectCount(0, "\u81ea\u52d5 bbb");
});

/**
 * For the given phrase (an entry of intlPhrases; the add_task wrappers above
 * call this once per entry), create one message per entry in the phrase's
 * "encodings" attribute, with the subject, body, and attachment name
 * populated from that encoding. Inject the messages into the inbox, wait for
 * the indexer, and verify (via verify_index) that the strings exposed by the
 * gloda representation are equal to the expected/actual value.
 *
 * Stash each created synthetic message in a resultList list on the phrase so
 * that we can use them as expected query results in |test_fulltextsearch|.
 *
 * @param {object} aPhrase - An entry of intlPhrases; gains a resultList.
 */
async function indexPhrase(aPhrase) {
  // Create a synthetic message for each of the delightful encoding types.
  const messages = [];
  aPhrase.resultList = [];
  for (const [charset, [quoted, bodyEncoded]] of Object.entries(
    aPhrase.encodings
  )) {
    const smsg = msgGen.makeMessage({
      subject: quoted,
      body: { charset, encoding: "8bit", body: bodyEncoded },
      attachments: [{ filename: quoted, body: "gabba gabba hey" }],
      // Save off the actual value for checking.
      callerData: [charset, aPhrase.actual],
    });

    messages.push(smsg);
    aPhrase.resultList.push(smsg);
  }
  const synSet = new SyntheticMessageSet(messages);
  await messageInjection.addSetsToFolders(
    [messageInjection.getInboxFolder()],
    [synSet]
  );

  await waitForGlodaIndexer();
  Assert.ok(
    ...assertExpectedMessagesIndexed([synSet], { verifier: verify_index })
  );
}

/**
 * Does the per-message verification for indexPhrase. Knows what is right for
 * each message because of the callerData attribute on the synthetic message.
 *
 * @param {object} smsg - The synthetic message; callerData is
 *   [charset, expected string] as set up by indexPhrase.
 * @param {object} gmsg - The gloda message whose subject, indexed body text
 *   and first attachment name are compared against the expected string.
 */
function verify_index(smsg, gmsg) {
  const [charset, actual] = smsg.callerData;
  const subject = gmsg.subject;
  const indexedBodyText = gmsg.indexedBodyText.trim();
  const attachmentName = gmsg.attachmentNames[0];
  dump("Using character set:\n" + charset + "\nActual:\n" + actual + "\n");
  dump("Subject:\n" + subject + "\nSubject length:\n" + subject.length + "\n");
  Assert.equal(actual, subject);
  dump("Body: " + indexedBodyText + " (len: " + indexedBodyText.length + ")\n");
  Assert.equal(actual, indexedBodyText);
  dump(
    "Attachment name:" +
      attachmentName +
      " (len: " +
      attachmentName.length +
      ")\n"
  );
  Assert.equal(actual, attachmentName);
}

/**
 * For each phrase, make sure that all of the searchPhrases either match or fail
 * to match as appropriate.
 *
 * @param {object} aPhrase - An entry of intlPhrases that indexPhrase has
 *   already processed (resultList must be populated).
 */
async function test_fulltextsearch(aPhrase) {
  for (const searchPhrase of aPhrase.searchPhrases) {
    const query = Gloda.newQuery(GlodaConstants.NOUN_MESSAGE);
    query.bodyMatches(searchPhrase.body);
    await queryExpect(query, searchPhrase.match ? aPhrase.resultList : []);
  }
}

/**
 * Pass a query string to the GlodaMsgSearcher, run the corresponding SQL query,
 * and check the resulting row count is what we want.
 *
 * Failures (storage error, cancelled query, wrong count) surface as a
 * rejection/assertion in the awaiting task. Throwing from inside the storage
 * callbacks would not reach the awaited promise and would leave the test
 * waiting forever.
 *
 * Use like so:
 *   await msgSearchExpectCount(1, "I like cheese");
 *
 * @param {number} aCount - Expected number of result rows.
 * @param {string} aFulltextStr - The user-style fulltext search string.
 */
async function msgSearchExpectCount(aCount, aFulltextStr) {
  // Let the GlodaMsgSearcher build its query
  const searcher = new GlodaMsgSearcher(null, aFulltextStr);
  const conn = GlodaDatastore.asyncConnection;
  const query = searcher.buildFulltextQuery();

  // Brace yourself, brutal monkey-patching NOW: intercept the SQL and its
  // arguments instead of letting the datastore run the query itself.
  let sql, args;
  const oldFunc = GlodaDatastore._queryFromSQLString;
  GlodaDatastore._queryFromSQLString = function (aSql, aArgs) {
    sql = aSql;
    args = aArgs;
  };
  try {
    query.getCollection();
  } finally {
    // Always undo the patch so a failure here cannot poison later tests.
    GlodaDatastore._queryFromSQLString = oldFunc;
  }

  const stmt = conn.createStatement(sql);
  let rowCount = 0;
  let completion;
  try {
    // Bind the parameters
    for (const [iBinding, bindingValue] of args.entries()) {
      GlodaDatastore._bindVariant(stmt, iBinding, bindingValue);
    }

    completion = new Promise((resolve, reject) => {
      stmt.executeAsync({
        handleResult(aResultSet) {
          while (aResultSet.getNextRow()) {
            rowCount++;
          }
        },

        handleError(aError) {
          reject(new Error("Error: " + aError.message));
        },

        handleCompletion(aReason) {
          if (aReason != Ci.mozIStorageStatementCallback.REASON_FINISHED) {
            reject(new Error("Query canceled or aborted!"));
            return;
          }
          resolve();
        },
      });
    });
  } finally {
    // As in the original flow, the statement is finalized right after the
    // async execution has been dispatched; the finally also covers a throw
    // while binding.
    stmt.finalize();
  }
  await completion;

  Assert.equal(
    rowCount,
    aCount,
    "Didn't get the expected number of rows: got " +
      rowCount +
      " expected " +
      aCount +
      " SQL: " +
      sql
  );
}