/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * This test file recycles part of test_intl.js. We insert two messages into
 * the fulltext index:
 * - one has the tokens 'aa' and 'bbb',
 * - one is from a previous test and has CJK characters in it.
 *
 * We want to test that the tokenizer behaves as expected (namely, that it
 * drops two-letter tokens unless they are CJK bigrams), and that
 * GlodaMsgSearcher.jsm properly drops two-letter tokens (unless CJK) from the
 * search terms to avoid issuing a query that would definitely return no
 * results.
 */

var {
  assertExpectedMessagesIndexed,
  glodaTestHelperInitialize,
  waitForGlodaIndexer,
} = ChromeUtils.import("resource://testing-common/gloda/GlodaTestHelper.jsm");
var { waitForGlodaDBFlush } = ChromeUtils.import(
  "resource://testing-common/gloda/GlodaTestHelperFunctions.jsm"
);
var { queryExpect, sqlExpectCount } = ChromeUtils.import(
  "resource://testing-common/gloda/GlodaQueryHelper.jsm"
);
var { Gloda } = ChromeUtils.import("resource:///modules/gloda/GlodaPublic.jsm");
// GlodaConstants provides NOUN_MESSAGE, used by test_fulltextsearch below.
var { GlodaConstants } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaConstants.jsm"
);
var { GlodaDatastore } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaDatastore.jsm"
);
var { GlodaFolder } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaDataModel.jsm"
);
var { GlodaMsgSearcher } = ChromeUtils.import(
  "resource:///modules/gloda/GlodaMsgSearcher.jsm"
);
var { MessageGenerator, SyntheticMessageSet } = ChromeUtils.import(
  "resource://testing-common/mailnews/MessageGenerator.jsm"
);
var { MessageInjection } = ChromeUtils.import(
  "resource://testing-common/mailnews/MessageInjection.jsm"
);

/* ===== Tests ===== */

/**
 * To make the encoding pairs:
 * - For the subject bit:
 *     import email
 *     h = email.Header.Header(charset=CHARSET)
 *     h.append(STRING)
 *     h.encode()
 * - For the body bit:
 *     s.encode(CHARSET)
 */
var intlPhrases = [
  // -- CJK case
  {
    name: "CJK: Vending Machine",
    actual: "\u81ea\u52d5\u552e\u8ca8\u6a5f",
    encodings: {
      "utf-8": [
        "=?utf-8?b?6Ieq5YuV5ZSu6LKo5qmf?=",
        "\xe8\x87\xaa\xe5\x8b\x95\xe5\x94\xae\xe8\xb2\xa8\xe6\xa9\x9f",
      ],
    },
    searchPhrases: [
      // Match bi-gram driven matches starting from the front.
      { body: '"\u81ea\u52d5"', match: true },
    ],
  },
  // -- Regular case. Make sure two-letter tokens do not match, since the
  // tokenizer is supposed to drop them. Also make sure that a three-letter
  // token matches.
  {
    name: "Boring ASCII",
    actual: "aa bbb",
    encodings: {
      "utf-8": ["=?utf-8?q?aa_bbb?=", "aa bbb"],
    },
    searchPhrases: [
      { body: "aa", match: false },
      { body: "bbb", match: true },
    ],
  },
];

var msgGen;
var messageInjection;

add_setup(function () {
  msgGen = new MessageGenerator();
  messageInjection = new MessageInjection({ mode: "local" }, msgGen);
  glodaTestHelperInitialize(messageInjection);
});

add_task(async function test_index_cjk() {
  await indexPhrase(intlPhrases[0]);
});

add_task(async function test_index_regular() {
  await indexPhrase(intlPhrases[1]);
});
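
/**
 * Illustrative helper, not used by the tasks above: given a CJK string, list
 * the overlapping bigrams that the fulltext tokenizer is expected to emit for
 * it (this mirrors the "bi-gram driven matches" comment on the CJK entry of
 * intlPhrases). This is a sketch of the expected tokenization, not the
 * tokenizer implementation itself.
 */
function expectedCjkBigrams(aString) {
  // Note: operates on UTF-16 code units, which is fine for the BMP
  // characters used in this test.
  let bigrams = [];
  for (let i = 0; i + 1 < aString.length; i++) {
    bigrams.push(aString.substring(i, i + 2));
  }
  return bigrams;
}
// For example, expectedCjkBigrams("\u81ea\u52d5\u552e\u8ca8\u6a5f") yields
// ["\u81ea\u52d5", "\u52d5\u552e", "\u552e\u8ca8", "\u8ca8\u6a5f"], which is
// why searching for the leading bigram "\u81ea\u52d5" is expected to match.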

/**
 * - Check that the 'aa' token was never emitted (we don't emit two-letter
 *   tokens unless they're CJK).
 * - Check that the '\u81ea\u52d5' token was emitted, because it's CJK.
 * - Check that the 'bbb' token was duly emitted (three letters is more than
 *   two letters, so it gets tokenized).
 */
add_task(async function test_token_count() {
  // Force a db flush so I can investigate the database if I want.
  await waitForGlodaDBFlush();
  await sqlExpectCount(
    0,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH 'aa'"
  );
  await sqlExpectCount(
    1,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH 'bbb'"
  );
  await sqlExpectCount(
    1,
    "SELECT COUNT(*) FROM messagesText where messagesText MATCH '\u81ea\u52d5'"
  );
});

add_task(async function test_fulltextsearch_cjk() {
  await test_fulltextsearch(intlPhrases[0]);
});

add_task(async function test_fulltextsearch_regular() {
  await test_fulltextsearch(intlPhrases[1]);
});

/**
 * Make sure that the Gloda module that builds the query drops two-letter
 * tokens; otherwise the query would contain a two-letter term that matches
 * nothing, and the search would come back empty.
 */
add_task(async function test_query_builder() {
  // 'aa' should be dropped, and we have one message containing the 'bbb' token.
  await msgSearchExpectCount(1, "aa bbb");
  // The CJK part should not be dropped and matches message 1; the 'bbb' token
  // should not be dropped and matches message 2; 0 results are returned
  // because no single message contains both tokens.
  await msgSearchExpectCount(0, "\u81ea\u52d5 bbb");
});

/**
 * For each phrase in the intlPhrases array (the test_index_* tasks above each
 * pass us one element), create a message whose subject, body, and attachment
 * name are populated using the encodings in the phrase's "encodings"
 * attribute, one encoding per message. Make sure that the strings as exposed
 * by the gloda representation are equal to the expected/actual value.
 * Stash each created synthetic message in a resultList list on the phrase so
 * that we can use them as expected query results in |test_fulltextsearch|.
 */
async function indexPhrase(aPhrase) {
  // Create a synthetic message for each of the delightful encoding types.
  let messages = [];
  aPhrase.resultList = [];
  for (let charset in aPhrase.encodings) {
    let [quoted, bodyEncoded] = aPhrase.encodings[charset];

    let smsg = msgGen.makeMessage({
      subject: quoted,
      body: { charset, encoding: "8bit", body: bodyEncoded },
      attachments: [{ filename: quoted, body: "gabba gabba hey" }],
      // Save off the actual value for checking.
      callerData: [charset, aPhrase.actual],
    });

    messages.push(smsg);
    aPhrase.resultList.push(smsg);
  }
  let synSet = new SyntheticMessageSet(messages);
  await messageInjection.addSetsToFolders(
    [messageInjection.getInboxFolder()],
    [synSet]
  );
  await waitForGlodaIndexer();
  Assert.ok(
    ...assertExpectedMessagesIndexed([synSet], { verifier: verify_index })
  );
}

/**
 * Does the per-message verification for indexPhrase. Knows what is right for
 * each message because of the callerData attribute on the synthetic message.
 */
function verify_index(smsg, gmsg) {
  let [charset, actual] = smsg.callerData;
  let subject = gmsg.subject;
  let indexedBodyText = gmsg.indexedBodyText.trim();
  let attachmentName = gmsg.attachmentNames[0];
  dump("Using character set:\n" + charset + "\nActual:\n" + actual + "\n");
  dump("Subject:\n" + subject + "\nSubject length:\n" + subject.length + "\n");
  Assert.equal(actual, subject);
  dump("Body: " + indexedBodyText + " (len: " + indexedBodyText.length + ")\n");
  Assert.equal(actual, indexedBodyText);
  dump(
    "Attachment name: " +
      attachmentName +
      " (len: " +
      attachmentName.length +
      ")\n"
  );
  Assert.equal(actual, attachmentName);
}
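
/**
 * Debugging aid, not called by any task above: a minimal sketch of how the
 * raw fulltext rows could be dumped after the db flush in test_token_count,
 * assuming the messagesText fulltext table exposes subject and body columns
 * (as the gloda schema does for the table queried by sqlExpectCount above).
 * Handy when a verify_index assertion fails and you want to see what actually
 * got indexed.
 */
async function dumpMessagesText() {
  let conn = GlodaDatastore.asyncConnection;
  let stmt = conn.createStatement(
    "SELECT docid, subject, body FROM messagesText"
  );
  await new Promise((resolve, reject) => {
    stmt.executeAsync({
      handleResult(aResultSet) {
        for (
          let row = aResultSet.getNextRow();
          row;
          row = aResultSet.getNextRow()
        ) {
          dump(
            "docid=" +
              row.getResultByName("docid") +
              " subject=" +
              row.getResultByName("subject") +
              " body=" +
              row.getResultByName("body") +
              "\n"
          );
        }
      },
      handleError(aError) {
        reject(new Error(aError.message));
      },
      handleCompletion() {
        resolve();
      },
    });
    stmt.finalize();
  });
}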

/**
 * For each phrase, make sure that all of the searchPhrases either match or
 * fail to match, as appropriate.
 */
async function test_fulltextsearch(aPhrase) {
  for (let searchPhrase of aPhrase.searchPhrases) {
    let query = Gloda.newQuery(GlodaConstants.NOUN_MESSAGE);
    query.bodyMatches(searchPhrase.body);
    await queryExpect(query, searchPhrase.match ? aPhrase.resultList : []);
  }
}

/**
 * Pass a query string to the GlodaMsgSearcher, run the corresponding SQL
 * query, and check that the resulting row count is what we expect.
 *
 * Use like so:
 *   await msgSearchExpectCount(1, "I like cheese");
 */
async function msgSearchExpectCount(aCount, aFulltextStr) {
  // Let the GlodaMsgSearcher build its query.
  let searcher = new GlodaMsgSearcher(null, aFulltextStr);
  let conn = GlodaDatastore.asyncConnection;
  let query = searcher.buildFulltextQuery();

  // Brace yourself, brutal monkey-patching NOW.
  let sql, args;
  let oldFunc = GlodaDatastore._queryFromSQLString;
  GlodaDatastore._queryFromSQLString = function (aSql, aArgs) {
    sql = aSql;
    args = aArgs;
  };
  query.getCollection();
  GlodaDatastore._queryFromSQLString = oldFunc;

  // Bind the parameters.
  let stmt = conn.createStatement(sql);
  for (let [iBinding, bindingValue] of args.entries()) {
    GlodaDatastore._bindVariant(stmt, iBinding, bindingValue);
  }

  let promiseResolve;
  let promise = new Promise(resolve => {
    promiseResolve = resolve;
  });

  let i = 0;
  stmt.executeAsync({
    handleResult(aResultSet) {
      for (
        let row = aResultSet.getNextRow();
        row;
        row = aResultSet.getNextRow()
      ) {
        i++;
      }
    },
    handleError(aError) {
      do_throw(new Error("Error: " + aError.message));
    },
    handleCompletion(aReason) {
      if (aReason != Ci.mozIStorageStatementCallback.REASON_FINISHED) {
        do_throw(new Error("Query canceled or aborted!"));
      }
      if (i != aCount) {
        throw new Error(
          "Didn't get the expected number of rows: got " +
            i +
            " expected " +
            aCount +
            " SQL: " +
            sql
        );
      }
      promiseResolve();
    },
  });
  stmt.finalize();
  await promise;
}
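
/**
 * Variant sketch, not used by the tests above: capture only the SQL string
 * and bound arguments that GlodaMsgSearcher would issue for a given search
 * string, without executing anything. It relies on the same
 * _queryFromSQLString monkey-patching trick as msgSearchExpectCount and is
 * offered purely as an illustration of that trick.
 */
function captureSearchSql(aFulltextStr) {
  let searcher = new GlodaMsgSearcher(null, aFulltextStr);
  let query = searcher.buildFulltextQuery();
  let captured = null;
  let oldFunc = GlodaDatastore._queryFromSQLString;
  GlodaDatastore._queryFromSQLString = function (aSql, aArgs) {
    captured = { sql: aSql, args: aArgs };
  };
  try {
    query.getCollection();
  } finally {
    // Always restore the real implementation, even if getCollection throws.
    GlodaDatastore._queryFromSQLString = oldFunc;
  }
  return captured;
}
// For example, inspecting the result of captureSearchSql("aa bbb") should
// show that the fulltext match term kept 'bbb' but dropped the two-letter
// token 'aa'.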