From d8bbc7858622b6d9c278469aab701ca0b609cddf Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:35:49 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/rust/suggest/src/benchmarks/README.md | 28 + third_party/rust/suggest/src/benchmarks/client.rs | 97 ++ third_party/rust/suggest/src/benchmarks/ingest.rs | 116 ++ third_party/rust/suggest/src/benchmarks/mod.rs | 40 + .../rust/suggest/src/bin/debug_ingestion_sizes.rs | 9 + third_party/rust/suggest/src/config.rs | 5 + third_party/rust/suggest/src/db.rs | 720 +++++++------ third_party/rust/suggest/src/lib.rs | 4 +- third_party/rust/suggest/src/pocket.rs | 2 +- third_party/rust/suggest/src/provider.rs | 48 + third_party/rust/suggest/src/rs.rs | 223 +++- third_party/rust/suggest/src/schema.rs | 187 +++- third_party/rust/suggest/src/store.rs | 1113 ++++++++++++++++++-- third_party/rust/suggest/src/suggest.udl | 11 + third_party/rust/suggest/src/suggestion.rs | 34 + third_party/rust/suggest/src/yelp.rs | 28 +- 16 files changed, 2207 insertions(+), 458 deletions(-) create mode 100644 third_party/rust/suggest/src/benchmarks/README.md create mode 100644 third_party/rust/suggest/src/benchmarks/client.rs create mode 100644 third_party/rust/suggest/src/benchmarks/ingest.rs create mode 100644 third_party/rust/suggest/src/benchmarks/mod.rs create mode 100644 third_party/rust/suggest/src/bin/debug_ingestion_sizes.rs (limited to 'third_party/rust/suggest/src') diff --git a/third_party/rust/suggest/src/benchmarks/README.md b/third_party/rust/suggest/src/benchmarks/README.md new file mode 100644 index 0000000000..45150d8413 --- /dev/null +++ b/third_party/rust/suggest/src/benchmarks/README.md @@ -0,0 +1,28 @@ +# Suggest benchmarking code + +Use `cargo suggest-bench` to run these benchmarks. + +The main benchmarking code lives here, while the criterion integration code lives in the `benches/` +directory. + +## Benchmarks + +### ingest-[provider-type] + +Time it takes to ingest all suggestions for a provider type on an empty database. +The bechmark downloads network resources in advance in order to exclude the network request time +from these measurements. + +### Benchmarks it would be nice to have + +- Ingestion with synthetic data. This would isolate the benchmark from changes to the RS database. +- Fetching suggestions + +## cargo suggest-debug-ingestion-sizes + +Run this to get row counts for all database tables. This can be very useful for improving +benchmarks, since targeting the tables with the largest number of rows will usually lead to the +largest improvements. + +The command also prints out the size of all remote-settings attachments, which can be good to +optimize on its own since it represents the amount of data user's need to download. diff --git a/third_party/rust/suggest/src/benchmarks/client.rs b/third_party/rust/suggest/src/benchmarks/client.rs new file mode 100644 index 0000000000..f5a21fd9cc --- /dev/null +++ b/third_party/rust/suggest/src/benchmarks/client.rs @@ -0,0 +1,97 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use crate::{rs::SuggestRemoteSettingsClient, Result}; +use parking_lot::Mutex; +use remote_settings::{Client, GetItemsOptions, RemoteSettingsConfig, RemoteSettingsResponse}; +use std::collections::HashMap; + +/// Remotes settings client that runs during the benchmark warm-up phase. +/// +/// This should be used to run a full ingestion. +/// Then it can be converted into a [RemoteSettingsBenchmarkClient], which allows benchmark code to exclude the network request time. +/// [RemoteSettingsBenchmarkClient] implements [SuggestRemoteSettingsClient] by getting data from a HashMap rather than hitting the network. +pub struct RemoteSettingsWarmUpClient { + client: Client, + pub get_records_responses: Mutex>, + pub get_attachment_responses: Mutex>>, +} + +impl RemoteSettingsWarmUpClient { + pub fn new() -> Self { + Self { + client: Client::new(RemoteSettingsConfig { + server_url: None, + bucket_name: None, + collection_name: crate::rs::REMOTE_SETTINGS_COLLECTION.into(), + }) + .unwrap(), + get_records_responses: Mutex::new(HashMap::new()), + get_attachment_responses: Mutex::new(HashMap::new()), + } + } +} + +impl Default for RemoteSettingsWarmUpClient { + fn default() -> Self { + Self::new() + } +} + +impl SuggestRemoteSettingsClient for RemoteSettingsWarmUpClient { + fn get_records_with_options( + &self, + options: &GetItemsOptions, + ) -> Result { + let response = self.client.get_records_with_options(options)?; + self.get_records_responses + .lock() + .insert(options.clone(), response.clone()); + Ok(response) + } + + fn get_attachment(&self, location: &str) -> Result> { + let response = self.client.get_attachment(location)?; + self.get_attachment_responses + .lock() + .insert(location.to_string(), response.clone()); + Ok(response) + } +} + +#[derive(Clone)] +pub struct RemoteSettingsBenchmarkClient { + pub get_records_responses: HashMap, + pub get_attachment_responses: HashMap>, +} + +impl SuggestRemoteSettingsClient for RemoteSettingsBenchmarkClient { + fn get_records_with_options( + &self, + options: &GetItemsOptions, + ) -> Result { + Ok(self + .get_records_responses + .get(options) + .unwrap_or_else(|| panic!("options not found: {options:?}")) + .clone()) + } + + fn get_attachment(&self, location: &str) -> Result> { + Ok(self + .get_attachment_responses + .get(location) + .unwrap_or_else(|| panic!("location not found: {location:?}")) + .clone()) + } +} + +impl From for RemoteSettingsBenchmarkClient { + fn from(warm_up_client: RemoteSettingsWarmUpClient) -> Self { + Self { + get_records_responses: warm_up_client.get_records_responses.into_inner(), + get_attachment_responses: warm_up_client.get_attachment_responses.into_inner(), + } + } +} diff --git a/third_party/rust/suggest/src/benchmarks/ingest.rs b/third_party/rust/suggest/src/benchmarks/ingest.rs new file mode 100644 index 0000000000..bbefc6a00a --- /dev/null +++ b/third_party/rust/suggest/src/benchmarks/ingest.rs @@ -0,0 +1,116 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use crate::{ + benchmarks::{ + client::{RemoteSettingsBenchmarkClient, RemoteSettingsWarmUpClient}, + BenchmarkWithInput, + }, + rs::SuggestRecordType, + store::SuggestStoreInner, + SuggestIngestionConstraints, +}; +use std::sync::atomic::{AtomicU32, Ordering}; + +static DB_FILE_COUNTER: AtomicU32 = AtomicU32::new(0); + +pub struct IngestBenchmark { + temp_dir: tempfile::TempDir, + client: RemoteSettingsBenchmarkClient, + record_type: SuggestRecordType, +} + +impl IngestBenchmark { + pub fn new(record_type: SuggestRecordType) -> Self { + let temp_dir = tempfile::tempdir().unwrap(); + let store = SuggestStoreInner::new( + temp_dir.path().join("warmup.sqlite"), + RemoteSettingsWarmUpClient::new(), + ); + store.benchmark_ingest_records_by_type(record_type); + Self { + client: RemoteSettingsBenchmarkClient::from(store.into_settings_client()), + temp_dir, + record_type, + } + } +} + +// The input for each benchmark is `SuggestStoreInner` with a fresh database. +// +// This is wrapped in a newtype so that it can be exposed in the public trait +pub struct InputType(SuggestStoreInner); + +impl BenchmarkWithInput for IngestBenchmark { + type Input = InputType; + + fn generate_input(&self) -> Self::Input { + let data_path = self.temp_dir.path().join(format!( + "db{}.sqlite", + DB_FILE_COUNTER.fetch_add(1, Ordering::Relaxed) + )); + let store = SuggestStoreInner::new(data_path, self.client.clone()); + store.ensure_db_initialized(); + InputType(store) + } + + fn benchmarked_code(&self, input: Self::Input) { + let InputType(store) = input; + store.benchmark_ingest_records_by_type(self.record_type); + } +} + +/// Get IngestBenchmark instances for all record types +pub fn all_benchmarks() -> Vec<(&'static str, IngestBenchmark)> { + vec![ + ("icon", IngestBenchmark::new(SuggestRecordType::Icon)), + ( + "amp-wikipedia", + IngestBenchmark::new(SuggestRecordType::AmpWikipedia), + ), + ("amo", IngestBenchmark::new(SuggestRecordType::Amo)), + ("pocket", IngestBenchmark::new(SuggestRecordType::Pocket)), + ("yelp", IngestBenchmark::new(SuggestRecordType::Yelp)), + ("mdn", IngestBenchmark::new(SuggestRecordType::Mdn)), + ("weather", IngestBenchmark::new(SuggestRecordType::Weather)), + ( + "global-config", + IngestBenchmark::new(SuggestRecordType::GlobalConfig), + ), + ( + "amp-mobile", + IngestBenchmark::new(SuggestRecordType::AmpMobile), + ), + ] +} + +pub fn print_debug_ingestion_sizes() { + viaduct_reqwest::use_reqwest_backend(); + let store = SuggestStoreInner::new( + "file:debug_ingestion_sizes?mode=memory&cache=shared", + RemoteSettingsWarmUpClient::new(), + ); + store + .ingest(SuggestIngestionConstraints::default()) + .unwrap(); + let table_row_counts = store.table_row_counts(); + let client = store.into_settings_client(); + let total_attachment_size: usize = client + .get_attachment_responses + .lock() + .values() + .map(|data| data.len()) + .sum(); + + println!( + "Total attachment size: {}kb", + (total_attachment_size + 500) / 1000 + ); + println!(); + println!("Database table row counts"); + println!("-------------------------"); + for (name, count) in table_row_counts { + println!("{name:30}: {count}"); + } +} diff --git a/third_party/rust/suggest/src/benchmarks/mod.rs b/third_party/rust/suggest/src/benchmarks/mod.rs new file mode 100644 index 0000000000..eb3b2e8abe --- /dev/null +++ b/third_party/rust/suggest/src/benchmarks/mod.rs @@ -0,0 +1,40 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! Benchmarking support +//! +//! Benchmarks are split up into two parts: the functions to be benchmarked live here, which the benchmarking code itself lives in `benches/bench.rs`. +//! It's easier to write benchmarking code inside the main crate, where we have access to private items. +//! However, it's easier to integrate with Cargo and criterion if benchmarks live in a separate crate. +//! +//! All benchmarks are defined as structs that implement either the [Benchmark] or [BenchmarkWithInput] + +pub mod client; +pub mod ingest; + +/// Trait for simple benchmarks +/// +/// This supports simple benchmarks that don't require any input. Note: global setup can be done +/// in the `new()` method for the struct. +pub trait Benchmark { + /// Perform the operations that we're benchmarking. + fn benchmarked_code(&self); +} + +/// Trait for benchmarks that require input +/// +/// This will run using Criterion's `iter_batched` function. Criterion will create a batch of +/// inputs, then pass each one to benchmark. +/// +/// This supports simple benchmarks that don't require any input. Note: global setup can be done +/// in the `new()` method for the struct. +pub trait BenchmarkWithInput { + type Input; + + /// Generate the input (this is not included in the benchmark time) + fn generate_input(&self) -> Self::Input; + + /// Perform the operations that we're benchmarking. + fn benchmarked_code(&self, input: Self::Input); +} diff --git a/third_party/rust/suggest/src/bin/debug_ingestion_sizes.rs b/third_party/rust/suggest/src/bin/debug_ingestion_sizes.rs new file mode 100644 index 0000000000..14ca3d9462 --- /dev/null +++ b/third_party/rust/suggest/src/bin/debug_ingestion_sizes.rs @@ -0,0 +1,9 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use suggest::benchmarks::ingest; + +fn main() { + ingest::print_debug_ingestion_sizes() +} diff --git a/third_party/rust/suggest/src/config.rs b/third_party/rust/suggest/src/config.rs index fcb3c2e256..532ac5b504 100644 --- a/third_party/rust/suggest/src/config.rs +++ b/third_party/rust/suggest/src/config.rs @@ -1,3 +1,8 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + use serde::{Deserialize, Serialize}; use crate::rs::{DownloadedGlobalConfig, DownloadedWeatherData}; diff --git a/third_party/rust/suggest/src/db.rs b/third_party/rust/suggest/src/db.rs index 07fc3ab4a2..0412c50d8f 100644 --- a/third_party/rust/suggest/src/db.rs +++ b/third_party/rust/suggest/src/db.rs @@ -23,17 +23,17 @@ use crate::{ rs::{ DownloadedAmoSuggestion, DownloadedAmpSuggestion, DownloadedAmpWikipediaSuggestion, DownloadedMdnSuggestion, DownloadedPocketSuggestion, DownloadedWeatherData, - SuggestRecordId, + DownloadedWikipediaSuggestion, SuggestRecordId, }, - schema::{SuggestConnectionInitializer, VERSION}, + schema::{clear_database, SuggestConnectionInitializer, VERSION}, store::{UnparsableRecord, UnparsableRecords}, suggestion::{cook_raw_suggestion_url, AmpSuggestionType, Suggestion}, Result, SuggestionQuery, }; -/// The metadata key whose value is the timestamp of the last record ingested -/// from the Suggest Remote Settings collection. -pub const LAST_INGEST_META_KEY: &str = "last_quicksuggest_ingest"; +/// The metadata key prefix for the last ingested unparsable record. These are +/// records that were not parsed properly, or were not of the "approved" types. +pub const LAST_INGEST_META_UNPARSABLE: &str = "last_quicksuggest_ingest_unparsable"; /// The metadata key whose value keeps track of records of suggestions /// that aren't parsable and which schema version it was first seen in. pub const UNPARSABLE_RECORDS_META_KEY: &str = "unparsable_records"; @@ -148,20 +148,28 @@ impl<'a> SuggestDao<'a> { self.put_unparsable_record_id(&record_id)?; // Advance the last fetch time, so that we can resume // fetching after this record if we're interrupted. - self.put_last_ingest_if_newer(record.last_modified) + self.put_last_ingest_if_newer(LAST_INGEST_META_UNPARSABLE, record.last_modified) } - pub fn handle_ingested_record(&mut self, record: &RemoteSettingsRecord) -> Result<()> { + pub fn handle_ingested_record( + &mut self, + last_ingest_key: &str, + record: &RemoteSettingsRecord, + ) -> Result<()> { let record_id = SuggestRecordId::from(&record.id); // Remove this record's ID from the list of unparsable // records, since we understand it now. self.drop_unparsable_record_id(&record_id)?; // Advance the last fetch time, so that we can resume // fetching after this record if we're interrupted. - self.put_last_ingest_if_newer(record.last_modified) + self.put_last_ingest_if_newer(last_ingest_key, record.last_modified) } - pub fn handle_deleted_record(&mut self, record: &RemoteSettingsRecord) -> Result<()> { + pub fn handle_deleted_record( + &mut self, + last_ingest_key: &str, + record: &RemoteSettingsRecord, + ) -> Result<()> { let record_id = SuggestRecordId::from(&record.id); // Drop either the icon or suggestions, records only contain one or the other match record_id.as_icon_id() { @@ -173,7 +181,7 @@ impl<'a> SuggestDao<'a> { self.drop_unparsable_record_id(&record_id)?; // Advance the last fetch time, so that we can resume // fetching after this record if we're interrupted. - self.put_last_ingest_if_newer(record.last_modified) + self.put_last_ingest_if_newer(last_ingest_key, record.last_modified) } // =============== Low level API =============== @@ -231,15 +239,20 @@ impl<'a> SuggestDao<'a> { s.title, s.url, s.provider, - s.score + s.score, + fk.full_keyword FROM suggestions s JOIN keywords k ON k.suggestion_id = s.id + LEFT JOIN + full_keywords fk + ON k.full_keyword_id = fk.id WHERE s.provider = :provider AND k.keyword = :keyword + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) "#, named_params! { ":keyword": keyword_lowercased, @@ -248,8 +261,9 @@ impl<'a> SuggestDao<'a> { |row| -> Result { let suggestion_id: i64 = row.get("id")?; let title = row.get("title")?; - let raw_url = row.get::<_, String>("url")?; - let score = row.get::<_, f64>("score")?; + let raw_url: String = row.get("url")?; + let score: f64 = row.get("score")?; + let full_keyword_from_db: Option = row.get("full_keyword")?; let keywords: Vec = self.conn.query_rows_and_then_cached( r#" @@ -277,9 +291,12 @@ impl<'a> SuggestDao<'a> { amp.iab_category, amp.impression_url, amp.click_url, - (SELECT i.data FROM icons i WHERE i.id = amp.icon_id) AS icon + i.data AS icon, + i.mimetype AS icon_mimetype FROM amp_custom_details amp + LEFT JOIN + icons i ON amp.icon_id = i.id WHERE amp.suggestion_id = :suggestion_id "#, @@ -298,8 +315,10 @@ impl<'a> SuggestDao<'a> { title, url: cooked_url, raw_url, - full_keyword: full_keyword(keyword_lowercased, &keywords), + full_keyword: full_keyword_from_db + .unwrap_or_else(|| full_keyword(keyword_lowercased, &keywords)), icon: row.get("icon")?, + icon_mimetype: row.get("icon_mimetype")?, impression_url: row.get("impression_url")?, click_url: cooked_click_url, raw_click_url, @@ -330,6 +349,7 @@ impl<'a> SuggestDao<'a> { WHERE s.provider = :provider AND k.keyword = :keyword + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) "#, named_params! { ":keyword": keyword_lowercased, @@ -350,36 +370,51 @@ impl<'a> SuggestDao<'a> { }, |row| row.get(0), )?; - let icon = self.conn.try_query_one( - "SELECT i.data + let (icon, icon_mimetype) = self + .conn + .try_query_row( + "SELECT i.data, i.mimetype FROM icons i JOIN wikipedia_custom_details s ON s.icon_id = i.id - WHERE s.suggestion_id = :suggestion_id", - named_params! { - ":suggestion_id": suggestion_id - }, - true, - )?; + WHERE s.suggestion_id = :suggestion_id + LIMIT 1", + named_params! { + ":suggestion_id": suggestion_id + }, + |row| -> Result<_> { + Ok(( + row.get::<_, Option>>(0)?, + row.get::<_, Option>(1)?, + )) + }, + true, + )? + .unwrap_or((None, None)); + Ok(Suggestion::Wikipedia { title, url: raw_url, full_keyword: full_keyword(keyword_lowercased, &keywords), icon, + icon_mimetype, }) }, )?; Ok(suggestions) } - /// Fetches Suggestions of type Amo provider that match the given query - pub fn fetch_amo_suggestions(&self, query: &SuggestionQuery) -> Result> { + /// Query for suggestions using the keyword prefix and provider + fn map_prefix_keywords( + &self, + query: &SuggestionQuery, + provider: &SuggestionProvider, + mut mapper: impl FnMut(&rusqlite::Row, &str) -> Result, + ) -> Result> { let keyword_lowercased = &query.keyword.to_lowercase(); let (keyword_prefix, keyword_suffix) = split_keyword(keyword_lowercased); - let suggestions_limit = &query.limit.unwrap_or(-1); - let suggestions = self - .conn - .query_rows_and_then_cached( - r#" + let suggestions_limit = query.limit.unwrap_or(-1); + self.conn.query_rows_and_then_cached( + r#" SELECT s.id, MAX(k.rank) AS rank, @@ -397,6 +432,7 @@ impl<'a> SuggestDao<'a> { k.keyword_prefix = :keyword_prefix AND (k.keyword_suffix BETWEEN :keyword_suffix AND :keyword_suffix || x'FFFF') AND s.provider = :provider + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) GROUP BY s.id ORDER BY @@ -405,13 +441,23 @@ impl<'a> SuggestDao<'a> { LIMIT :suggestions_limit "#, - named_params! { - ":keyword_prefix": keyword_prefix, - ":keyword_suffix": keyword_suffix, - ":provider": SuggestionProvider::Amo, - ":suggestions_limit": suggestions_limit, - }, - |row| -> Result> { + &[ + (":keyword_prefix", &keyword_prefix as &dyn ToSql), + (":keyword_suffix", &keyword_suffix as &dyn ToSql), + (":provider", provider as &dyn ToSql), + (":suggestions_limit", &suggestions_limit as &dyn ToSql), + ], + |row| mapper(row, keyword_suffix), + ) + } + + /// Fetches Suggestions of type Amo provider that match the given query + pub fn fetch_amo_suggestions(&self, query: &SuggestionQuery) -> Result> { + let suggestions = self + .map_prefix_keywords( + query, + &SuggestionProvider::Amo, + |row, keyword_suffix| -> Result> { let suggestion_id: i64 = row.get("id")?; let title = row.get("title")?; let raw_url = row.get::<_, String>("url")?; @@ -486,6 +532,7 @@ impl<'a> SuggestDao<'a> { k.keyword_prefix = :keyword_prefix AND (k.keyword_suffix BETWEEN :keyword_suffix AND :keyword_suffix || x'FFFF') AND s.provider = :provider + AND NOT EXISTS (SELECT 1 FROM dismissed_suggestions WHERE url=s.url) GROUP BY s.id, k.confidence @@ -534,45 +581,11 @@ impl<'a> SuggestDao<'a> { /// Fetches suggestions for MDN pub fn fetch_mdn_suggestions(&self, query: &SuggestionQuery) -> Result> { - let keyword_lowercased = &query.keyword.to_lowercase(); - let (keyword_prefix, keyword_suffix) = split_keyword(keyword_lowercased); - let suggestions_limit = &query.limit.unwrap_or(-1); let suggestions = self - .conn - .query_rows_and_then_cached( - r#" - SELECT - s.id, - MAX(k.rank) AS rank, - s.title, - s.url, - s.provider, - s.score, - k.keyword_suffix - FROM - suggestions s - JOIN - prefix_keywords k - ON k.suggestion_id = s.id - WHERE - k.keyword_prefix = :keyword_prefix - AND (k.keyword_suffix BETWEEN :keyword_suffix AND :keyword_suffix || x'FFFF') - AND s.provider = :provider - GROUP BY - s.id - ORDER BY - s.score DESC, - rank DESC - LIMIT - :suggestions_limit - "#, - named_params! { - ":keyword_prefix": keyword_prefix, - ":keyword_suffix": keyword_suffix, - ":provider": SuggestionProvider::Mdn, - ":suggestions_limit": suggestions_limit, - }, - |row| -> Result> { + .map_prefix_keywords( + query, + &SuggestionProvider::Mdn, + |row, keyword_suffix| -> Result> { let suggestion_id: i64 = row.get("id")?; let title = row.get("title")?; let raw_url = row.get::<_, String>("url")?; @@ -657,35 +670,15 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, suggestions: &[DownloadedAmoSuggestion], ) -> Result<()> { + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions( - record_id, - provider, - title, - url, - score - ) - VALUES( - :record_id, - {}, - :title, - :url, - :score - ) - RETURNING id", - SuggestionProvider::Amo as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":title": suggestion.title, - ":url": suggestion.url, - ":score": suggestion.score, - }, - |row| row.get(0), - true, + let suggestion_id = suggestion_insert.execute( + record_id, + &suggestion.title, + &suggestion.url, + suggestion.score, + SuggestionProvider::Amo, )?; self.conn.execute( "INSERT INTO amo_custom_details( @@ -747,105 +740,48 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, suggestions: &[DownloadedAmpWikipediaSuggestion], ) -> Result<()> { + // Prepare statements outside of the loop. This results in a large performance + // improvement on a fresh ingest, since there are so many rows. + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; + let mut amp_insert = AmpInsertStatement::new(self.conn)?; + let mut wiki_insert = WikipediaInsertStatement::new(self.conn)?; + let mut keyword_insert = KeywordInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; let common_details = suggestion.common_details(); let provider = suggestion.provider(); - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions( - record_id, - provider, - title, - url, - score - ) - VALUES( - :record_id, - {}, - :title, - :url, - :score - ) - RETURNING id", - provider as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":title": common_details.title, - ":url": common_details.url, - ":score": common_details.score.unwrap_or(DEFAULT_SUGGESTION_SCORE) - }, - |row| row.get(0), - true, + let suggestion_id = suggestion_insert.execute( + record_id, + &common_details.title, + &common_details.url, + common_details.score.unwrap_or(DEFAULT_SUGGESTION_SCORE), + provider, )?; match suggestion { DownloadedAmpWikipediaSuggestion::Amp(amp) => { - self.conn.execute( - "INSERT INTO amp_custom_details( - suggestion_id, - advertiser, - block_id, - iab_category, - impression_url, - click_url, - icon_id - ) - VALUES( - :suggestion_id, - :advertiser, - :block_id, - :iab_category, - :impression_url, - :click_url, - :icon_id - )", - named_params! { - ":suggestion_id": suggestion_id, - ":advertiser": amp.advertiser, - ":block_id": amp.block_id, - ":iab_category": amp.iab_category, - ":impression_url": amp.impression_url, - ":click_url": amp.click_url, - ":icon_id": amp.icon_id, - }, - )?; + amp_insert.execute(suggestion_id, amp)?; } DownloadedAmpWikipediaSuggestion::Wikipedia(wikipedia) => { - self.conn.execute( - "INSERT INTO wikipedia_custom_details( - suggestion_id, - icon_id - ) - VALUES( - :suggestion_id, - :icon_id - )", - named_params! { - ":suggestion_id": suggestion_id, - ":icon_id": wikipedia.icon_id, - }, - )?; + wiki_insert.execute(suggestion_id, wikipedia)?; } } - for (index, keyword) in common_details.keywords.iter().enumerate() { - self.conn.execute( - "INSERT INTO keywords( - keyword, - suggestion_id, - rank - ) - VALUES( - :keyword, - :suggestion_id, - :rank - )", - named_params! { - ":keyword": keyword, - ":rank": index, - ":suggestion_id": suggestion_id, - }, + let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id); + for keyword in common_details.keywords() { + let full_keyword_id = match (suggestion, keyword.full_keyword) { + // Try to associate full keyword data. Only do this for AMP, we decided to + // skip it for Wikipedia in https://bugzilla.mozilla.org/show_bug.cgi?id=1876217 + (DownloadedAmpWikipediaSuggestion::Amp(_), Some(full_keyword)) => { + Some(full_keyword_inserter.maybe_insert(full_keyword)?) + } + _ => None, + }; + + keyword_insert.execute( + suggestion_id, + keyword.keyword, + full_keyword_id, + keyword.rank, )?; } } @@ -859,84 +795,32 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, suggestions: &[DownloadedAmpSuggestion], ) -> Result<()> { + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; + let mut amp_insert = AmpInsertStatement::new(self.conn)?; + let mut keyword_insert = KeywordInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; let common_details = &suggestion.common_details; - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions( - record_id, - provider, - title, - url, - score - ) - VALUES( - :record_id, - {}, - :title, - :url, - :score - ) - RETURNING id", - SuggestionProvider::AmpMobile as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":title": common_details.title, - ":url": common_details.url, - ":score": common_details.score.unwrap_or(DEFAULT_SUGGESTION_SCORE) - }, - |row| row.get(0), - true, - )?; - self.conn.execute( - "INSERT INTO amp_custom_details( - suggestion_id, - advertiser, - block_id, - iab_category, - impression_url, - click_url, - icon_id - ) - VALUES( - :suggestion_id, - :advertiser, - :block_id, - :iab_category, - :impression_url, - :click_url, - :icon_id - )", - named_params! { - ":suggestion_id": suggestion_id, - ":advertiser": suggestion.advertiser, - ":block_id": suggestion.block_id, - ":iab_category": suggestion.iab_category, - ":impression_url": suggestion.impression_url, - ":click_url": suggestion.click_url, - ":icon_id": suggestion.icon_id, - }, + let suggestion_id = suggestion_insert.execute( + record_id, + &common_details.title, + &common_details.url, + common_details.score.unwrap_or(DEFAULT_SUGGESTION_SCORE), + SuggestionProvider::AmpMobile, )?; + amp_insert.execute(suggestion_id, suggestion)?; - for (index, keyword) in common_details.keywords.iter().enumerate() { - self.conn.execute( - "INSERT INTO keywords( - keyword, - suggestion_id, - rank - ) - VALUES( - :keyword, - :suggestion_id, - :rank - )", - named_params! { - ":keyword": keyword, - ":rank": index, - ":suggestion_id": suggestion_id, - }, + let mut full_keyword_inserter = FullKeywordInserter::new(self.conn, suggestion_id); + for keyword in common_details.keywords() { + let full_keyword_id = keyword + .full_keyword + .map(|full_keyword| full_keyword_inserter.maybe_insert(full_keyword)) + .transpose()?; + keyword_insert.execute( + suggestion_id, + keyword.keyword, + full_keyword_id, + keyword.rank, )?; } } @@ -950,37 +834,16 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, suggestions: &[DownloadedPocketSuggestion], ) -> Result<()> { + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions( - record_id, - provider, - title, - url, - score - ) - VALUES( - :record_id, - {}, - :title, - :url, - :score - ) - RETURNING id", - SuggestionProvider::Pocket as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":title": suggestion.title, - ":url": suggestion.url, - ":score": suggestion.score, - }, - |row| row.get(0), - true, + let suggestion_id = suggestion_insert.execute( + record_id, + &suggestion.title, + &suggestion.url, + suggestion.score, + SuggestionProvider::Pocket, )?; - for ((rank, keyword), confidence) in suggestion .high_confidence_keywords .iter() @@ -1030,35 +893,15 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, suggestions: &[DownloadedMdnSuggestion], ) -> Result<()> { + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; for suggestion in suggestions { self.scope.err_if_interrupted()?; - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions( - record_id, - provider, - title, - url, - score - ) - VALUES( - :record_id, - {}, - :title, - :url, - :score - ) - RETURNING id", - SuggestionProvider::Mdn as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":title": suggestion.title, - ":url": suggestion.url, - ":score": suggestion.score, - }, - |row| row.get(0), - true, + let suggestion_id = suggestion_insert.execute( + record_id, + &suggestion.title, + &suggestion.url, + suggestion.score, + SuggestionProvider::Mdn, )?; self.conn.execute_cached( "INSERT INTO mdn_custom_details( @@ -1107,20 +950,14 @@ impl<'a> SuggestDao<'a> { record_id: &SuggestRecordId, data: &DownloadedWeatherData, ) -> Result<()> { + let mut suggestion_insert = SuggestionInsertStatement::new(self.conn)?; self.scope.err_if_interrupted()?; - let suggestion_id: i64 = self.conn.query_row_and_then_cachable( - &format!( - "INSERT INTO suggestions(record_id, provider, title, url, score) - VALUES(:record_id, {}, '', '', :score) - RETURNING id", - SuggestionProvider::Weather as u8 - ), - named_params! { - ":record_id": record_id.as_str(), - ":score": data.weather.score.unwrap_or(DEFAULT_SUGGESTION_SCORE), - }, - |row| row.get(0), - true, + let suggestion_id = suggestion_insert.execute( + record_id, + "", + "", + data.weather.score.unwrap_or(DEFAULT_SUGGESTION_SCORE), + SuggestionProvider::Weather, )?; for (index, keyword) in data.weather.keywords.iter().enumerate() { self.conn.execute( @@ -1141,24 +978,43 @@ impl<'a> SuggestDao<'a> { } /// Inserts or replaces an icon for a suggestion into the database. - pub fn put_icon(&mut self, icon_id: &str, data: &[u8]) -> Result<()> { + pub fn put_icon(&mut self, icon_id: &str, data: &[u8], mimetype: &str) -> Result<()> { self.conn.execute( "INSERT OR REPLACE INTO icons( id, - data + data, + mimetype ) VALUES( :id, - :data + :data, + :mimetype )", named_params! { ":id": icon_id, ":data": data, + ":mimetype": mimetype, }, )?; Ok(()) } + pub fn insert_dismissal(&self, url: &str) -> Result<()> { + self.conn.execute( + "INSERT OR IGNORE INTO dismissed_suggestions(url) + VALUES(:url)", + named_params! { + ":url": url, + }, + )?; + Ok(()) + } + + pub fn clear_dismissals(&self) -> Result<()> { + self.conn.execute("DELETE FROM dismissed_suggestions", ())?; + Ok(()) + } + /// Deletes all suggestions associated with a Remote Settings record from /// the database. pub fn drop_suggestions(&mut self, record_id: &SuggestRecordId) -> Result<()> { @@ -1196,12 +1052,7 @@ impl<'a> SuggestDao<'a> { /// Clears the database, removing all suggestions, icons, and metadata. pub fn clear(&mut self) -> Result<()> { - self.conn.execute_batch( - "DELETE FROM suggestions; - DELETE FROM icons; - DELETE FROM meta;", - )?; - Ok(()) + Ok(clear_database(self.conn)?) } /// Returns the value associated with a metadata key. @@ -1224,12 +1075,14 @@ impl<'a> SuggestDao<'a> { /// Updates the last ingest timestamp if the given last modified time is /// newer than the existing one recorded. - pub fn put_last_ingest_if_newer(&mut self, record_last_modified: u64) -> Result<()> { - let last_ingest = self - .get_meta::(LAST_INGEST_META_KEY)? - .unwrap_or_default(); + pub fn put_last_ingest_if_newer( + &mut self, + last_ingest_key: &str, + record_last_modified: u64, + ) -> Result<()> { + let last_ingest = self.get_meta::(last_ingest_key)?.unwrap_or_default(); if record_last_modified > last_ingest { - self.put_meta(LAST_INGEST_META_KEY, record_last_modified)?; + self.put_meta(last_ingest_key, record_last_modified)?; } Ok(()) @@ -1310,6 +1163,185 @@ impl<'a> SuggestDao<'a> { } } +/// Helper struct to get full_keyword_ids for a suggestion +/// +/// `FullKeywordInserter` handles repeated full keywords efficiently. The first instance will +/// cause a row to be inserted into the database. Subsequent instances will return the same +/// full_keyword_id. +struct FullKeywordInserter<'a> { + conn: &'a Connection, + suggestion_id: i64, + last_inserted: Option<(&'a str, i64)>, +} + +impl<'a> FullKeywordInserter<'a> { + fn new(conn: &'a Connection, suggestion_id: i64) -> Self { + Self { + conn, + suggestion_id, + last_inserted: None, + } + } + + fn maybe_insert(&mut self, full_keyword: &'a str) -> rusqlite::Result { + match self.last_inserted { + Some((s, id)) if s == full_keyword => Ok(id), + _ => { + let full_keyword_id = self.conn.query_row_and_then( + "INSERT INTO full_keywords( + suggestion_id, + full_keyword + ) + VALUES( + :suggestion_id, + :keyword + ) + RETURNING id", + named_params! { + ":keyword": full_keyword, + ":suggestion_id": self.suggestion_id, + }, + |row| row.get(0), + )?; + self.last_inserted = Some((full_keyword, full_keyword_id)); + Ok(full_keyword_id) + } + } + } +} + +// ======================== Statement types ======================== +// +// During ingestion we can insert hundreds of thousands of rows. These types enable speedups by +// allowing us to prepare a statement outside a loop and use it many times inside the loop. +// +// Each type wraps [Connection::prepare] and [Statement] to provide a simplified interface, +// tailored to a specific query. +// +// This pattern is applicable for whenever we execute the same query repeatedly in a loop. +// The impact scales with the number of loop iterations, which is why we currently don't do this +// for providers like Mdn, Pocket, and Weather, which have relatively small number of records +// compared to Amp/Wikipedia. + +struct SuggestionInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> SuggestionInsertStatement<'conn> { + fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO suggestions( + record_id, + title, + url, + score, + provider + ) + VALUES(?, ?, ?, ?, ?) + RETURNING id", + )?)) + } + + /// Execute the insert and return the `suggestion_id` for the new row + fn execute( + &mut self, + record_id: &SuggestRecordId, + title: &str, + url: &str, + score: f64, + provider: SuggestionProvider, + ) -> Result { + Ok(self.0.query_row( + (record_id.as_str(), title, url, score, provider as u8), + |row| row.get(0), + )?) + } +} + +struct AmpInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> AmpInsertStatement<'conn> { + fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO amp_custom_details( + suggestion_id, + advertiser, + block_id, + iab_category, + impression_url, + click_url, + icon_id + ) + VALUES(?, ?, ?, ?, ?, ?, ?) + ", + )?)) + } + + fn execute(&mut self, suggestion_id: i64, amp: &DownloadedAmpSuggestion) -> Result<()> { + self.0.execute(( + suggestion_id, + &.advertiser, + amp.block_id, + &.iab_category, + &.impression_url, + &.click_url, + &.icon_id, + ))?; + Ok(()) + } +} + +struct WikipediaInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> WikipediaInsertStatement<'conn> { + fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO wikipedia_custom_details( + suggestion_id, + icon_id + ) + VALUES(?, ?) + ", + )?)) + } + + fn execute( + &mut self, + suggestion_id: i64, + wikipedia: &DownloadedWikipediaSuggestion, + ) -> Result<()> { + self.0.execute((suggestion_id, &wikipedia.icon_id))?; + Ok(()) + } +} + +struct KeywordInsertStatement<'conn>(rusqlite::Statement<'conn>); + +impl<'conn> KeywordInsertStatement<'conn> { + fn new(conn: &'conn Connection) -> Result { + Ok(Self(conn.prepare( + "INSERT INTO keywords( + suggestion_id, + keyword, + full_keyword_id, + rank + ) + VALUES(?, ?, ?, ?) + ", + )?)) + } + + fn execute( + &mut self, + suggestion_id: i64, + keyword: &str, + full_keyword_id: Option, + rank: usize, + ) -> Result<()> { + self.0 + .execute((suggestion_id, keyword, full_keyword_id, rank))?; + Ok(()) + } +} + fn provider_config_meta_key(provider: SuggestionProvider) -> String { format!("{}{}", PROVIDER_CONFIG_META_KEY_PREFIX, provider as u8) } diff --git a/third_party/rust/suggest/src/lib.rs b/third_party/rust/suggest/src/lib.rs index 23775b7dec..15746614d0 100644 --- a/third_party/rust/suggest/src/lib.rs +++ b/third_party/rust/suggest/src/lib.rs @@ -4,6 +4,8 @@ */ use remote_settings::RemoteSettingsConfig; +#[cfg(feature = "benchmark_api")] +pub mod benchmarks; mod config; mod db; mod error; @@ -26,7 +28,7 @@ pub(crate) type Result = std::result::Result; pub type SuggestApiResult = std::result::Result; /// A query for suggestions to show in the address bar. -#[derive(Debug, Default)] +#[derive(Clone, Debug, Default)] pub struct SuggestionQuery { pub keyword: String, pub providers: Vec, diff --git a/third_party/rust/suggest/src/pocket.rs b/third_party/rust/suggest/src/pocket.rs index cf7070c62a..06fd4b012d 100644 --- a/third_party/rust/suggest/src/pocket.rs +++ b/third_party/rust/suggest/src/pocket.rs @@ -12,7 +12,7 @@ use rusqlite::{Result as RusqliteResult, ToSql}; /// substring for the suffix. #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] #[repr(u8)] -pub enum KeywordConfidence { +pub(crate) enum KeywordConfidence { Low = 0, High = 1, } diff --git a/third_party/rust/suggest/src/provider.rs b/third_party/rust/suggest/src/provider.rs index 1449c35c8a..2c0b6674cb 100644 --- a/third_party/rust/suggest/src/provider.rs +++ b/third_party/rust/suggest/src/provider.rs @@ -8,6 +8,8 @@ use rusqlite::{ Result as RusqliteResult, }; +use crate::rs::SuggestRecordType; + /// A provider is a source of search suggestions. #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] #[repr(u8)] @@ -46,6 +48,52 @@ impl SuggestionProvider { _ => None, } } + + pub(crate) fn records_for_provider(&self) -> Vec { + match self { + SuggestionProvider::Amp => { + vec![ + SuggestRecordType::AmpWikipedia, + SuggestRecordType::Icon, + SuggestRecordType::GlobalConfig, + ] + } + SuggestionProvider::Wikipedia => { + vec![ + SuggestRecordType::AmpWikipedia, + SuggestRecordType::Icon, + SuggestRecordType::GlobalConfig, + ] + } + SuggestionProvider::Amo => { + vec![SuggestRecordType::Amo, SuggestRecordType::GlobalConfig] + } + SuggestionProvider::Pocket => { + vec![SuggestRecordType::Pocket, SuggestRecordType::GlobalConfig] + } + SuggestionProvider::Yelp => { + vec![ + SuggestRecordType::Yelp, + SuggestRecordType::Icon, + SuggestRecordType::GlobalConfig, + ] + } + SuggestionProvider::Mdn => { + vec![SuggestRecordType::Mdn, SuggestRecordType::GlobalConfig] + } + SuggestionProvider::Weather => { + vec![SuggestRecordType::Weather, SuggestRecordType::GlobalConfig] + } + SuggestionProvider::AmpMobile => { + vec![ + SuggestRecordType::AmpMobile, + SuggestRecordType::AmpWikipedia, + SuggestRecordType::Icon, + SuggestRecordType::GlobalConfig, + ] + } + } + } } impl ToSql for SuggestionProvider { diff --git a/third_party/rust/suggest/src/rs.rs b/third_party/rust/suggest/src/rs.rs index 198a8c43f6..4a733ece9d 100644 --- a/third_party/rust/suggest/src/rs.rs +++ b/third_party/rust/suggest/src/rs.rs @@ -31,7 +31,7 @@ //! the new suggestion in their results, and return `Suggestion::T` variants //! as needed. -use std::borrow::Cow; +use std::{borrow::Cow, fmt}; use remote_settings::{GetItemsOptions, RemoteSettingsResponse}; use serde::{Deserialize, Deserializer}; @@ -47,6 +47,20 @@ pub(crate) const REMOTE_SETTINGS_COLLECTION: &str = "quicksuggest"; /// `mozilla-services/quicksuggest-rs` repo. pub(crate) const SUGGESTIONS_PER_ATTACHMENT: u64 = 200; +/// A list of default record types to download if nothing is specified. +/// This currently defaults to all of the record types. +pub(crate) const DEFAULT_RECORDS_TYPES: [SuggestRecordType; 9] = [ + SuggestRecordType::Icon, + SuggestRecordType::AmpWikipedia, + SuggestRecordType::Amo, + SuggestRecordType::Pocket, + SuggestRecordType::Yelp, + SuggestRecordType::Mdn, + SuggestRecordType::Weather, + SuggestRecordType::GlobalConfig, + SuggestRecordType::AmpMobile, +]; + /// A trait for a client that downloads suggestions from Remote Settings. /// /// This trait lets tests use a mock client. @@ -102,6 +116,61 @@ pub(crate) enum SuggestRecord { AmpMobile, } +/// Enum for the different record types that can be consumed. +/// Extracting this from the serialization enum so that we can +/// extend it to get type metadata. +#[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord)] +pub enum SuggestRecordType { + Icon, + AmpWikipedia, + Amo, + Pocket, + Yelp, + Mdn, + Weather, + GlobalConfig, + AmpMobile, +} + +impl From for SuggestRecordType { + fn from(suggest_record: SuggestRecord) -> Self { + match suggest_record { + SuggestRecord::Amo => Self::Amo, + SuggestRecord::AmpWikipedia => Self::AmpWikipedia, + SuggestRecord::Icon => Self::Icon, + SuggestRecord::Mdn => Self::Mdn, + SuggestRecord::Pocket => Self::Pocket, + SuggestRecord::Weather(_) => Self::Weather, + SuggestRecord::Yelp => Self::Yelp, + SuggestRecord::GlobalConfig(_) => Self::GlobalConfig, + SuggestRecord::AmpMobile => Self::AmpMobile, + } + } +} + +impl fmt::Display for SuggestRecordType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Icon => write!(f, "icon"), + Self::AmpWikipedia => write!(f, "data"), + Self::Amo => write!(f, "amo-suggestions"), + Self::Pocket => write!(f, "pocket-suggestions"), + Self::Yelp => write!(f, "yelp-suggestions"), + Self::Mdn => write!(f, "mdn-suggestions"), + Self::Weather => write!(f, "weather"), + Self::GlobalConfig => write!(f, "configuration"), + Self::AmpMobile => write!(f, "amp-mobile-suggestions"), + } + } +} + +impl SuggestRecordType { + /// Return the meta key for the last ingested record. + pub fn last_ingest_meta_key(&self) -> String { + format!("last_quicksuggest_ingest_{}", self) + } +} + /// Represents either a single value, or a list of values. This is used to /// deserialize downloaded attachments. #[derive(Clone, Debug, Deserialize)] @@ -156,16 +225,18 @@ where } /// Fields that are common to all downloaded suggestions. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] pub(crate) struct DownloadedSuggestionCommonDetails { pub keywords: Vec, pub title: String, pub url: String, pub score: Option, + #[serde(default)] + pub full_keywords: Vec<(String, usize)>, } /// An AMP suggestion to ingest from an AMP-Wikipedia attachment. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] pub(crate) struct DownloadedAmpSuggestion { #[serde(flatten)] pub common_details: DownloadedSuggestionCommonDetails, @@ -180,7 +251,7 @@ pub(crate) struct DownloadedAmpSuggestion { } /// A Wikipedia suggestion to ingest from an AMP-Wikipedia attachment. -#[derive(Clone, Debug, Deserialize)] +#[derive(Clone, Debug, Default, Deserialize)] pub(crate) struct DownloadedWikipediaSuggestion { #[serde(flatten)] pub common_details: DownloadedSuggestionCommonDetails, @@ -214,6 +285,34 @@ impl DownloadedAmpWikipediaSuggestion { } } +impl DownloadedSuggestionCommonDetails { + /// Iterate over all keywords for this suggestion + pub fn keywords(&self) -> impl Iterator> { + let full_keywords = self + .full_keywords + .iter() + .flat_map(|(full_keyword, repeat_for)| { + std::iter::repeat(Some(full_keyword.as_str())).take(*repeat_for) + }) + .chain(std::iter::repeat(None)); // In case of insufficient full keywords, just fill in with infinite `None`s + // + self.keywords.iter().zip(full_keywords).enumerate().map( + move |(i, (keyword, full_keyword))| AmpKeyword { + rank: i, + keyword, + full_keyword, + }, + ) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(crate) struct AmpKeyword<'a> { + pub rank: usize, + pub keyword: &'a str, + pub full_keyword: Option<&'a str>, +} + impl<'de> Deserialize<'de> for DownloadedAmpWikipediaSuggestion { fn deserialize( deserializer: D, @@ -344,3 +443,119 @@ where { String::deserialize(deserializer).map(|s| s.parse().ok()) } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_full_keywords() { + let suggestion = DownloadedAmpWikipediaSuggestion::Amp(DownloadedAmpSuggestion { + common_details: DownloadedSuggestionCommonDetails { + keywords: vec![ + String::from("f"), + String::from("fo"), + String::from("foo"), + String::from("foo b"), + String::from("foo ba"), + String::from("foo bar"), + ], + full_keywords: vec![(String::from("foo"), 3), (String::from("foo bar"), 3)], + ..DownloadedSuggestionCommonDetails::default() + }, + ..DownloadedAmpSuggestion::default() + }); + + assert_eq!( + Vec::from_iter(suggestion.common_details().keywords()), + vec![ + AmpKeyword { + rank: 0, + keyword: "f", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 1, + keyword: "fo", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 2, + keyword: "foo", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 3, + keyword: "foo b", + full_keyword: Some("foo bar"), + }, + AmpKeyword { + rank: 4, + keyword: "foo ba", + full_keyword: Some("foo bar"), + }, + AmpKeyword { + rank: 5, + keyword: "foo bar", + full_keyword: Some("foo bar"), + }, + ], + ); + } + + #[test] + fn test_missing_full_keywords() { + let suggestion = DownloadedAmpWikipediaSuggestion::Amp(DownloadedAmpSuggestion { + common_details: DownloadedSuggestionCommonDetails { + keywords: vec![ + String::from("f"), + String::from("fo"), + String::from("foo"), + String::from("foo b"), + String::from("foo ba"), + String::from("foo bar"), + ], + // Only the first 3 keywords have full keywords associated with them + full_keywords: vec![(String::from("foo"), 3)], + ..DownloadedSuggestionCommonDetails::default() + }, + ..DownloadedAmpSuggestion::default() + }); + + assert_eq!( + Vec::from_iter(suggestion.common_details().keywords()), + vec![ + AmpKeyword { + rank: 0, + keyword: "f", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 1, + keyword: "fo", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 2, + keyword: "foo", + full_keyword: Some("foo"), + }, + AmpKeyword { + rank: 3, + keyword: "foo b", + full_keyword: None, + }, + AmpKeyword { + rank: 4, + keyword: "foo ba", + full_keyword: None, + }, + AmpKeyword { + rank: 5, + keyword: "foo bar", + full_keyword: None, + }, + ], + ); + } +} diff --git a/third_party/rust/suggest/src/schema.rs b/third_party/rust/suggest/src/schema.rs index 95d987c09e..b304363de5 100644 --- a/third_party/rust/suggest/src/schema.rs +++ b/third_party/rust/suggest/src/schema.rs @@ -13,7 +13,9 @@ use sql_support::open_database::{self, ConnectionInitializer}; /// 1. Bump this version. /// 2. Add a migration from the old version to the new version in /// [`SuggestConnectionInitializer::upgrade_from`]. -pub const VERSION: u32 = 14; +/// a. If suggestions should be re-ingested after the migration, call `clear_database()` inside +/// the migration. +pub const VERSION: u32 = 18; /// The current Suggest database schema. pub const SQL: &str = " @@ -25,10 +27,19 @@ pub const SQL: &str = " CREATE TABLE keywords( keyword TEXT NOT NULL, suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, + full_keyword_id INTEGER NULL REFERENCES full_keywords(id) ON DELETE SET NULL, rank INTEGER NOT NULL, PRIMARY KEY (keyword, suggestion_id) ) WITHOUT ROWID; + -- full keywords are what we display to the user when a (partial) keyword matches + -- The FK to suggestion_id makes it so full keywords get deleted when the parent suggestion is deleted. + CREATE TABLE full_keywords( + id INTEGER PRIMARY KEY, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, + full_keyword TEXT NOT NULL + ); + CREATE TABLE prefix_keywords( keyword_prefix TEXT NOT NULL, keyword_suffix TEXT NOT NULL DEFAULT '', @@ -79,7 +90,8 @@ pub const SQL: &str = " CREATE TABLE icons( id TEXT PRIMARY KEY, - data BLOB NOT NULL + data BLOB NOT NULL, + mimetype TEXT NOT NULL ) WITHOUT ROWID; CREATE TABLE yelp_subjects( @@ -111,6 +123,10 @@ pub const SQL: &str = " description TEXT NOT NULL, FOREIGN KEY(suggestion_id) REFERENCES suggestions(id) ON DELETE CASCADE ); + + CREATE TABLE dismissed_suggestions ( + url TEXT PRIMARY KEY + ) WITHOUT ROWID; "; /// Initializes an SQLite connection to the Suggest database, performing @@ -139,15 +155,178 @@ impl ConnectionInitializer for SuggestConnectionInitializer { Ok(db.execute_batch(SQL)?) } - fn upgrade_from(&self, _db: &Transaction<'_>, version: u32) -> open_database::Result<()> { + fn upgrade_from(&self, tx: &Transaction<'_>, version: u32) -> open_database::Result<()> { match version { - 1..=13 => { + 1..=15 => { // Treat databases with these older schema versions as corrupt, // so that they'll be replaced by a fresh, empty database with // the current schema. Err(open_database::Error::Corrupt) } + 16 => { + tx.execute( + " + CREATE TABLE dismissed_suggestions ( + url_hash INTEGER PRIMARY KEY + ) WITHOUT ROWID;", + (), + )?; + Ok(()) + } + 17 => { + tx.execute( + " + DROP TABLE dismissed_suggestions; + CREATE TABLE dismissed_suggestions ( + url TEXT PRIMARY KEY + ) WITHOUT ROWID;", + (), + )?; + Ok(()) + } _ => Err(open_database::Error::IncompatibleVersion(version)), } } } + +/// Clears the database, removing all suggestions, icons, and metadata. +pub fn clear_database(db: &Connection) -> rusqlite::Result<()> { + db.execute_batch( + " + DELETE FROM meta; + DELETE FROM suggestions; + DELETE FROM icons; + DELETE FROM yelp_subjects; + DELETE FROM yelp_modifiers; + DELETE FROM yelp_location_signs; + DELETE FROM yelp_custom_details; + ", + ) +} + +#[cfg(test)] +mod test { + use super::*; + use sql_support::open_database::test_utils::MigratedDatabaseFile; + + // Snapshot of the v16 schema. We use this to test that we can migrate from there to the + // current schema. + const V16_SCHEMA: &str = r#" + CREATE TABLE meta( + key TEXT PRIMARY KEY, + value NOT NULL + ) WITHOUT ROWID; + + CREATE TABLE keywords( + keyword TEXT NOT NULL, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, + full_keyword_id INTEGER NULL REFERENCES full_keywords(id) ON DELETE SET NULL, + rank INTEGER NOT NULL, + PRIMARY KEY (keyword, suggestion_id) + ) WITHOUT ROWID; + + -- full keywords are what we display to the user when a (partial) keyword matches + -- The FK to suggestion_id makes it so full keywords get deleted when the parent suggestion is deleted. + CREATE TABLE full_keywords( + id INTEGER PRIMARY KEY, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, + full_keyword TEXT NOT NULL + ); + + CREATE TABLE prefix_keywords( + keyword_prefix TEXT NOT NULL, + keyword_suffix TEXT NOT NULL DEFAULT '', + confidence INTEGER NOT NULL DEFAULT 0, + rank INTEGER NOT NULL, + suggestion_id INTEGER NOT NULL REFERENCES suggestions(id) ON DELETE CASCADE, + PRIMARY KEY (keyword_prefix, keyword_suffix, suggestion_id) + ) WITHOUT ROWID; + + CREATE UNIQUE INDEX keywords_suggestion_id_rank ON keywords(suggestion_id, rank); + + CREATE TABLE suggestions( + id INTEGER PRIMARY KEY, + record_id TEXT NOT NULL, + provider INTEGER NOT NULL, + title TEXT NOT NULL, + url TEXT NOT NULL, + score REAL NOT NULL + ); + + CREATE TABLE amp_custom_details( + suggestion_id INTEGER PRIMARY KEY, + advertiser TEXT NOT NULL, + block_id INTEGER NOT NULL, + iab_category TEXT NOT NULL, + impression_url TEXT NOT NULL, + click_url TEXT NOT NULL, + icon_id TEXT NOT NULL, + FOREIGN KEY(suggestion_id) REFERENCES suggestions(id) ON DELETE CASCADE + ); + + CREATE TABLE wikipedia_custom_details( + suggestion_id INTEGER PRIMARY KEY REFERENCES suggestions(id) ON DELETE CASCADE, + icon_id TEXT NOT NULL + ); + + CREATE TABLE amo_custom_details( + suggestion_id INTEGER PRIMARY KEY, + description TEXT NOT NULL, + guid TEXT NOT NULL, + icon_url TEXT NOT NULL, + rating TEXT, + number_of_ratings INTEGER NOT NULL, + FOREIGN KEY(suggestion_id) REFERENCES suggestions(id) ON DELETE CASCADE + ); + + CREATE INDEX suggestions_record_id ON suggestions(record_id); + + CREATE TABLE icons( + id TEXT PRIMARY KEY, + data BLOB NOT NULL, + mimetype TEXT NOT NULL + ) WITHOUT ROWID; + + CREATE TABLE yelp_subjects( + keyword TEXT PRIMARY KEY, + record_id TEXT NOT NULL + ) WITHOUT ROWID; + + CREATE TABLE yelp_modifiers( + type INTEGER NOT NULL, + keyword TEXT NOT NULL, + record_id TEXT NOT NULL, + PRIMARY KEY (type, keyword) + ) WITHOUT ROWID; + + CREATE TABLE yelp_location_signs( + keyword TEXT PRIMARY KEY, + need_location INTEGER NOT NULL, + record_id TEXT NOT NULL + ) WITHOUT ROWID; + + CREATE TABLE yelp_custom_details( + icon_id TEXT PRIMARY KEY, + score REAL NOT NULL, + record_id TEXT NOT NULL + ) WITHOUT ROWID; + + CREATE TABLE mdn_custom_details( + suggestion_id INTEGER PRIMARY KEY, + description TEXT NOT NULL, + FOREIGN KEY(suggestion_id) REFERENCES suggestions(id) ON DELETE CASCADE + ); + + PRAGMA user_version=16; +"#; + + /// Test running all schema upgrades from V16, which was the first schema with a "real" + /// migration. + /// + /// If an upgrade fails, then this test will fail with a panic. + #[test] + fn test_all_upgrades() { + let db_file = MigratedDatabaseFile::new(SuggestConnectionInitializer, V16_SCHEMA); + db_file.run_all_upgrades(); + } +} diff --git a/third_party/rust/suggest/src/store.rs b/third_party/rust/suggest/src/store.rs index e1f437e8c5..c55cffc7f5 100644 --- a/third_party/rust/suggest/src/store.rs +++ b/third_party/rust/suggest/src/store.rs @@ -4,7 +4,7 @@ */ use std::{ - collections::BTreeMap, + collections::{BTreeMap, BTreeSet}, path::{Path, PathBuf}, sync::Arc, }; @@ -24,13 +24,15 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize}; use crate::{ config::{SuggestGlobalConfig, SuggestProviderConfig}, db::{ - ConnectionType, SuggestDao, SuggestDb, LAST_INGEST_META_KEY, UNPARSABLE_RECORDS_META_KEY, + ConnectionType, SuggestDao, SuggestDb, LAST_INGEST_META_UNPARSABLE, + UNPARSABLE_RECORDS_META_KEY, }, error::Error, provider::SuggestionProvider, rs::{ - SuggestAttachment, SuggestRecord, SuggestRecordId, SuggestRemoteSettingsClient, - REMOTE_SETTINGS_COLLECTION, SUGGESTIONS_PER_ATTACHMENT, + SuggestAttachment, SuggestRecord, SuggestRecordId, SuggestRecordType, + SuggestRemoteSettingsClient, DEFAULT_RECORDS_TYPES, REMOTE_SETTINGS_COLLECTION, + SUGGESTIONS_PER_ATTACHMENT, }, schema::VERSION, Result, SuggestApiResult, Suggestion, SuggestionQuery, @@ -48,7 +50,6 @@ pub struct SuggestStoreBuilder(Mutex); #[derive(Default)] struct SuggestStoreBuilderInner { data_path: Option, - cache_path: Option, remote_settings_config: Option, } @@ -68,8 +69,8 @@ impl SuggestStoreBuilder { self } - pub fn cache_path(self: Arc, path: String) -> Arc { - self.0.lock().cache_path = Some(path); + pub fn cache_path(self: Arc, _path: String) -> Arc { + // We used to use this, but we're not using it anymore, just ignore the call self } @@ -85,10 +86,6 @@ impl SuggestStoreBuilder { .data_path .clone() .ok_or_else(|| Error::SuggestStoreBuilder("data_path not specified".to_owned()))?; - let cache_path = inner - .cache_path - .clone() - .ok_or_else(|| Error::SuggestStoreBuilder("cache_path not specified".to_owned()))?; let settings_client = remote_settings::Client::new(inner.remote_settings_config.clone().unwrap_or_else( || RemoteSettingsConfig { @@ -98,7 +95,7 @@ impl SuggestStoreBuilder { }, ))?; Ok(Arc::new(SuggestStore { - inner: SuggestStoreInner::new(data_path, cache_path, settings_client), + inner: SuggestStoreInner::new(data_path, settings_client), })) } } @@ -182,7 +179,7 @@ impl SuggestStore { )?) }()?; Ok(Self { - inner: SuggestStoreInner::new("".to_owned(), path.to_owned(), settings_client), + inner: SuggestStoreInner::new(path.to_owned(), settings_client), }) } @@ -192,6 +189,22 @@ impl SuggestStore { self.inner.query(query) } + /// Dismiss a suggestion + /// + /// Dismissed suggestions will not be returned again + /// + /// In the case of AMP suggestions this should be the raw URL. + #[handle_error(Error)] + pub fn dismiss_suggestion(&self, suggestion_url: String) -> SuggestApiResult<()> { + self.inner.dismiss_suggestion(suggestion_url) + } + + /// Clear dismissed suggestions + #[handle_error(Error)] + pub fn clear_dismissed_suggestions(&self) -> SuggestApiResult<()> { + self.inner.clear_dismissed_suggestions() + } + /// Interrupts any ongoing queries. /// /// This should be called when the user types new input into the address @@ -238,6 +251,7 @@ pub struct SuggestIngestionConstraints { /// Because of how suggestions are partitioned in Remote Settings, this is a /// soft limit, and the store might ingest more than requested. pub max_suggestions: Option, + pub providers: Option>, } /// The implementation of the store. This is generic over the Remote Settings @@ -250,23 +264,14 @@ pub(crate) struct SuggestStoreInner { /// It's not currently used because not all consumers pass this in yet. #[allow(unused)] data_path: PathBuf, - /// Path to the temporary SQL database. - /// - /// This stores things that should be deleted when the user clears their cache. - cache_path: PathBuf, dbs: OnceCell, settings_client: S, } impl SuggestStoreInner { - fn new( - data_path: impl Into, - cache_path: impl Into, - settings_client: S, - ) -> Self { + pub fn new(data_path: impl Into, settings_client: S) -> Self { Self { data_path: data_path.into(), - cache_path: cache_path.into(), dbs: OnceCell::new(), settings_client, } @@ -276,7 +281,7 @@ impl SuggestStoreInner { /// they're not already open. fn dbs(&self) -> Result<&SuggestStoreDbs> { self.dbs - .get_or_try_init(|| SuggestStoreDbs::open(&self.cache_path)) + .get_or_try_init(|| SuggestStoreDbs::open(&self.data_path)) } fn query(&self, query: SuggestionQuery) -> Result> { @@ -286,6 +291,17 @@ impl SuggestStoreInner { self.dbs()?.reader.read(|dao| dao.fetch_suggestions(&query)) } + fn dismiss_suggestion(&self, suggestion_url: String) -> Result<()> { + self.dbs()? + .writer + .write(|dao| dao.insert_dismissal(&suggestion_url)) + } + + fn clear_dismissed_suggestions(&self) -> Result<()> { + self.dbs()?.writer.write(|dao| dao.clear_dismissals())?; + Ok(()) + } + fn interrupt(&self) { if let Some(dbs) = self.dbs.get() { // Only interrupt if the databases are already open. @@ -315,7 +331,7 @@ impl SuggestStoreInner where S: SuggestRemoteSettingsClient, { - fn ingest(&self, constraints: SuggestIngestionConstraints) -> Result<()> { + pub fn ingest(&self, constraints: SuggestIngestionConstraints) -> Result<()> { let writer = &self.dbs()?.writer; if let Some(unparsable_records) = @@ -330,26 +346,58 @@ where for unparsable_ids in all_unparsable_ids.chunks(UNPARSABLE_IDS_PER_REQUEST) { let mut options = GetItemsOptions::new(); for unparsable_id in unparsable_ids { - options.eq("id", *unparsable_id); + options.filter_eq("id", *unparsable_id); } let records_chunk = self .settings_client .get_records_with_options(&options)? .records; - self.ingest_records(writer, &records_chunk)?; + self.ingest_records(LAST_INGEST_META_UNPARSABLE, writer, &records_chunk)?; } } + // use std::collections::BTreeSet; + let ingest_record_types = if let Some(rt) = &constraints.providers { + rt.iter() + .flat_map(|x| x.records_for_provider()) + .collect::>() + .into_iter() + .collect() + } else { + DEFAULT_RECORDS_TYPES.to_vec() + }; + + for ingest_record_type in ingest_record_types { + self.ingest_records_by_type(ingest_record_type, writer, &constraints)?; + } + + Ok(()) + } + + fn ingest_records_by_type( + &self, + ingest_record_type: SuggestRecordType, + writer: &SuggestDb, + constraints: &SuggestIngestionConstraints, + ) -> Result<()> { let mut options = GetItemsOptions::new(); + // Remote Settings returns records in descending modification order // (newest first), but we want them in ascending order (oldest first), // so that we can eventually resume downloading where we left off. options.sort("last_modified", SortOrder::Ascending); - if let Some(last_ingest) = writer.read(|dao| dao.get_meta::(LAST_INGEST_META_KEY))? { + + options.filter_eq("type", ingest_record_type.to_string()); + + // Get the last ingest value. This is the max of the last_ingest_keys + // that are in the database. + if let Some(last_ingest) = writer + .read(|dao| dao.get_meta::(ingest_record_type.last_ingest_meta_key().as_str()))? + { // Only download changes since our last ingest. If our last ingest // was interrupted, we'll pick up where we left off. - options.gt("last_modified", last_ingest.to_string()); + options.filter_gt("last_modified", last_ingest.to_string()); } if let Some(max_suggestions) = constraints.max_suggestions { @@ -363,39 +411,56 @@ where .settings_client .get_records_with_options(&options)? .records; - self.ingest_records(writer, &records)?; - + self.ingest_records(&ingest_record_type.last_ingest_meta_key(), writer, &records)?; Ok(()) } - fn ingest_records(&self, writer: &SuggestDb, records: &[RemoteSettingsRecord]) -> Result<()> { + fn ingest_records( + &self, + last_ingest_key: &str, + writer: &SuggestDb, + records: &[RemoteSettingsRecord], + ) -> Result<()> { for record in records { let record_id = SuggestRecordId::from(&record.id); if record.deleted { // If the entire record was deleted, drop all its suggestions // and advance the last ingest time. - writer.write(|dao| dao.handle_deleted_record(record))?; + writer.write(|dao| dao.handle_deleted_record(last_ingest_key, record))?; continue; } let Ok(fields) = serde_json::from_value(serde_json::Value::Object(record.fields.clone())) else { // We don't recognize this record's type, so we don't know how - // to ingest its suggestions. Record this in the meta table. + // to ingest its suggestions. Skip processing this record. writer.write(|dao| dao.handle_unparsable_record(record))?; continue; }; match fields { SuggestRecord::AmpWikipedia => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - dao.insert_amp_wikipedia_suggestions(record_id, suggestions) - })?; + self.ingest_attachment( + // TODO: Currently re-creating the last_ingest_key because using last_ingest_meta + // breaks the tests (particularly the unparsable functionality). So, keeping + // a direct reference until we remove the "unparsable" functionality. + &SuggestRecordType::AmpWikipedia.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amp_wikipedia_suggestions(record_id, suggestions) + }, + )?; } SuggestRecord::AmpMobile => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - dao.insert_amp_mobile_suggestions(record_id, suggestions) - })?; + self.ingest_attachment( + &SuggestRecordType::AmpMobile.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amp_mobile_suggestions(record_id, suggestions) + }, + )?; } SuggestRecord::Icon => { let (Some(icon_id), Some(attachment)) = @@ -404,47 +469,79 @@ where // An icon record should have an icon ID and an // attachment. Icons that don't have these are // malformed, so skip to the next record. - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; + writer.write(|dao| { + dao.put_last_ingest_if_newer( + &SuggestRecordType::Icon.last_ingest_meta_key(), + record.last_modified, + ) + })?; continue; }; let data = self.settings_client.get_attachment(&attachment.location)?; writer.write(|dao| { - dao.put_icon(icon_id, &data)?; - dao.handle_ingested_record(record) + dao.put_icon(icon_id, &data, &attachment.mimetype)?; + dao.handle_ingested_record( + &SuggestRecordType::Icon.last_ingest_meta_key(), + record, + ) })?; } SuggestRecord::Amo => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - dao.insert_amo_suggestions(record_id, suggestions) - })?; + self.ingest_attachment( + &SuggestRecordType::Amo.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| { + dao.insert_amo_suggestions(record_id, suggestions) + }, + )?; } SuggestRecord::Pocket => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - dao.insert_pocket_suggestions(record_id, suggestions) - })?; + self.ingest_attachment( + &SuggestRecordType::Pocket.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| { + dao.insert_pocket_suggestions(record_id, suggestions) + }, + )?; } SuggestRecord::Yelp => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - match suggestions.first() { + self.ingest_attachment( + &SuggestRecordType::Yelp.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| match suggestions.first() { Some(suggestion) => dao.insert_yelp_suggestions(record_id, suggestion), None => Ok(()), - } - })?; + }, + )?; } SuggestRecord::Mdn => { - self.ingest_attachment(writer, record, |dao, record_id, suggestions| { - dao.insert_mdn_suggestions(record_id, suggestions) - })?; + self.ingest_attachment( + &SuggestRecordType::Mdn.last_ingest_meta_key(), + writer, + record, + |dao, record_id, suggestions| { + dao.insert_mdn_suggestions(record_id, suggestions) + }, + )?; } SuggestRecord::Weather(data) => { - self.ingest_record(writer, record, |dao, record_id| { - dao.insert_weather_data(record_id, &data) - })?; + self.ingest_record( + &SuggestRecordType::Weather.last_ingest_meta_key(), + writer, + record, + |dao, record_id| dao.insert_weather_data(record_id, &data), + )?; } SuggestRecord::GlobalConfig(config) => { - self.ingest_record(writer, record, |dao, _| { - dao.put_global_config(&SuggestGlobalConfig::from(&config)) - })?; + self.ingest_record( + &SuggestRecordType::GlobalConfig.last_ingest_meta_key(), + writer, + record, + |dao, _| dao.put_global_config(&SuggestGlobalConfig::from(&config)), + )?; } } } @@ -453,6 +550,7 @@ where fn ingest_record( &self, + last_ingest_key: &str, writer: &SuggestDb, record: &RemoteSettingsRecord, ingestion_handler: impl FnOnce(&mut SuggestDao<'_>, &SuggestRecordId) -> Result<()>, @@ -469,12 +567,13 @@ where // Ingest (or re-ingest) all data in the record. ingestion_handler(dao, &record_id)?; - dao.handle_ingested_record(record) + dao.handle_ingested_record(last_ingest_key, record) }) } fn ingest_attachment( &self, + last_ingest_key: &str, writer: &SuggestDb, record: &RemoteSettingsRecord, ingestion_handler: impl FnOnce(&mut SuggestDao<'_>, &SuggestRecordId, &[T]) -> Result<()>, @@ -486,20 +585,72 @@ where // This method should be called only when a record is expected to // have an attachment. If it doesn't have one, it's malformed, so // skip to the next record. - writer.write(|dao| dao.put_last_ingest_if_newer(record.last_modified))?; + writer + .write(|dao| dao.put_last_ingest_if_newer(last_ingest_key, record.last_modified))?; return Ok(()); }; let attachment_data = self.settings_client.get_attachment(&attachment.location)?; match serde_json::from_slice::>(&attachment_data) { - Ok(attachment) => self.ingest_record(writer, record, |dao, record_id| { - ingestion_handler(dao, record_id, attachment.suggestions()) - }), + Ok(attachment) => { + self.ingest_record(last_ingest_key, writer, record, |dao, record_id| { + ingestion_handler(dao, record_id, attachment.suggestions()) + }) + } Err(_) => writer.write(|dao| dao.handle_unparsable_record(record)), } } } +#[cfg(feature = "benchmark_api")] +impl SuggestStoreInner +where + S: SuggestRemoteSettingsClient, +{ + pub fn into_settings_client(self) -> S { + self.settings_client + } + + pub fn ensure_db_initialized(&self) { + self.dbs().unwrap(); + } + + pub fn benchmark_ingest_records_by_type(&self, ingest_record_type: SuggestRecordType) { + self.ingest_records_by_type( + ingest_record_type, + &self.dbs().unwrap().writer, + &SuggestIngestionConstraints::default(), + ) + .unwrap() + } + + pub fn table_row_counts(&self) -> Vec<(String, u32)> { + use sql_support::ConnExt; + + // Note: since this is just used for debugging, use unwrap to simplify the error handling. + let reader = &self.dbs().unwrap().reader; + let conn = reader.conn.lock(); + let table_names: Vec = conn + .query_rows_and_then( + "SELECT name FROM sqlite_master where type = 'table'", + (), + |row| row.get(0), + ) + .unwrap(); + let mut table_names_with_counts: Vec<(String, u32)> = table_names + .into_iter() + .map(|name| { + let count: u32 = conn + .query_one(&format!("SELECT COUNT(*) FROM {name}")) + .unwrap(); + (name, count) + }) + .collect(); + table_names_with_counts.sort_by(|a, b| (b.1.cmp(&a.1))); + table_names_with_counts + } +} + /// Holds a store's open connections to the Suggest database. struct SuggestStoreDbs { /// A read-write connection used to update the database with new data. @@ -549,10 +700,6 @@ mod tests { "file:test_store_data_{}?mode=memory&cache=shared", hex::encode(unique_suffix), ), - format!( - "file:test_store_cache_{}?mode=memory&cache=shared", - hex::encode(unique_suffix), - ), settings_client, ) } @@ -721,7 +868,14 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(15)); + assert_eq!( + dao.get_meta::( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15) + ); expect![[r#" [ Amp { @@ -729,6 +883,7 @@ mod tests { url: "https://www.lph-nm.biz", raw_url: "https://www.lph-nm.biz", icon: None, + icon_mimetype: None, full_keyword: "los", block_id: 0, advertiser: "Los Pollos Hermanos", @@ -834,6 +989,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -872,6 +1030,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "penne", block_id: 0, advertiser: "Good Place Eats", @@ -895,6 +1056,267 @@ mod tests { Ok(()) } + #[test] + fn ingest_full_keywords() -> anyhow::Result<()> { + before_each(); + + let snapshot = Snapshot::with_records(json!([{ + "id": "1", + "type": "data", + "last_modified": 15, + "attachment": { + "filename": "data-1.json", + "mimetype": "application/json", + "location": "data-1.json", + "hash": "", + "size": 0, + }, + }, { + "id": "2", + "type": "data", + "last_modified": 15, + "attachment": { + "filename": "data-2.json", + "mimetype": "application/json", + "location": "data-2.json", + "hash": "", + "size": 0, + }, + }, { + "id": "3", + "type": "data", + "last_modified": 15, + "attachment": { + "filename": "data-3.json", + "mimetype": "application/json", + "location": "data-3.json", + "hash": "", + "size": 0, + }, + }, { + "id": "4", + "type": "amp-mobile-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-4.json", + "mimetype": "application/json", + "location": "data-4.json", + "hash": "", + "size": 0, + }, + }]))? + // AMP attachment with full keyword data + .with_data( + "data-1.json", + json!([{ + "id": 0, + "advertiser": "Los Pollos Hermanos", + "iab_category": "8 - Food & Drink", + "keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"], + "full_keywords": [ + // Full keyword for the first 4 keywords + ("los pollos", 4), + // Full keyword for the next 2 keywords + ("los pollos hermanos (restaurant)", 2), + ], + "title": "Los Pollos Hermanos - Albuquerque - 1", + "url": "https://www.lph-nm.biz", + "icon": "5678", + "impression_url": "https://example.com/impression_url", + "click_url": "https://example.com/click_url", + "score": 0.3 + }]), + )? + // AMP attachment without a full keyword + .with_data( + "data-2.json", + json!([{ + "id": 1, + "advertiser": "Los Pollos Hermanos", + "iab_category": "8 - Food & Drink", + "keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"], + "title": "Los Pollos Hermanos - Albuquerque - 2", + "url": "https://www.lph-nm.biz", + "icon": "5678", + "impression_url": "https://example.com/impression_url", + "click_url": "https://example.com/click_url", + "score": 0.3 + }]), + )? + // Wikipedia attachment with full keyword data. We should ignore the full + // keyword data for Wikipedia suggestions + .with_data( + "data-3.json", + json!([{ + "id": 2, + "advertiser": "Wikipedia", + "keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"], + "title": "Los Pollos Hermanos - Albuquerque - Wiki", + "full_keywords": [ + ("Los Pollos Hermanos - Albuquerque", 6), + ], + "url": "https://www.lph-nm.biz", + "icon": "5678", + "score": 0.3, + }]), + )? + // Amp mobile suggestion, this is essentially the same as 1, except for the SuggestionProvider + .with_data( + "data-4.json", + json!([{ + "id": 0, + "advertiser": "Los Pollos Hermanos", + "iab_category": "8 - Food & Drink", + "keywords": ["lo", "los", "los p", "los pollos", "los pollos h", "los pollos hermanos"], + "full_keywords": [ + // Full keyword for the first 4 keywords + ("los pollos", 4), + // Full keyword for the next 2 keywords + ("los pollos hermanos (restaurant)", 2), + ], + "title": "Los Pollos Hermanos - Albuquerque - 4", + "url": "https://www.lph-nm.biz", + "icon": "5678", + "impression_url": "https://example.com/impression_url", + "click_url": "https://example.com/click_url", + "score": 0.3 + }]), + )?; + + let store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot)); + + store.ingest(SuggestIngestionConstraints::default())?; + + store.dbs()?.reader.read(|dao| { + // This one should match the first full keyword for the first AMP item. + expect![[r#" + [ + Amp { + title: "Los Pollos Hermanos - Albuquerque - 1", + url: "https://www.lph-nm.biz", + raw_url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los pollos", + block_id: 0, + advertiser: "Los Pollos Hermanos", + iab_category: "8 - Food & Drink", + impression_url: "https://example.com/impression_url", + click_url: "https://example.com/click_url", + raw_click_url: "https://example.com/click_url", + score: 0.3, + }, + Amp { + title: "Los Pollos Hermanos - Albuquerque - 2", + url: "https://www.lph-nm.biz", + raw_url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los", + block_id: 1, + advertiser: "Los Pollos Hermanos", + iab_category: "8 - Food & Drink", + impression_url: "https://example.com/impression_url", + click_url: "https://example.com/click_url", + raw_click_url: "https://example.com/click_url", + score: 0.3, + }, + ] + "#]] + .assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery { + keyword: "lo".into(), + providers: vec![SuggestionProvider::Amp], + limit: None, + })?); + // This one should match the second full keyword for the first AMP item. + expect![[r#" + [ + Amp { + title: "Los Pollos Hermanos - Albuquerque - 1", + url: "https://www.lph-nm.biz", + raw_url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los pollos hermanos (restaurant)", + block_id: 0, + advertiser: "Los Pollos Hermanos", + iab_category: "8 - Food & Drink", + impression_url: "https://example.com/impression_url", + click_url: "https://example.com/click_url", + raw_click_url: "https://example.com/click_url", + score: 0.3, + }, + Amp { + title: "Los Pollos Hermanos - Albuquerque - 2", + url: "https://www.lph-nm.biz", + raw_url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los pollos hermanos", + block_id: 1, + advertiser: "Los Pollos Hermanos", + iab_category: "8 - Food & Drink", + impression_url: "https://example.com/impression_url", + click_url: "https://example.com/click_url", + raw_click_url: "https://example.com/click_url", + score: 0.3, + }, + ] + "#]] + .assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery { + keyword: "los pollos h".into(), + providers: vec![SuggestionProvider::Amp], + limit: None, + })?); + // This one matches a Wikipedia suggestion, so the full keyword should be ignored + expect![[r#" + [ + Wikipedia { + title: "Los Pollos Hermanos - Albuquerque - Wiki", + url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los", + }, + ] + "#]] + .assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery { + keyword: "los".into(), + providers: vec![SuggestionProvider::Wikipedia], + limit: None, + })?); + // This one matches a Wikipedia suggestion, so the full keyword should be ignored + expect![[r#" + [ + Amp { + title: "Los Pollos Hermanos - Albuquerque - 4", + url: "https://www.lph-nm.biz", + raw_url: "https://www.lph-nm.biz", + icon: None, + icon_mimetype: None, + full_keyword: "los pollos hermanos (restaurant)", + block_id: 0, + advertiser: "Los Pollos Hermanos", + iab_category: "8 - Food & Drink", + impression_url: "https://example.com/impression_url", + click_url: "https://example.com/click_url", + raw_click_url: "https://example.com/click_url", + score: 0.3, + }, + ] + "#]] + .assert_debug_eq(&dao.fetch_suggestions(&SuggestionQuery { + keyword: "los pollos h".into(), + providers: vec![SuggestionProvider::AmpMobile], + limit: None, + })?); + + Ok(()) + })?; + + Ok(()) + } + /// Tests ingesting a data attachment containing a single suggestion, /// instead of an array of suggestions. #[test] @@ -941,6 +1363,7 @@ mod tests { url: "https://www.lasagna.restaurant", raw_url: "https://www.lasagna.restaurant", icon: None, + icon_mimetype: None, full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -1014,7 +1437,14 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(15u64)); + assert_eq!( + dao.get_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15u64) + ); expect![[r#" [ Amp { @@ -1022,6 +1452,7 @@ mod tests { url: "https://www.lasagna.restaurant", raw_url: "https://www.lasagna.restaurant", icon: None, + icon_mimetype: None, full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -1084,8 +1515,15 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; - store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(30u64)); + store.dbs()?.reader.read(|dao: &SuggestDao<'_>| { + assert_eq!( + dao.get_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(30u64) + ); assert!(dao .fetch_suggestions(&SuggestionQuery { keyword: "la".into(), @@ -1100,6 +1538,7 @@ mod tests { url: "https://www.lph-nm.biz", raw_url: "https://www.lph-nm.biz", icon: None, + icon_mimetype: None, full_keyword: "los pollos", block_id: 0, advertiser: "Los Pollos Hermanos", @@ -1123,6 +1562,7 @@ mod tests { url: "https://penne.biz", raw_url: "https://penne.biz", icon: None, + icon_mimetype: None, full_keyword: "penne", block_id: 0, advertiser: "Good Place Eats", @@ -1219,7 +1659,10 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(25u64)); + assert_eq!( + dao.get_meta(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(25u64) + ); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, @@ -1259,7 +1702,10 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(35u64)); + assert_eq!( + dao.get_meta(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(35u64) + ); expect![[r#" [ Amp { @@ -1286,6 +1732,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -1327,6 +1776,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "los", block_id: 0, advertiser: "Los Pollos Hermanos", @@ -1422,7 +1874,10 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(15u64)); + assert_eq!( + dao.get_meta(SuggestRecordType::Amo.last_ingest_meta_key().as_str())?, + Some(15u64) + ); expect![[r#" [ @@ -1513,7 +1968,10 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta(LAST_INGEST_META_KEY)?, Some(30u64)); + assert_eq!( + dao.get_meta(SuggestRecordType::Amo.last_ingest_meta_key().as_str())?, + Some(30u64) + ); expect![[r#" [ @@ -1648,13 +2106,24 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(20)); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, 1 ); assert_eq!(dao.conn.query_one::("SELECT count(*) FROM icons")?, 1); + assert_eq!( + dao.get_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15) + ); + assert_eq!( + dao.get_meta(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(20) + ); Ok(()) })?; @@ -1674,14 +2143,16 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(30)); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, 0 ); assert_eq!(dao.conn.query_one::("SELECT count(*) FROM icons")?, 0); - + assert_eq!( + dao.get_meta(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(30) + ); Ok(()) })?; @@ -1717,6 +2188,7 @@ mod tests { for (max_suggestions, expected_limit) in table { store.ingest(SuggestIngestionConstraints { max_suggestions: Some(max_suggestions), + providers: Some(vec![SuggestionProvider::Amp]), })?; let actual_limit = store .settings_client @@ -1772,7 +2244,14 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(15)); + assert_eq!( + dao.get_meta::( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15) + ); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, @@ -1789,7 +2268,14 @@ mod tests { store.clear()?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, None); + assert_eq!( + dao.get_meta::( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + None + ); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, @@ -2080,6 +2566,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -2143,6 +2632,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "multimatch", }, ] @@ -2199,6 +2691,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "multimatch", }, ] @@ -2268,6 +2763,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -2349,6 +2847,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "california", }, Wikipedia { @@ -2370,6 +2871,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "california", }, ] @@ -2403,6 +2907,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "california", }, ] @@ -2614,6 +3121,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2647,6 +3157,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2680,6 +3193,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2735,6 +3251,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2779,6 +3298,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2812,6 +3334,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2856,6 +3381,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2889,6 +3417,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2933,6 +3464,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2966,6 +3500,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -2999,6 +3536,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3032,6 +3572,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3076,6 +3619,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3109,6 +3655,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3186,6 +3735,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3219,6 +3771,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3252,6 +3807,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -3285,6 +3843,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: true, subject_exact_match: true, @@ -3318,6 +3879,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3362,6 +3926,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3395,6 +3962,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -3428,6 +3998,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3483,6 +4056,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3516,6 +4092,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3549,6 +4128,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3593,6 +4175,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/svg+xml", + ), score: 0.5, has_location_sign: false, subject_exact_match: false, @@ -3734,6 +4319,7 @@ mod tests { url: "https://www.lasagna.restaurant", raw_url: "https://www.lasagna.restaurant", icon: None, + icon_mimetype: None, full_keyword: "amp wiki match", block_id: 0, advertiser: "Good Place Eats", @@ -3762,6 +4348,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "amp wiki match", }, Amp { @@ -3769,6 +4358,7 @@ mod tests { url: "https://penne.biz", raw_url: "https://penne.biz", icon: None, + icon_mimetype: None, full_keyword: "amp wiki match", block_id: 0, advertiser: "Good Place Eats", @@ -3801,6 +4391,7 @@ mod tests { url: "https://www.lasagna.restaurant", raw_url: "https://www.lasagna.restaurant", icon: None, + icon_mimetype: None, full_keyword: "amp wiki match", block_id: 0, advertiser: "Good Place Eats", @@ -3829,6 +4420,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "amp wiki match", }, ] @@ -3873,6 +4467,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "pocket wiki match", }, Pocket { @@ -4193,6 +4790,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -4234,6 +4834,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -4275,6 +4878,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -4304,6 +4910,9 @@ mod tests { 110, ], ), + icon_mimetype: Some( + "image/png", + ), full_keyword: "lasagna", block_id: 0, advertiser: "Good Place Eats", @@ -4365,7 +4974,10 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(45)); + assert_eq!( + dao.get_meta::(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(45) + ); assert_eq!( dao.conn .query_one::("SELECT count(*) FROM suggestions")?, @@ -4400,16 +5012,19 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(30)); + assert_eq!( + dao.get_meta("last_quicksuggest_ingest_unparsable")?, + Some(30) + ); expect![[r#" Some( UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, }, ), @@ -4469,16 +5084,19 @@ mod tests { store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(30)); + assert_eq!( + dao.get_meta("last_quicksuggest_ingest_unparsable")?, + Some(30) + ); expect![[r#" Some( UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, }, ), @@ -4505,13 +5123,105 @@ mod tests { let store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot)); store.dbs()?.writer.write(|dao| { - dao.put_meta(LAST_INGEST_META_KEY, 30)?; + dao.put_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str(), + 30, + )?; Ok(()) })?; store.ingest(SuggestIngestionConstraints::default())?; store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(30)); + assert_eq!( + dao.get_meta::( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(30) + ); + Ok(()) + })?; + + Ok(()) + } + + /// Tests that we only ingest providers that we're concerned with. + #[test] + fn ingest_constraints_provider() -> anyhow::Result<()> { + before_each(); + + let snapshot = Snapshot::with_records(json!([{ + "id": "data-1", + "type": "data", + "last_modified": 15, + }, { + "id": "icon-1", + "type": "icon", + "last_modified": 30, + }]))?; + + let store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot)); + store.dbs()?.writer.write(|dao| { + // Check that existing data is updated properly. + dao.put_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str(), + 10, + )?; + Ok(()) + })?; + + let constraints = SuggestIngestionConstraints { + max_suggestions: Some(100), + providers: Some(vec![SuggestionProvider::Amp, SuggestionProvider::Pocket]), + }; + store.ingest(constraints)?; + + store.dbs()?.reader.read(|dao| { + assert_eq!( + dao.get_meta::( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15) + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::Icon.last_ingest_meta_key().as_str())?, + Some(30) + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::Pocket.last_ingest_meta_key().as_str())?, + None + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::Amo.last_ingest_meta_key().as_str())?, + None + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::Yelp.last_ingest_meta_key().as_str())?, + None + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::Mdn.last_ingest_meta_key().as_str())?, + None + ); + assert_eq!( + dao.get_meta::(SuggestRecordType::AmpMobile.last_ingest_meta_key().as_str())?, + None + ); + assert_eq!( + dao.get_meta::( + SuggestRecordType::GlobalConfig + .last_ingest_meta_key() + .as_str() + )?, + None + ); Ok(()) })?; @@ -4582,10 +5292,10 @@ mod tests { UnparsableRecords( { "clippy-2": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, "fancy-new-suggestions-1": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, }, ), @@ -4671,7 +5381,7 @@ mod tests { UnparsableRecords( { "invalid-attachment": UnparsableRecord { - schema_version: 14, + schema_version: 18, }, }, ), @@ -4683,7 +5393,14 @@ mod tests { // Test that the valid record was read store.dbs()?.reader.read(|dao| { - assert_eq!(dao.get_meta::(LAST_INGEST_META_KEY)?, Some(15)); + assert_eq!( + dao.get_meta( + SuggestRecordType::AmpWikipedia + .last_ingest_meta_key() + .as_str() + )?, + Some(15) + ); expect![[r#" [ Amp { @@ -4691,6 +5408,7 @@ mod tests { url: "https://www.lph-nm.biz", raw_url: "https://www.lph-nm.biz", icon: None, + icon_mimetype: None, full_keyword: "los", block_id: 0, advertiser: "Los Pollos Hermanos", @@ -4899,6 +5617,7 @@ mod tests { url: "https://www.yelp.com/search?find_desc=ramen", title: "ramen", icon: None, + icon_mimetype: None, score: 0.5, has_location_sign: false, subject_exact_match: true, @@ -5313,4 +6032,204 @@ mod tests { Ok(()) } + + #[test] + fn remove_dismissed_suggestions() -> anyhow::Result<()> { + before_each(); + + let snapshot = Snapshot::with_records(json!([{ + "id": "data-1", + "type": "data", + "last_modified": 15, + "attachment": { + "filename": "data-1.json", + "mimetype": "application/json", + "location": "data-1.json", + "hash": "", + "size": 0, + }, + + }, { + "id": "data-2", + "type": "amo-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-2.json", + "mimetype": "application/json", + "location": "data-2.json", + "hash": "", + "size": 0, + }, + }, { + "id": "data-3", + "type": "pocket-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-3.json", + "mimetype": "application/json", + "location": "data-3.json", + "hash": "", + "size": 0, + }, + }, { + "id": "data-5", + "type": "mdn-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-5.json", + "mimetype": "application/json", + "location": "data-5.json", + "hash": "", + "size": 0, + }, + }, { + "id": "data-6", + "type": "amp-mobile-suggestions", + "last_modified": 15, + "attachment": { + "filename": "data-6.json", + "mimetype": "application/json", + "location": "data-6.json", + "hash": "", + "size": 0, + }, + }, { + "id": "icon-2", + "type": "icon", + "last_modified": 20, + "attachment": { + "filename": "icon-2.png", + "mimetype": "image/png", + "location": "icon-2.png", + "hash": "", + "size": 0, + }, + }, { + "id": "icon-3", + "type": "icon", + "last_modified": 25, + "attachment": { + "filename": "icon-3.png", + "mimetype": "image/png", + "location": "icon-3.png", + "hash": "", + "size": 0, + }, + }]))? + .with_data( + "data-1.json", + json!([{ + "id": 0, + "advertiser": "Good Place Eats", + "iab_category": "8 - Food & Drink", + "keywords": ["cats"], + "title": "Lasagna Come Out Tomorrow", + "url": "https://www.lasagna.restaurant", + "icon": "2", + "impression_url": "https://example.com/impression_url", + "click_url": "https://example.com/click_url", + "score": 0.31 + }, { + "id": 0, + "advertiser": "Wikipedia", + "iab_category": "5 - Education", + "keywords": ["cats"], + "title": "California", + "url": "https://wikipedia.org/California", + "icon": "3" + }]), + )? + .with_data( + "data-2.json", + json!([ + { + "description": "amo suggestion", + "url": "https://addons.mozilla.org/en-US/firefox/addon/example", + "guid": "{b9db16a4-6edc-47ec-a1f4-b86292ed211d}", + "keywords": ["cats"], + "title": "Firefox Relay", + "icon": "https://addons.mozilla.org/user-media/addon_icons/2633/2633704-64.png?modified=2c11a80b", + "rating": "4.9", + "number_of_ratings": 888, + "score": 0.32 + }, + ]), + )? + .with_data( + "data-3.json", + json!([ + { + "description": "pocket suggestion", + "url": "https://getpocket.com/collections/its-not-just-burnout-how-grind-culture-failed-women", + "lowConfidenceKeywords": [], + "highConfidenceKeywords": ["cats"], + "title": "‘It’s Not Just Burnout:’ How Grind Culture Fails Women", + "score": 0.33 + }, + ]), + )? + .with_data( + "data-5.json", + json!([ + { + "description": "Javascript Array", + "url": "https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Array", + "keywords": ["cats"], + "title": "Array", + "score": 0.24 + }, + ]), + )? + .with_data( + "data-6.json", + json!([ + { + "id": 0, + "advertiser": "Good Place Eats", + "iab_category": "8 - Food & Drink", + "keywords": ["cats"], + "title": "Mobile - Lasagna Come Out Tomorrow", + "url": "https://www.lasagna.restaurant", + "icon": "3", + "impression_url": "https://example.com/impression_url", + "click_url": "https://example.com/click_url", + "score": 0.26 + } + ]), + )? + .with_icon("icon-2.png", "i-am-an-icon".as_bytes().into()) + .with_icon("icon-3.png", "also-an-icon".as_bytes().into()); + + let store = unique_test_store(SnapshotSettingsClient::with_snapshot(snapshot)); + store.ingest(SuggestIngestionConstraints::default())?; + + // A query for cats should return all suggestions + let query = SuggestionQuery { + keyword: "cats".into(), + providers: vec![ + SuggestionProvider::Amp, + SuggestionProvider::Wikipedia, + SuggestionProvider::Amo, + SuggestionProvider::Pocket, + SuggestionProvider::Mdn, + SuggestionProvider::AmpMobile, + ], + limit: None, + }; + let results = store.query(query.clone())?; + assert_eq!(results.len(), 6); + + for result in results { + store.dismiss_suggestion(result.raw_url().unwrap().to_string())?; + } + + // After dismissing the suggestions, the next query shouldn't return them + assert_eq!(store.query(query.clone())?.len(), 0); + + // Clearing the dismissals should cause them to be returned again + store.clear_dismissed_suggestions()?; + assert_eq!(store.query(query.clone())?.len(), 6); + + Ok(()) + } } diff --git a/third_party/rust/suggest/src/suggest.udl b/third_party/rust/suggest/src/suggest.udl index 1cd8911a48..4a4e3fe9a0 100644 --- a/third_party/rust/suggest/src/suggest.udl +++ b/third_party/rust/suggest/src/suggest.udl @@ -40,6 +40,7 @@ interface Suggestion { string url, string raw_url, sequence? icon, + string? icon_mimetype, string full_keyword, i64 block_id, string advertiser, @@ -59,6 +60,7 @@ interface Suggestion { string title, string url, sequence? icon, + string? icon_mimetype, string full_keyword ); Amo( @@ -75,6 +77,7 @@ interface Suggestion { string url, string title, sequence? icon, + string? icon_mimetype, f64 score, boolean has_location_sign, boolean subject_exact_match, @@ -99,6 +102,7 @@ dictionary SuggestionQuery { dictionary SuggestIngestionConstraints { u64? max_suggestions = null; + sequence? providers = null; }; dictionary SuggestGlobalConfig { @@ -119,6 +123,12 @@ interface SuggestStore { [Throws=SuggestApiError] sequence query(SuggestionQuery query); + [Throws=SuggestApiError] + void dismiss_suggestion(string raw_suggestion_url); + + [Throws=SuggestApiError] + void clear_dismissed_suggestions(); + void interrupt(); [Throws=SuggestApiError] @@ -140,6 +150,7 @@ interface SuggestStoreBuilder { [Self=ByArc] SuggestStoreBuilder data_path(string path); + // Deprecated: this is no longer used by the suggest component. [Self=ByArc] SuggestStoreBuilder cache_path(string path); diff --git a/third_party/rust/suggest/src/suggestion.rs b/third_party/rust/suggest/src/suggestion.rs index f5425e3c73..c0b45524c7 100644 --- a/third_party/rust/suggest/src/suggestion.rs +++ b/third_party/rust/suggest/src/suggestion.rs @@ -29,6 +29,7 @@ pub enum Suggestion { url: String, raw_url: String, icon: Option>, + icon_mimetype: Option, full_keyword: String, block_id: i64, advertiser: String, @@ -48,6 +49,7 @@ pub enum Suggestion { title: String, url: String, icon: Option>, + icon_mimetype: Option, full_keyword: String, }, Amo { @@ -64,6 +66,7 @@ pub enum Suggestion { url: String, title: String, icon: Option>, + icon_mimetype: Option, score: f64, has_location_sign: bool, subject_exact_match: bool, @@ -106,6 +109,37 @@ impl Ord for Suggestion { } } +impl Suggestion { + /// Get the URL for this suggestion, if present + pub fn url(&self) -> Option<&str> { + match self { + Self::Amp { url, .. } + | Self::Pocket { url, .. } + | Self::Wikipedia { url, .. } + | Self::Amo { url, .. } + | Self::Yelp { url, .. } + | Self::Mdn { url, .. } => Some(url), + _ => None, + } + } + + /// Get the raw URL for this suggestion, if present + /// + /// This is the same as `url` except for Amp. In that case, `url` is the URL after being + /// "cooked" using template interpolation, while `raw_url` is the URL template. + pub fn raw_url(&self) -> Option<&str> { + match self { + Self::Amp { raw_url: url, .. } + | Self::Pocket { url, .. } + | Self::Wikipedia { url, .. } + | Self::Amo { url, .. } + | Self::Yelp { url, .. } + | Self::Mdn { url, .. } => Some(url), + _ => None, + } + } +} + impl Eq for Suggestion {} /// Replaces all template parameters in a "raw" sponsored suggestion URL, /// producing a "cooked" URL with real values. diff --git a/third_party/rust/suggest/src/yelp.rs b/third_party/rust/suggest/src/yelp.rs index 2413709c67..0eb6c10d56 100644 --- a/third_party/rust/suggest/src/yelp.rs +++ b/third_party/rust/suggest/src/yelp.rs @@ -52,7 +52,7 @@ const SUBJECT_PREFIX_MATCH_THRESHOLD: usize = 2; impl<'a> SuggestDao<'a> { /// Inserts the suggestions for Yelp attachment into the database. - pub fn insert_yelp_suggestions( + pub(crate) fn insert_yelp_suggestions( &mut self, record_id: &SuggestRecordId, suggestion: &DownloadedYelpSuggestion, @@ -130,7 +130,10 @@ impl<'a> SuggestDao<'a> { } /// Fetch Yelp suggestion from given user's query. - pub fn fetch_yelp_suggestions(&self, query: &SuggestionQuery) -> Result> { + pub(crate) fn fetch_yelp_suggestions( + &self, + query: &SuggestionQuery, + ) -> Result> { if !query.providers.contains(&SuggestionProvider::Yelp) { return Ok(vec![]); } @@ -144,7 +147,7 @@ impl<'a> SuggestDao<'a> { let Some((subject, subject_exact_match)) = self.find_subject(query_string)? else { return Ok(vec![]); }; - let (icon, score) = self.fetch_custom_details()?; + let (icon, icon_mimetype, score) = self.fetch_custom_details()?; let builder = SuggestionBuilder { subject: &subject, subject_exact_match, @@ -154,6 +157,7 @@ impl<'a> SuggestDao<'a> { location: None, need_location: false, icon, + icon_mimetype, score, }; return Ok(vec![builder.into()]); @@ -185,7 +189,7 @@ impl<'a> SuggestDao<'a> { return Ok(vec![]); }; - let (icon, score) = self.fetch_custom_details()?; + let (icon, icon_mimetype, score) = self.fetch_custom_details()?; let builder = SuggestionBuilder { subject: &subject, subject_exact_match, @@ -195,6 +199,7 @@ impl<'a> SuggestDao<'a> { location, need_location, icon, + icon_mimetype, score, }; Ok(vec![builder.into()]) @@ -204,6 +209,7 @@ impl<'a> SuggestDao<'a> { /// It returns the location tuple as follows: /// ( /// Option>: Icon data. If not found, returns None. + /// Option: Mimetype of the icon data. If not found, returns None. /// f64: Reflects score field in the yelp_custom_details table. /// ) /// @@ -212,11 +218,11 @@ impl<'a> SuggestDao<'a> { /// on Remote Settings. The following query will perform a table scan against /// `yelp_custom_details` followed by an index search against `icons`, /// which should be fine since there is only one record in the first table. - fn fetch_custom_details(&self) -> Result<(Option>, f64)> { + fn fetch_custom_details(&self) -> Result<(Option>, Option, f64)> { let result = self.conn.query_row_and_then_cachable( r#" SELECT - i.data, y.score + i.data, i.mimetype, y.score FROM yelp_custom_details y LEFT JOIN @@ -226,7 +232,13 @@ impl<'a> SuggestDao<'a> { 1 "#, (), - |row| -> Result<_> { Ok((row.get::<_, Option>>(0)?, row.get::<_, f64>(1)?)) }, + |row| -> Result<_> { + Ok(( + row.get::<_, Option>>(0)?, + row.get::<_, Option>(1)?, + row.get::<_, f64>(2)?, + )) + }, true, )?; @@ -439,6 +451,7 @@ struct SuggestionBuilder<'a> { location: Option, need_location: bool, icon: Option>, + icon_mimetype: Option, score: f64, } @@ -488,6 +501,7 @@ impl<'a> From> for Suggestion { url, title, icon: builder.icon, + icon_mimetype: builder.icon_mimetype, score: builder.score, has_location_sign: location_modifier.is_none() && builder.location_sign.is_some(), subject_exact_match: builder.subject_exact_match, -- cgit v1.2.3