diff options
Diffstat (limited to 'third_party/rust/relevancy')
-rw-r--r-- | third_party/rust/relevancy/.cargo-checksum.json | 2 | ||||
-rw-r--r-- | third_party/rust/relevancy/Cargo.toml | 13 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/db.rs | 68 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/error.rs | 15 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/ingest.rs | 394 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/interest.rs | 152 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/lib.rs | 80 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/populate_interests.rs | 157 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/relevancy.udl | 17 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/rs.rs | 60 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/url_hash.rs | 15 | ||||
-rw-r--r-- | third_party/rust/relevancy/test-data | bin | 192 -> 188 bytes |
12 files changed, 734 insertions, 239 deletions
diff --git a/third_party/rust/relevancy/.cargo-checksum.json b/third_party/rust/relevancy/.cargo-checksum.json index c8d8187c8d..bfa17d066a 100644 --- a/third_party/rust/relevancy/.cargo-checksum.json +++ b/third_party/rust/relevancy/.cargo-checksum.json @@ -1 +1 @@ -{"files":{"Cargo.toml":"76d64a839128f51662d1c10728ceddbb6a9ebdfce803915874cd654117d1b14e","build.rs":"a562bfe527d21c4e8a1a44b892defa83cdff141ec5dd51ed6f3862330e50ddd7","src/bin/generate-test-data.rs":"7f1c9dc445418c7627f89d1f2aa8e550d0f85b3d1f05edb7c378ab9441714f1f","src/db.rs":"0b45180f3031759213a0421231b6f109ed4f5c88aca556df159ce2717416cfec","src/error.rs":"6831fc329044174a8451b8b008c0b96c47404c591eb42e880562e65da0adfd0f","src/interest.rs":"ce6298ef8f69fcb57c8e5797467cbe1c0212a0d94daf828b12845740ac14a166","src/lib.rs":"7a0f0ad0a43f371035d9c0b73d143cf1b387d4b8cfad0d0db79314b5b91fd43c","src/populate_interests.rs":"b8905b52f9fc80719c175253b758413f606b27660e660635094421eec8b24c8f","src/relevancy.udl":"a3fae5097f9e8b39bb6c74ed6789906748c46f22d377e3dcb73b08731908f5bc","src/schema.rs":"f782c712f10c4f1af2f9e1424d6b52f59a2bacfcc452a8feb763f36478f5dd5d","src/url_hash.rs":"5619a249d471e7b642d889bad09e93212559c8b947010d49492c1423da2b310e","test-data":"392fc950363c9953ea6ab144b81d84021c4af1e1177cc0adac4eda5688c8bc33"},"package":null}
\ No newline at end of file +{"files":{"Cargo.toml":"2b7bf33e20b6aa768dd18619845e9d5d22235d86f770e94b250ed0052662ce2d","build.rs":"a562bfe527d21c4e8a1a44b892defa83cdff141ec5dd51ed6f3862330e50ddd7","src/bin/generate-test-data.rs":"7f1c9dc445418c7627f89d1f2aa8e550d0f85b3d1f05edb7c378ab9441714f1f","src/db.rs":"7ca5688c42d44ad6e5320208257d131c5c744be47a1cfe3e1380147abf2aadc3","src/error.rs":"0fe48e211dffb2010f732672c38e1c79b1995df3e70b06398ed8ac43d326c1b1","src/ingest.rs":"d3f528c1d62b4b6af404bb14cb0d431f8d523911ada09e4e1db5836b6cf44e04","src/interest.rs":"adbaa1e0324c7bb32b023f105b45499390a1a83973d1a8c7d727a661a25cc259","src/lib.rs":"29ce35211c9d94d561d62d7e8ef57fc56cc90a9ba42b88b54c2f4c9236a8cd4d","src/relevancy.udl":"b551e7476f30dccdc74cbf2f38fc3b87a3a7d0ec5dfa6c2ea4417b18fbc7475c","src/rs.rs":"b98091d0adca809d8fef38eb5394f885e04d4d382b7c8abd7bd0fe53f64e7bd6","src/schema.rs":"f782c712f10c4f1af2f9e1424d6b52f59a2bacfcc452a8feb763f36478f5dd5d","src/url_hash.rs":"2e908316fb70923644d1990dbf470d69ce2f5e99b0c5c3d95ec691590be8ffa5","test-data":"1ef2cd092d59e7e126cd4a514af983d449ed9f9c98708702fd237464a76c2b5e"},"package":null}
\ No newline at end of file diff --git a/third_party/rust/relevancy/Cargo.toml b/third_party/rust/relevancy/Cargo.toml index eddd8fd25c..67c1716ab2 100644 --- a/third_party/rust/relevancy/Cargo.toml +++ b/third_party/rust/relevancy/Cargo.toml @@ -25,9 +25,12 @@ license = "MPL-2.0" name = "generate-test-data" [dependencies] +anyhow = "1.0" +base64 = "0.21.2" log = "0.4" md-5 = "0.10" parking_lot = ">=0.11,<=0.12" +serde_json = "1" thiserror = "1.0" uniffi = "0.27.1" url = "2.5" @@ -35,10 +38,20 @@ url = "2.5" [dependencies.error-support] path = "../support/error" +[dependencies.interrupt-support] +path = "../support/interrupt" + +[dependencies.remote_settings] +path = "../remote_settings" + [dependencies.rusqlite] version = "0.30.0" features = ["bundled"] +[dependencies.serde] +version = "1" +features = ["derive"] + [dependencies.sql-support] path = "../support/sql" diff --git a/third_party/rust/relevancy/src/db.rs b/third_party/rust/relevancy/src/db.rs index 08684c45af..b2dc0b0c83 100644 --- a/third_party/rust/relevancy/src/db.rs +++ b/third_party/rust/relevancy/src/db.rs @@ -8,52 +8,66 @@ use crate::{ url_hash::{hash_url, UrlHash}, Interest, InterestVector, Result, }; -use parking_lot::Mutex; +use interrupt_support::SqlInterruptScope; use rusqlite::{Connection, OpenFlags}; -use sql_support::{open_database::open_database_with_flags, ConnExt}; +use sql_support::{ConnExt, LazyDb}; use std::path::Path; /// A thread-safe wrapper around an SQLite connection to the Relevancy database pub struct RelevancyDb { - pub conn: Mutex<Connection>, + reader: LazyDb<RelevancyConnectionInitializer>, + writer: LazyDb<RelevancyConnectionInitializer>, } impl RelevancyDb { - pub fn open(path: impl AsRef<Path>) -> Result<Self> { - let conn = open_database_with_flags( - path, - OpenFlags::SQLITE_OPEN_URI - | OpenFlags::SQLITE_OPEN_NO_MUTEX - | OpenFlags::SQLITE_OPEN_CREATE - | OpenFlags::SQLITE_OPEN_READ_WRITE, - &RelevancyConnectionInitializer, - )?; - Ok(Self { - conn: Mutex::new(conn), - }) + pub fn new(path: impl AsRef<Path>) -> Self { + // Note: use `SQLITE_OPEN_READ_WRITE` for both read and write connections. + // Even if we're opening a read connection, we may need to do a write as part of the + // initialization process. + // + // The read-only nature of the connection is enforced by the fact that [RelevancyDb::read] uses a + // shared ref to the `RelevancyDao`. + let db_open_flags = OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_READ_WRITE; + Self { + reader: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer), + writer: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer), + } + } + + pub fn close(&self) { + self.reader.close(true); + self.writer.close(true); + } + + pub fn interrupt(&self) { + self.reader.interrupt(); + self.writer.interrupt(); } #[cfg(test)] - pub fn open_for_test() -> Self { + pub fn new_for_test() -> Self { use std::sync::atomic::{AtomicU32, Ordering}; static COUNTER: AtomicU32 = AtomicU32::new(0); let count = COUNTER.fetch_add(1, Ordering::Relaxed); - Self::open(format!("file:test{count}.sqlite?mode=memory&cache=shared")).unwrap() + Self::new(format!("file:test{count}.sqlite?mode=memory&cache=shared")) } /// Accesses the Suggest database in a transaction for reading. pub fn read<T>(&self, op: impl FnOnce(&RelevancyDao) -> Result<T>) -> Result<T> { - let mut conn = self.conn.lock(); + let (mut conn, scope) = self.reader.lock()?; let tx = conn.transaction()?; - let dao = RelevancyDao::new(&tx); + let dao = RelevancyDao::new(&tx, scope); op(&dao) } /// Accesses the Suggest database in a transaction for reading and writing. pub fn read_write<T>(&self, op: impl FnOnce(&mut RelevancyDao) -> Result<T>) -> Result<T> { - let mut conn = self.conn.lock(); + let (mut conn, scope) = self.writer.lock()?; let tx = conn.transaction()?; - let mut dao = RelevancyDao::new(&tx); + let mut dao = RelevancyDao::new(&tx, scope); let result = op(&mut dao)?; tx.commit()?; Ok(result) @@ -67,11 +81,17 @@ impl RelevancyDb { /// reference (`&mut self`). pub struct RelevancyDao<'a> { pub conn: &'a Connection, + pub scope: SqlInterruptScope, } impl<'a> RelevancyDao<'a> { - fn new(conn: &'a Connection) -> Self { - Self { conn } + fn new(conn: &'a Connection, scope: SqlInterruptScope) -> Self { + Self { conn, scope } + } + + /// Return Err(Interrupted) if we were interrupted + pub fn err_if_interrupted(&self) -> Result<()> { + Ok(self.scope.err_if_interrupted()?) } /// Associate a URL with an interest @@ -98,7 +118,7 @@ impl<'a> RelevancyDao<'a> { ", )?; let interests = stmt.query_and_then((hash,), |row| -> Result<Interest> { - Ok(row.get::<_, u32>(0)?.into()) + row.get::<_, u32>(0)?.try_into() })?; let mut interest_vec = InterestVector::default(); diff --git a/third_party/rust/relevancy/src/error.rs b/third_party/rust/relevancy/src/error.rs index 93ca7aabaa..1d42ff2c03 100644 --- a/third_party/rust/relevancy/src/error.rs +++ b/third_party/rust/relevancy/src/error.rs @@ -23,6 +23,21 @@ pub enum Error { #[error("Error fetching interest data")] FetchInterestDataError, + + #[error("Interrupted")] + Interrupted(#[from] interrupt_support::Interrupted), + + #[error("Invalid interest code: {0}")] + InvalidInterestCode(u32), + + #[error("Remote Setting Error: {0}")] + RemoteSettingsError(#[from] remote_settings::RemoteSettingsError), + + #[error("Serde Json Error: {0}")] + SerdeJsonError(#[from] serde_json::Error), + + #[error("Base64 Decode Error: {0}")] + Base64DecodeError(String), } /// Result enum for the public API diff --git a/third_party/rust/relevancy/src/ingest.rs b/third_party/rust/relevancy/src/ingest.rs new file mode 100644 index 0000000000..dc01fbe019 --- /dev/null +++ b/third_party/rust/relevancy/src/ingest.rs @@ -0,0 +1,394 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use crate::db::RelevancyDao; +use crate::rs::{ + RelevancyAttachmentData, RelevancyRecord, RelevancyRemoteSettingsClient, + REMOTE_SETTINGS_COLLECTION, +}; +use crate::url_hash::UrlHash; +use crate::{Error, Interest, RelevancyDb, Result}; +use base64::{engine::general_purpose::STANDARD, Engine}; +use remote_settings::{Client, RemoteSettingsConfig, RemoteSettingsRecord, RemoteSettingsServer}; + +// Number of rows to write when inserting interest data before checking for interruption +const WRITE_CHUNK_SIZE: usize = 100; + +pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> { + if !db.read(|dao| dao.need_to_load_url_interests())? { + return Ok(()); + } + + match fetch_interest_data() { + Ok(data) => { + db.read_write(move |dao| insert_interest_data(data, dao))?; + } + Err(e) => { + log::warn!("error fetching interest data: {e}"); + return Err(Error::FetchInterestDataError); + } + } + Ok(()) +} + +fn fetch_interest_data() -> Result<Vec<(Interest, UrlHash)>> { + let rs = Client::new(RemoteSettingsConfig { + collection_name: REMOTE_SETTINGS_COLLECTION.to_string(), + server: Some(RemoteSettingsServer::Prod), + server_url: None, + bucket_name: None, + })?; + fetch_interest_data_inner(rs) +} + +/// Fetch the interest data +fn fetch_interest_data_inner( + rs: impl RelevancyRemoteSettingsClient, +) -> Result<Vec<(Interest, UrlHash)>> { + let remote_settings_response = rs.get_records()?; + let mut result = vec![]; + + for record in remote_settings_response.records { + let attachment_data = match &record.attachment { + None => return Err(Error::FetchInterestDataError), + Some(a) => rs.get_attachment(&a.location)?, + }; + let interest = get_interest(&record)?; + let urls = get_hash_urls(attachment_data)?; + result.extend(std::iter::repeat(interest).zip(urls)); + } + Ok(result) +} + +fn get_hash_urls(attachment_data: Vec<u8>) -> Result<Vec<UrlHash>> { + let mut hash_urls = vec![]; + + let parsed_attachment_data = + serde_json::from_slice::<Vec<RelevancyAttachmentData>>(&attachment_data)?; + + for attachment_data in parsed_attachment_data { + let hash_url = STANDARD + .decode(attachment_data.domain) + .map_err(|_| Error::Base64DecodeError("Invalid base64 error".to_string()))?; + let url_hash = hash_url.try_into().map_err(|_| { + Error::Base64DecodeError("Base64 string has wrong number of bytes".to_string()) + })?; + hash_urls.push(url_hash); + } + Ok(hash_urls) +} + +/// Extract Interest from the record info +fn get_interest(record: &RemoteSettingsRecord) -> Result<Interest> { + let record_fields: RelevancyRecord = + serde_json::from_value(serde_json::Value::Object(record.fields.clone()))?; + let custom_details = record_fields.record_custom_details; + let category_code = custom_details.category_to_domains.category_code; + Interest::try_from(category_code as u32) +} + +/// Insert Interests into Db +fn insert_interest_data(data: Vec<(Interest, UrlHash)>, dao: &mut RelevancyDao) -> Result<()> { + for chunk in data.chunks(WRITE_CHUNK_SIZE) { + dao.err_if_interrupted()?; + for (interest, hash_url) in chunk { + dao.add_url_interest(*hash_url, *interest)?; + } + } + + Ok(()) +} + +#[cfg(test)] +mod test { + + use std::{cell::RefCell, collections::HashMap}; + + use anyhow::Context; + use remote_settings::RemoteSettingsResponse; + use serde_json::json; + + use super::*; + use crate::{rs::RelevancyRemoteSettingsClient, url_hash::hash_url, InterestVector}; + + /// A snapshot containing fake Remote Settings records and attachments for + /// the store to ingest. We use snapshots to test the store's behavior in a + /// data-driven way. + struct Snapshot { + records: Vec<RemoteSettingsRecord>, + attachments: HashMap<&'static str, Vec<u8>>, + } + + impl Snapshot { + /// Creates a snapshot from a JSON value that represents a collection of + /// Relevancy Remote Settings records. + /// + /// You can use the [`serde_json::json!`] macro to construct the JSON + /// value, then pass it to this function. It's easier to use the + /// `Snapshot::with_records(json!(...))` idiom than to construct the + /// records by hand. + fn with_records(value: serde_json::Value) -> anyhow::Result<Self> { + Ok(Self { + records: serde_json::from_value(value) + .context("Couldn't create snapshot with Remote Settings records")?, + attachments: HashMap::new(), + }) + } + + /// Adds a data attachment to the snapshot. + fn with_data( + mut self, + location: &'static str, + value: serde_json::Value, + ) -> anyhow::Result<Self> { + self.attachments.insert( + location, + serde_json::to_vec(&value).context("Couldn't add data attachment to snapshot")?, + ); + Ok(self) + } + } + + /// A fake Remote Settings client that returns records and attachments from + /// a snapshot. + struct SnapshotSettingsClient { + /// The current snapshot. You can modify it using + /// [`RefCell::borrow_mut()`] to simulate remote updates in tests. + snapshot: RefCell<Snapshot>, + } + + impl SnapshotSettingsClient { + /// Creates a client with an initial snapshot. + fn with_snapshot(snapshot: Snapshot) -> Self { + Self { + snapshot: RefCell::new(snapshot), + } + } + } + + impl RelevancyRemoteSettingsClient for SnapshotSettingsClient { + fn get_records(&self) -> Result<RemoteSettingsResponse> { + let records = self.snapshot.borrow().records.clone(); + let last_modified = records + .iter() + .map(|record: &RemoteSettingsRecord| record.last_modified) + .max() + .unwrap_or(0); + Ok(RemoteSettingsResponse { + records, + last_modified, + }) + } + + fn get_attachment(&self, location: &str) -> Result<Vec<u8>> { + Ok(self + .snapshot + .borrow() + .attachments + .get(location) + .unwrap_or_else(|| unreachable!("Unexpected request for attachment `{}`", location)) + .clone()) + } + } + + #[test] + fn test_interest_vectors() { + let db = RelevancyDb::new_for_test(); + db.read_write(|dao| { + // Test that the interest data matches the values we started from in + // `bin/generate-test-data.rs` + + dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?; + dao.add_url_interest(hash_url("https://dogs.com").unwrap(), Interest::Animals)?; + dao.add_url_interest(hash_url("https://cars.com").unwrap(), Interest::Autos)?; + dao.add_url_interest( + hash_url("https://www.vouge.com").unwrap(), + Interest::Fashion, + )?; + dao.add_url_interest(hash_url("https://slashdot.org").unwrap(), Interest::Tech)?; + dao.add_url_interest(hash_url("https://www.nascar.com").unwrap(), Interest::Autos)?; + dao.add_url_interest( + hash_url("https://www.nascar.com").unwrap(), + Interest::Sports, + )?; + dao.add_url_interest( + hash_url("https://unknown.url").unwrap(), + Interest::Inconclusive, + )?; + + assert_eq!( + dao.get_url_interest_vector("https://espn.com/").unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://dogs.com/").unwrap(), + InterestVector { + animals: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://cars.com/").unwrap(), + InterestVector { + autos: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://www.vouge.com/") + .unwrap(), + InterestVector { + fashion: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://slashdot.org/") + .unwrap(), + InterestVector { + tech: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://www.nascar.com/") + .unwrap(), + InterestVector { + autos: 1, + sports: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://unknown.url/").unwrap(), + InterestVector { + inconclusive: 1, + ..InterestVector::default() + } + ); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_variations_on_the_url() { + let db = RelevancyDb::new_for_test(); + db.read_write(|dao| { + dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?; + dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Autos)?; + dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Sports)?; + + // Different paths/queries should work + assert_eq!( + dao.get_url_interest_vector("https://espn.com/foo/bar/?baz") + .unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + // Different schemes should too + assert_eq!( + dao.get_url_interest_vector("http://espn.com/").unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + // But changes to the domain shouldn't + assert_eq!( + dao.get_url_interest_vector("http://espn2.com/").unwrap(), + InterestVector::default() + ); + // However, extra components past the 2nd one in the domain are ignored + assert_eq!( + dao.get_url_interest_vector("https://www.nascar.com/") + .unwrap(), + InterestVector { + autos: 1, + sports: 1, + ..InterestVector::default() + } + ); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_parse_records() -> anyhow::Result<()> { + let snapshot = Snapshot::with_records(json!([{ + "id": "animals-0001", + "last_modified": 15, + "type": "category_to_domains", + "attachment": { + "filename": "data-1.json", + "mimetype": "application/json", + "location": "data-1.json", + "hash": "", + "size": 0 + }, + "record_custom_details": { + "category_to_domains": { + "category": "animals", + "category_code": 1, + "version": 1 + } + } + }]))? + .with_data( + "data-1.json", + json!([ + {"domain": "J2jtyjQtYQ/+/p//xhz43Q=="}, + {"domain": "Zd4awCwGZLkat59nIWje3g=="}]), + )?; + let rs_client = SnapshotSettingsClient::with_snapshot(snapshot); + assert_eq!( + fetch_interest_data_inner(rs_client).unwrap(), + vec![ + (Interest::Animals, hash_url("https://dogs.com").unwrap()), + (Interest::Animals, hash_url("https://cats.com").unwrap()) + ] + ); + + Ok(()) + } + + #[test] + fn test_parse_records_with_bad_domain_strings() -> anyhow::Result<()> { + let snapshot = Snapshot::with_records(json!([{ + "id": "animals-0001", + "last_modified": 15, + "type": "category_to_domains", + "attachment": { + "filename": "data-1.json", + "mimetype": "application/json", + "location": "data-1.json", + "hash": "", + "size": 0 + }, + "record_custom_details": { + "category_to_domains": { + "category": "animals", + "category_code": 1, + "version": 1 + } + } + }]))? + .with_data( + "data-1.json", + json!([ + {"domain": "badString"}, + {"domain": "notBase64"}]), + )?; + let rs_client = SnapshotSettingsClient::with_snapshot(snapshot); + fetch_interest_data_inner(rs_client).expect_err("Invalid base64 error"); + + Ok(()) + } +} diff --git a/third_party/rust/relevancy/src/interest.rs b/third_party/rust/relevancy/src/interest.rs index 0573c743fc..797df11236 100644 --- a/third_party/rust/relevancy/src/interest.rs +++ b/third_party/rust/relevancy/src/interest.rs @@ -2,32 +2,37 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +use crate::Error; + /// List of possible interests for a domain. Domains can have be associated with one or multiple /// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify /// because there's no corresponding entry in the interest database. #[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] #[repr(u32)] pub enum Interest { - Animals, - Arts, - Autos, - Business, - Career, - Education, - Fashion, - Finance, - Food, - Government, - Health, - Hobbies, - Home, - News, - RealEstate, - Society, - Sports, - Tech, - Travel, - Inconclusive, + // Note: if you change these codes, make sure to update the `TryFrom<u32>` implementation and + // the `test_interest_code_conversion` test. + Inconclusive = 0, + Animals = 1, + Arts = 2, + Autos = 3, + Business = 4, + Career = 5, + Education = 6, + Fashion = 7, + Finance = 8, + Food = 9, + Government = 10, + //Disable this per policy consultation + // Health = 11, + Hobbies = 12, + Home = 13, + News = 14, + RealEstate = 15, + Society = 16, + Sports = 17, + Tech = 18, + Travel = 19, } impl From<Interest> for u32 { @@ -42,22 +47,44 @@ impl From<Interest> for usize { } } -impl From<u32> for Interest { - fn from(code: u32) -> Self { - if code as usize > Self::COUNT { - panic!("Invalid interest code: {code}") +impl TryFrom<u32> for Interest { + // On error, return the invalid code back + type Error = Error; + + fn try_from(code: u32) -> Result<Self, Self::Error> { + match code { + 0 => Ok(Self::Inconclusive), + 1 => Ok(Self::Animals), + 2 => Ok(Self::Arts), + 3 => Ok(Self::Autos), + 4 => Ok(Self::Business), + 5 => Ok(Self::Career), + 6 => Ok(Self::Education), + 7 => Ok(Self::Fashion), + 8 => Ok(Self::Finance), + 9 => Ok(Self::Food), + 10 => Ok(Self::Government), + //Disable this per policy consultation + // 11 => Ok(Self::Health), + 12 => Ok(Self::Hobbies), + 13 => Ok(Self::Home), + 14 => Ok(Self::News), + 15 => Ok(Self::RealEstate), + 16 => Ok(Self::Society), + 17 => Ok(Self::Sports), + 18 => Ok(Self::Tech), + 19 => Ok(Self::Travel), + n => Err(Error::InvalidInterestCode(n)), } - // Safety: This is safe since Interest has a u32 representation and we've done a bounds - // check - unsafe { std::mem::transmute(code) } } } impl Interest { - const COUNT: usize = 20; + const COUNT: usize = 19; pub fn all() -> [Interest; Self::COUNT] { [ + Self::Inconclusive, Self::Animals, Self::Arts, Self::Autos, @@ -68,7 +95,7 @@ impl Interest { Self::Finance, Self::Food, Self::Government, - Self::Health, + // Self::Health, Self::Hobbies, Self::Home, Self::News, @@ -77,7 +104,6 @@ impl Interest { Self::Sports, Self::Tech, Self::Travel, - Self::Inconclusive, ] } } @@ -88,6 +114,7 @@ impl Interest { /// number of elements. #[derive(Debug, Default, PartialEq, Eq)] pub struct InterestVector { + pub inconclusive: u32, pub animals: u32, pub arts: u32, pub autos: u32, @@ -98,7 +125,7 @@ pub struct InterestVector { pub finance: u32, pub food: u32, pub government: u32, - pub health: u32, + // pub health: u32, pub hobbies: u32, pub home: u32, pub news: u32, @@ -107,7 +134,34 @@ pub struct InterestVector { pub sports: u32, pub tech: u32, pub travel: u32, - pub inconclusive: u32, +} + +impl std::ops::Add for InterestVector { + type Output = Self; + + fn add(self, other: Self) -> Self { + Self { + inconclusive: self.inconclusive + other.inconclusive, + animals: self.animals + other.animals, + arts: self.arts + other.arts, + autos: self.autos + other.autos, + business: self.business + other.business, + career: self.career + other.career, + education: self.education + other.education, + fashion: self.fashion + other.fashion, + finance: self.finance + other.finance, + food: self.food + other.food, + government: self.government + other.government, + hobbies: self.hobbies + other.hobbies, + home: self.home + other.home, + news: self.news + other.news, + real_estate: self.real_estate + other.real_estate, + society: self.society + other.society, + sports: self.sports + other.sports, + tech: self.tech + other.tech, + travel: self.travel + other.travel, + } + } } impl std::ops::Index<Interest> for InterestVector { @@ -115,6 +169,7 @@ impl std::ops::Index<Interest> for InterestVector { fn index(&self, index: Interest) -> &u32 { match index { + Interest::Inconclusive => &self.inconclusive, Interest::Animals => &self.animals, Interest::Arts => &self.arts, Interest::Autos => &self.autos, @@ -125,7 +180,7 @@ impl std::ops::Index<Interest> for InterestVector { Interest::Finance => &self.finance, Interest::Food => &self.food, Interest::Government => &self.government, - Interest::Health => &self.health, + // Interest::Health => &self.health, Interest::Hobbies => &self.hobbies, Interest::Home => &self.home, Interest::News => &self.news, @@ -134,7 +189,6 @@ impl std::ops::Index<Interest> for InterestVector { Interest::Sports => &self.sports, Interest::Tech => &self.tech, Interest::Travel => &self.travel, - Interest::Inconclusive => &self.inconclusive, } } } @@ -142,6 +196,7 @@ impl std::ops::Index<Interest> for InterestVector { impl std::ops::IndexMut<Interest> for InterestVector { fn index_mut(&mut self, index: Interest) -> &mut u32 { match index { + Interest::Inconclusive => &mut self.inconclusive, Interest::Animals => &mut self.animals, Interest::Arts => &mut self.arts, Interest::Autos => &mut self.autos, @@ -152,7 +207,7 @@ impl std::ops::IndexMut<Interest> for InterestVector { Interest::Finance => &mut self.finance, Interest::Food => &mut self.food, Interest::Government => &mut self.government, - Interest::Health => &mut self.health, + // Interest::Health => &mut self.health, Interest::Hobbies => &mut self.hobbies, Interest::Home => &mut self.home, Interest::News => &mut self.news, @@ -161,7 +216,32 @@ impl std::ops::IndexMut<Interest> for InterestVector { Interest::Sports => &mut self.sports, Interest::Tech => &mut self.tech, Interest::Travel => &mut self.travel, - Interest::Inconclusive => &mut self.inconclusive, } } } + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_interest_code_conversion() { + for interest in Interest::all() { + assert_eq!(Interest::try_from(u32::from(interest)).unwrap(), interest) + } + // try_from() for out of bounds codes should return an error + assert!(matches!( + Interest::try_from(20), + Err(Error::InvalidInterestCode(20)) + )); + assert!(matches!( + Interest::try_from(100), + Err(Error::InvalidInterestCode(100)) + )); + // Health is currently disabled, so it's code should return None for now + assert!(matches!( + Interest::try_from(11), + Err(Error::InvalidInterestCode(11)) + )); + } +} diff --git a/third_party/rust/relevancy/src/lib.rs b/third_party/rust/relevancy/src/lib.rs index 157a26277e..4bc774a261 100644 --- a/third_party/rust/relevancy/src/lib.rs +++ b/third_party/rust/relevancy/src/lib.rs @@ -11,8 +11,9 @@ mod db; mod error; +mod ingest; mod interest; -mod populate_interests; +mod rs; mod schema; pub mod url_hash; @@ -28,11 +29,18 @@ pub struct RelevancyStore { /// Top-level API for the Relevancy component impl RelevancyStore { - #[handle_error(Error)] - pub fn new(db_path: String) -> ApiResult<Self> { - Ok(Self { - db: RelevancyDb::open(db_path)?, - }) + pub fn new(db_path: String) -> Self { + Self { + db: RelevancyDb::new(db_path), + } + } + + pub fn close(&self) { + self.db.close() + } + + pub fn interrupt(&self) { + self.db.interrupt() } /// Ingest top URLs to build the user's interest vector. @@ -47,9 +55,21 @@ impl RelevancyStore { /// /// This method may execute for a long time and should only be called from a worker thread. #[handle_error(Error)] - pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> { - populate_interests::ensure_interest_data_populated(&self.db)?; - todo!() + pub fn ingest(&self, top_urls_by_frecency: Vec<String>) -> ApiResult<InterestVector> { + ingest::ensure_interest_data_populated(&self.db)?; + self.classify(top_urls_by_frecency) + } + + pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> { + // For experimentation purposes we are going to return an interest vector. + // Eventually we would want to store this data in the DB and incrementally update it. + let mut interest_vector = InterestVector::default(); + for url in top_urls_by_frecency { + let interest_count = self.db.read(|dao| dao.get_url_interest_vector(&url))?; + interest_vector = interest_vector + interest_count; + } + + Ok(interest_vector) } /// Calculate metrics for the validation phase @@ -79,3 +99,45 @@ pub struct InterestMetrics { } uniffi::include_scaffolding!("relevancy"); + +#[cfg(test)] +mod test { + use crate::url_hash::hash_url; + + use super::*; + + #[test] + fn test_ingest() { + let top_urls = vec![ + "https://food.com/".to_string(), + "https://hello.com".to_string(), + "https://pasta.com".to_string(), + "https://dog.com".to_string(), + ]; + let relevancy_store = + RelevancyStore::new("file:test_store_data?mode=memory&cache=shared".to_owned()); + relevancy_store + .db + .read_write(|dao| { + dao.add_url_interest(hash_url("https://food.com").unwrap(), Interest::Food)?; + dao.add_url_interest( + hash_url("https://hello.com").unwrap(), + Interest::Inconclusive, + )?; + dao.add_url_interest(hash_url("https://pasta.com").unwrap(), Interest::Food)?; + dao.add_url_interest(hash_url("https://dog.com").unwrap(), Interest::Animals)?; + Ok(()) + }) + .expect("Insert should succeed"); + + assert_eq!( + relevancy_store.ingest(top_urls).unwrap(), + InterestVector { + inconclusive: 1, + animals: 1, + food: 2, + ..InterestVector::default() + } + ); + } +} diff --git a/third_party/rust/relevancy/src/populate_interests.rs b/third_party/rust/relevancy/src/populate_interests.rs deleted file mode 100644 index e33b677dd6..0000000000 --- a/third_party/rust/relevancy/src/populate_interests.rs +++ /dev/null @@ -1,157 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result}; -use std::io::{Cursor, Read}; - -pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> { - if !db.read(|dao| dao.need_to_load_url_interests())? { - return Ok(()); - } - let interest_data = match fetch_interest_data() { - Ok(data) => data, - Err(e) => { - log::warn!("error fetching interest data: {e}"); - return Err(Error::FetchInterestDataError); - } - }; - db.read_write(move |dao| { - for (url_hash, interest) in interest_data { - dao.add_url_interest(url_hash, interest)?; - } - Ok(()) - }) -} - -/// Fetch the interest data -fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> { - // TODO: this hack should be replaced with something that fetches from remote settings - let bytes = include_bytes!("../test-data"); - let mut reader = Cursor::new(&bytes); - let mut data = vec![]; - - // Loop over all possible interests - for interest in Interest::all() { - // read the count - let mut buf = [0u8; 4]; - reader.read_exact(&mut buf)?; - let count = u32::from_le_bytes(buf); - for _ in 0..count { - let mut url_hash: UrlHash = [0u8; 16]; - reader.read_exact(&mut url_hash)?; - data.push((url_hash, interest)); - } - } - Ok(data) -} - -#[cfg(test)] -mod test { - use super::*; - use crate::InterestVector; - - #[test] - fn test_interest_vectors() { - let db = RelevancyDb::open_for_test(); - ensure_interest_data_populated(&db).unwrap(); - db.read(|dao| { - // Test that the interest data matches the values we started from in - // `bin/generate-test-data.rs` - assert_eq!( - dao.get_url_interest_vector("https://espn.com/").unwrap(), - InterestVector { - sports: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://dogs.com/").unwrap(), - InterestVector { - animals: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://cars.com/").unwrap(), - InterestVector { - autos: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://www.vouge.com/") - .unwrap(), - InterestVector { - fashion: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://slashdot.org/") - .unwrap(), - InterestVector { - tech: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://www.nascar.com/") - .unwrap(), - InterestVector { - autos: 1, - sports: 1, - ..InterestVector::default() - } - ); - assert_eq!( - dao.get_url_interest_vector("https://unknown.url/").unwrap(), - InterestVector::default() - ); - Ok(()) - }) - .unwrap(); - } - - #[test] - fn test_variations_on_the_url() { - let db = RelevancyDb::open_for_test(); - ensure_interest_data_populated(&db).unwrap(); - db.read(|dao| { - // Different paths/queries should work - assert_eq!( - dao.get_url_interest_vector("https://espn.com/foo/bar/?baz") - .unwrap(), - InterestVector { - sports: 1, - ..InterestVector::default() - } - ); - // Different schemes should too - assert_eq!( - dao.get_url_interest_vector("http://espn.com/").unwrap(), - InterestVector { - sports: 1, - ..InterestVector::default() - } - ); - // But changes to the domain shouldn't - assert_eq!( - dao.get_url_interest_vector("http://www.espn.com/").unwrap(), - InterestVector::default() - ); - // However, extra components past the 3rd one in the domain are ignored - assert_eq!( - dao.get_url_interest_vector("https://foo.www.nascar.com/") - .unwrap(), - InterestVector { - autos: 1, - sports: 1, - ..InterestVector::default() - } - ); - Ok(()) - }) - .unwrap(); - } -} diff --git a/third_party/rust/relevancy/src/relevancy.udl b/third_party/rust/relevancy/src/relevancy.udl index e07243ec28..ba9eb09969 100644 --- a/third_party/rust/relevancy/src/relevancy.udl +++ b/third_party/rust/relevancy/src/relevancy.udl @@ -8,12 +8,21 @@ interface RelevancyApiError { // Top-level class for the Relevancy component interface RelevancyStore { // Construct a new RelevancyStore - [Throws=RelevancyApiError] + // + // This is non-blocking since databases and other resources are lazily opened. constructor(string dbpath); + // Close any open resources (for example databases) + // + // Calling `close` will interrupt any in-progress queries on other threads. + void close(); + + // Interrupt any current database queries + void interrupt(); + // Ingest the top URLs by frequency to build up the user's interest vector [Throws=RelevancyApiError] - void ingest(sequence<string> top_urls); + InterestVector ingest(sequence<string> top_urls); // Calculate metrics for the user's interest vector in order to measure how strongly we're // identifying interests. See the `InterestMetrics` struct for details. @@ -39,7 +48,7 @@ enum Interest { "Finance", "Food", "Government", - "Health", + // "Health", "Hobbies", "Home", "News", @@ -93,7 +102,7 @@ dictionary InterestVector { u32 finance; u32 food; u32 government; - u32 health; + // u32 health; u32 hobbies; u32 home; u32 news; diff --git a/third_party/rust/relevancy/src/rs.rs b/third_party/rust/relevancy/src/rs.rs new file mode 100644 index 0000000000..bc8cc938e8 --- /dev/null +++ b/third_party/rust/relevancy/src/rs.rs @@ -0,0 +1,60 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +use crate::Result; +use remote_settings::RemoteSettingsResponse; +use serde::Deserialize; +/// The Remote Settings collection name. +pub(crate) const REMOTE_SETTINGS_COLLECTION: &str = "content-relevance"; + +/// A trait for a client that downloads records from Remote Settings. +/// +/// This trait lets tests use a mock client. +pub(crate) trait RelevancyRemoteSettingsClient { + /// Fetches records from the Suggest Remote Settings collection. + fn get_records(&self) -> Result<RemoteSettingsResponse>; + + /// Fetches a record's attachment from the Suggest Remote Settings + /// collection. + fn get_attachment(&self, location: &str) -> Result<Vec<u8>>; +} + +impl RelevancyRemoteSettingsClient for remote_settings::Client { + fn get_records(&self) -> Result<RemoteSettingsResponse> { + Ok(remote_settings::Client::get_records(self)?) + } + + fn get_attachment(&self, location: &str) -> Result<Vec<u8>> { + Ok(remote_settings::Client::get_attachment(self, location)?) + } +} + +/// A record in the Relevancy Remote Settings collection. +#[derive(Clone, Debug, Deserialize)] +pub struct RelevancyRecord { + #[serde(rename = "type")] + pub record_type: String, + pub record_custom_details: RecordCustomDetails, +} + +// Custom details related to category of the record. +#[derive(Clone, Debug, Deserialize)] +pub struct RecordCustomDetails { + pub category_to_domains: CategoryToDomains, +} + +/// Category information related to the record. +#[derive(Clone, Debug, Deserialize)] +pub struct CategoryToDomains { + pub version: i32, + pub category: String, + pub category_code: i32, +} + +/// A downloaded Remote Settings attachment that contains domain data. +#[derive(Clone, Debug, Deserialize)] +pub struct RelevancyAttachmentData { + pub domain: String, +} diff --git a/third_party/rust/relevancy/src/url_hash.rs b/third_party/rust/relevancy/src/url_hash.rs index d31a45d06b..c010dcaf12 100644 --- a/third_party/rust/relevancy/src/url_hash.rs +++ b/third_party/rust/relevancy/src/url_hash.rs @@ -8,11 +8,10 @@ use url::{Host, Url}; pub type UrlHash = [u8; 16]; /// Given a URL, extract the part of it that we want to use to identify it. -/// -/// We currently use the final 3 components of the URL domain. -/// -/// TODO: decide if this should be 3 or 3 components. pub fn url_hash_source(url: &str) -> Option<String> { + // We currently use the final 2 components of the URL domain. + const URL_COMPONENTS_TO_USE: usize = 2; + let url = Url::parse(url).ok()?; let domain = match url.host() { Some(Host::Domain(d)) => d, @@ -20,7 +19,7 @@ pub fn url_hash_source(url: &str) -> Option<String> { }; // This will store indexes of `.` chars as we search backwards. let mut pos = domain.len(); - for _ in 0..3 { + for _ in 0..URL_COMPONENTS_TO_USE { match domain[0..pos].rfind('.') { Some(p) => pos = p, // The domain has less than 3 dots, return it all @@ -47,12 +46,12 @@ mod test { fn test_url_hash_source() { let table = [ ("http://example.com/some-path", Some("example.com")), - ("http://foo.example.com/some-path", Some("foo.example.com")), + ("http://foo.example.com/some-path", Some("example.com")), ( "http://foo.bar.baz.example.com/some-path", - Some("baz.example.com"), + Some("example.com"), ), - ("http://foo.com.uk/some-path", Some("foo.com.uk")), + ("http://foo.com.uk/some-path", Some("com.uk")), ("http://amazon.com/some-path", Some("amazon.com")), ("http://192.168.0.1/some-path", None), ]; diff --git a/third_party/rust/relevancy/test-data b/third_party/rust/relevancy/test-data Binary files differindex c645914143..46fd850189 100644 --- a/third_party/rust/relevancy/test-data +++ b/third_party/rust/relevancy/test-data |