diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:34:50 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-15 03:34:50 +0000 |
commit | def92d1b8e9d373e2f6f27c366d578d97d8960c6 (patch) | |
tree | 2ef34b9ad8bb9a9220e05d60352558b15f513894 /third_party/rust/relevancy/src | |
parent | Adding debian version 125.0.3-1. (diff) | |
download | firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.tar.xz firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.zip |
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/relevancy/src')
-rw-r--r-- | third_party/rust/relevancy/src/bin/generate-test-data.rs | 43 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/db.rs | 118 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/error.rs | 44 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/interest.rs | 167 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/lib.rs | 81 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/populate_interests.rs | 157 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/relevancy.udl | 106 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/schema.rs | 53 | ||||
-rw-r--r-- | third_party/rust/relevancy/src/url_hash.rs | 63 |
9 files changed, 832 insertions, 0 deletions
diff --git a/third_party/rust/relevancy/src/bin/generate-test-data.rs b/third_party/rust/relevancy/src/bin/generate-test-data.rs new file mode 100644 index 0000000000..04c5827275 --- /dev/null +++ b/third_party/rust/relevancy/src/bin/generate-test-data.rs @@ -0,0 +1,43 @@ +use relevancy::{ + url_hash::{hash_url, UrlHash}, + Interest, +}; +use std::{collections::HashMap, fs::File, io::Write}; + +// Generate a set of test data and output it to the `test-data` file. +// +// This is meant to be a placeholder until we can get this data stored in remote settings. + +const TEST_INTEREST_DATA: &[(&str, Interest)] = &[ + ("https://espn.com/", Interest::Sports), + ("https://dogs.com/", Interest::Animals), + ("https://cars.com/", Interest::Autos), + ("https://www.vouge.com/", Interest::Fashion), + ("https://slashdot.org/", Interest::Tech), + ("https://www.nascar.com/", Interest::Autos), + ("https://www.nascar.com/", Interest::Sports), +]; + +fn main() { + let mut interest_map: HashMap<Interest, Vec<UrlHash>> = + HashMap::from_iter(Interest::all().into_iter().map(|i| (i, vec![]))); + for (url, interest) in TEST_INTEREST_DATA { + if let Some(hash) = hash_url(url) { + interest_map.get_mut(interest).unwrap().push(hash) + } + } + + let mut f = File::create("test-data").expect("Error opening file"); + // Loop over all possible interests, in the order returned by `Interest::all()` + for interest in Interest::all() { + // Get the list of URL hashes for that interest + let hashes = interest_map.get(&interest).unwrap(); + // Write the count as a little-endian u32 + f.write_all(&(hashes.len() as u32).to_le_bytes()) + .expect("Error writing file"); + // Write the raw bytes of each hash, back to back + for hash in hashes { + f.write_all(hash).expect("Error writing file"); + } + } +} diff --git a/third_party/rust/relevancy/src/db.rs b/third_party/rust/relevancy/src/db.rs new file mode 100644 index 0000000000..08684c45af --- /dev/null +++ b/third_party/rust/relevancy/src/db.rs @@ -0,0 +1,118 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +use crate::{ + schema::RelevancyConnectionInitializer, + url_hash::{hash_url, UrlHash}, + Interest, InterestVector, Result, +}; +use parking_lot::Mutex; +use rusqlite::{Connection, OpenFlags}; +use sql_support::{open_database::open_database_with_flags, ConnExt}; +use std::path::Path; + +/// A thread-safe wrapper around an SQLite connection to the Relevancy database +pub struct RelevancyDb { + pub conn: Mutex<Connection>, +} + +impl RelevancyDb { + pub fn open(path: impl AsRef<Path>) -> Result<Self> { + let conn = open_database_with_flags( + path, + OpenFlags::SQLITE_OPEN_URI + | OpenFlags::SQLITE_OPEN_NO_MUTEX + | OpenFlags::SQLITE_OPEN_CREATE + | OpenFlags::SQLITE_OPEN_READ_WRITE, + &RelevancyConnectionInitializer, + )?; + Ok(Self { + conn: Mutex::new(conn), + }) + } + + #[cfg(test)] + pub fn open_for_test() -> Self { + use std::sync::atomic::{AtomicU32, Ordering}; + static COUNTER: AtomicU32 = AtomicU32::new(0); + let count = COUNTER.fetch_add(1, Ordering::Relaxed); + Self::open(format!("file:test{count}.sqlite?mode=memory&cache=shared")).unwrap() + } + + /// Accesses the Relevancy database in a transaction for reading. + pub fn read<T>(&self, op: impl FnOnce(&RelevancyDao) -> Result<T>) -> Result<T> { + let mut conn = self.conn.lock(); + let tx = conn.transaction()?; + let dao = RelevancyDao::new(&tx); + op(&dao) + } + + /// Accesses the Relevancy database in a transaction for reading and writing. 
+ pub fn read_write<T>(&self, op: impl FnOnce(&mut RelevancyDao) -> Result<T>) -> Result<T> { + let mut conn = self.conn.lock(); + let tx = conn.transaction()?; + let mut dao = RelevancyDao::new(&tx); + let result = op(&mut dao)?; + tx.commit()?; + Ok(result) + } +} + +/// A data access object (DAO) that wraps a connection to the Relevancy database +/// +/// Methods that only read from the database take an immutable reference to +/// `self` (`&self`), and methods that write to the database take a mutable +/// reference (`&mut self`). +pub struct RelevancyDao<'a> { + pub conn: &'a Connection, +} + +impl<'a> RelevancyDao<'a> { + fn new(conn: &'a Connection) -> Self { + Self { conn } + } + + /// Associate a URL hash with an interest (`INSERT OR REPLACE`, so re-adding a pair is harmless) + pub fn add_url_interest(&mut self, url_hash: UrlHash, interest: Interest) -> Result<()> { + let sql = " + INSERT OR REPLACE INTO url_interest(url_hash, interest_code) + VALUES (?, ?) + "; + self.conn.execute(sql, (url_hash, interest as u32))?; + Ok(()) + } + + /// Get an interest vector for a URL (the all-zero default vector if the URL cannot be hashed) + pub fn get_url_interest_vector(&self, url: &str) -> Result<InterestVector> { + let hash = match hash_url(url) { + Some(u) => u, + None => return Ok(InterestVector::default()), + }; + let mut stmt = self.conn.prepare_cached( + " + SELECT interest_code + FROM url_interest + WHERE url_hash=? + ", + )?; + let interests = stmt.query_and_then((hash,), |row| -> Result<Interest> { + Ok(row.get::<_, u32>(0)?.into()) + })?; + + let mut interest_vec = InterestVector::default(); + for interest in interests { + interest_vec[interest?] += 1 + } + Ok(interest_vec) + } + + /// Do we need to load the interest data? True when the `url_interest` table has no rows. + pub fn need_to_load_url_interests(&self) -> Result<bool> { + // TODO: we probably will need a better check than this. + Ok(self + .conn + .query_one("SELECT NOT EXISTS (SELECT 1 FROM url_interest)")?) 
+ } +} diff --git a/third_party/rust/relevancy/src/error.rs b/third_party/rust/relevancy/src/error.rs new file mode 100644 index 0000000000..93ca7aabaa --- /dev/null +++ b/third_party/rust/relevancy/src/error.rs @@ -0,0 +1,44 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public +* License, v. 2.0. If a copy of the MPL was not distributed with this +* file, You can obtain one at http://mozilla.org/MPL/2.0/. +*/ + +use error_support::{ErrorHandling, GetErrorHandling}; + +/// Errors we return via the public interface. +#[derive(Debug, thiserror::Error)] +pub enum RelevancyApiError { + #[error("Unexpected Error: {reason}")] + Unexpected { reason: String }, +} + +/// Errors we use internally +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("Error opening database: {0}")] + OpenDatabase(#[from] sql_support::open_database::Error), + + #[error("Sql error: {0}")] + SqlError(#[from] rusqlite::Error), + + #[error("Error fetching interest data")] + FetchInterestDataError, +} + +/// Result enum for the public API +pub type ApiResult<T> = std::result::Result<T, RelevancyApiError>; + +/// Result enum for internal functions +pub type Result<T> = std::result::Result<T, Error>; + +// Define how our internal errors are handled and converted to external errors +// See `support/error/README.md` for how this works, especially the warning about PII. +impl GetErrorHandling for Error { + type ExternalError = RelevancyApiError; + + fn get_error_handling(&self) -> ErrorHandling<Self::ExternalError> { + ErrorHandling::convert(RelevancyApiError::Unexpected { + reason: self.to_string(), + }) + } +} diff --git a/third_party/rust/relevancy/src/interest.rs b/third_party/rust/relevancy/src/interest.rs new file mode 100644 index 0000000000..0573c743fc --- /dev/null +++ b/third_party/rust/relevancy/src/interest.rs @@ -0,0 +1,167 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/// List of possible interests for a domain. Domains can be associated with one or multiple +/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify +/// because there's no corresponding entry in the interest database. +#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)] +#[repr(u32)] +pub enum Interest { + Animals, + Arts, + Autos, + Business, + Career, + Education, + Fashion, + Finance, + Food, + Government, + Health, + Hobbies, + Home, + News, + RealEstate, + Society, + Sports, + Tech, + Travel, + Inconclusive, +} + +impl From<Interest> for u32 { + fn from(interest: Interest) -> Self { + interest as u32 + } +} + +impl From<Interest> for usize { + fn from(interest: Interest) -> Self { + interest as usize + } +} + +impl From<u32> for Interest { + fn from(code: u32) -> Self { + if code as usize >= Self::COUNT { + panic!("Invalid interest code: {code}") + } + // SAFETY: `Interest` is `#[repr(u32)]` with discriminants `0..COUNT`, and the check above + // rejects every other code (`>=`, because `COUNT` itself is not a valid discriminant). + unsafe { std::mem::transmute(code) } + } +} + +impl Interest { + const COUNT: usize = 20; + + pub fn all() -> [Interest; Self::COUNT] { + [ + Self::Animals, + Self::Arts, + Self::Autos, + Self::Business, + Self::Career, + Self::Education, + Self::Fashion, + Self::Finance, + Self::Food, + Self::Government, + Self::Health, + Self::Hobbies, + Self::Home, + Self::News, + Self::RealEstate, + Self::Society, + Self::Sports, + Self::Tech, + Self::Travel, + Self::Inconclusive, + ] + } +} + +/// Vector storing a count value for each interest +/// +/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed +/// number of elements. 
+#[derive(Debug, Default, PartialEq, Eq)] +pub struct InterestVector { + pub animals: u32, + pub arts: u32, + pub autos: u32, + pub business: u32, + pub career: u32, + pub education: u32, + pub fashion: u32, + pub finance: u32, + pub food: u32, + pub government: u32, + pub health: u32, + pub hobbies: u32, + pub home: u32, + pub news: u32, + pub real_estate: u32, + pub society: u32, + pub sports: u32, + pub tech: u32, + pub travel: u32, + pub inconclusive: u32, +} + +impl std::ops::Index<Interest> for InterestVector { + type Output = u32; + + fn index(&self, index: Interest) -> &u32 { + match index { + Interest::Animals => &self.animals, + Interest::Arts => &self.arts, + Interest::Autos => &self.autos, + Interest::Business => &self.business, + Interest::Career => &self.career, + Interest::Education => &self.education, + Interest::Fashion => &self.fashion, + Interest::Finance => &self.finance, + Interest::Food => &self.food, + Interest::Government => &self.government, + Interest::Health => &self.health, + Interest::Hobbies => &self.hobbies, + Interest::Home => &self.home, + Interest::News => &self.news, + Interest::RealEstate => &self.real_estate, + Interest::Society => &self.society, + Interest::Sports => &self.sports, + Interest::Tech => &self.tech, + Interest::Travel => &self.travel, + Interest::Inconclusive => &self.inconclusive, + } + } +} + +impl std::ops::IndexMut<Interest> for InterestVector { + fn index_mut(&mut self, index: Interest) -> &mut u32 { + match index { + Interest::Animals => &mut self.animals, + Interest::Arts => &mut self.arts, + Interest::Autos => &mut self.autos, + Interest::Business => &mut self.business, + Interest::Career => &mut self.career, + Interest::Education => &mut self.education, + Interest::Fashion => &mut self.fashion, + Interest::Finance => &mut self.finance, + Interest::Food => &mut self.food, + Interest::Government => &mut self.government, + Interest::Health => &mut self.health, + Interest::Hobbies => &mut self.hobbies, + 
Interest::Home => &mut self.home, + Interest::News => &mut self.news, + Interest::RealEstate => &mut self.real_estate, + Interest::Society => &mut self.society, + Interest::Sports => &mut self.sports, + Interest::Tech => &mut self.tech, + Interest::Travel => &mut self.travel, + Interest::Inconclusive => &mut self.inconclusive, + } + } +} diff --git a/third_party/rust/relevancy/src/lib.rs b/third_party/rust/relevancy/src/lib.rs new file mode 100644 index 0000000000..157a26277e --- /dev/null +++ b/third_party/rust/relevancy/src/lib.rs @@ -0,0 +1,81 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +//! Proposed API for the relevancy component (validation phase) +//! +//! The goal here is to allow us to validate that we can reliably detect user interests from +//! history data, without spending too much time building the API out. There's some hand-waving +//! towards how we would use this data to rank search results, but we don't need to come to a final +//! decision on that yet. + +mod db; +mod error; +mod interest; +mod populate_interests; +mod schema; +pub mod url_hash; + +pub use db::RelevancyDb; +pub use error::{ApiResult, Error, RelevancyApiError, Result}; +pub use interest::{Interest, InterestVector}; + +use error_support::handle_error; + +pub struct RelevancyStore { + db: RelevancyDb, +} + +/// Top-level API for the Relevancy component +impl RelevancyStore { + #[handle_error(Error)] + pub fn new(db_path: String) -> ApiResult<Self> { + Ok(Self { + db: RelevancyDb::open(db_path)?, + }) + } + + /// Ingest top URLs to build the user's interest vector. + /// + /// Consumer should pass a list of the user's top URLs by frecency to this method. It will + /// then: + /// + /// - Download the URL interest data from remote settings. 
Eventually this should be cached / + /// stored in the database, but for now it would be fine to download fresh data each time. + /// - Match the user's top URLs against the interest data to build up their interest vector. + /// - Store the user's interest vector in the database. + /// + /// This method may execute for a long time and should only be called from a worker thread. + #[handle_error(Error)] + pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> { + populate_interests::ensure_interest_data_populated(&self.db)?; + todo!() + } + + /// Calculate metrics for the validation phase + /// + /// This runs after [Self::ingest]. It takes the interest vector that ingest created and + /// calculates a set of metrics that we can report to glean. + #[handle_error(Error)] + pub fn calculate_metrics(&self) -> ApiResult<InterestMetrics> { + todo!() + } + + /// Get the user's interest vector directly. + /// + /// This runs after [Self::ingest]. It returns the interest vector directly so that the + /// consumer can show it in an `about:` page. + #[handle_error(Error)] + pub fn user_interest_vector(&self) -> ApiResult<InterestVector> { + todo!() + } +} + +/// Interest metric data. See `relevancy.udl` for details. +pub struct InterestMetrics { + pub top_single_interest_similarity: u32, + pub top_2interest_similarity: u32, + pub top_3interest_similarity: u32, +} + +uniffi::include_scaffolding!("relevancy"); diff --git a/third_party/rust/relevancy/src/populate_interests.rs b/third_party/rust/relevancy/src/populate_interests.rs new file mode 100644 index 0000000000..e33b677dd6 --- /dev/null +++ b/third_party/rust/relevancy/src/populate_interests.rs @@ -0,0 +1,157 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result}; +use std::io::{Cursor, Read}; + +pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> { + if !db.read(|dao| dao.need_to_load_url_interests())? { + return Ok(()); + } + let interest_data = match fetch_interest_data() { + Ok(data) => data, + Err(e) => { + log::warn!("error fetching interest data: {e}"); + return Err(Error::FetchInterestDataError); + } + }; + db.read_write(move |dao| { + for (url_hash, interest) in interest_data { + dao.add_url_interest(url_hash, interest)?; + } + Ok(()) + }) +} + +/// Fetch the interest data as `(url hash, interest)` pairs +fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> { + // TODO: this hack should be replaced with something that fetches from remote settings + let bytes = include_bytes!("../test-data"); + let mut reader = Cursor::new(&bytes); + let mut data = vec![]; + + // Loop over all possible interests, in the order returned by `Interest::all()` + for interest in Interest::all() { + // Read the little-endian u32 count of hashes stored for this interest + let mut buf = [0u8; 4]; + reader.read_exact(&mut buf)?; + let count = u32::from_le_bytes(buf); + for _ in 0..count { + let mut url_hash: UrlHash = [0u8; 16]; + reader.read_exact(&mut url_hash)?; + data.push((url_hash, interest)); + } + } + Ok(data) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::InterestVector; + + #[test] + fn test_interest_vectors() { + let db = RelevancyDb::open_for_test(); + ensure_interest_data_populated(&db).unwrap(); + db.read(|dao| { + // Test that the interest data matches the values we started from in + // `bin/generate-test-data.rs` + assert_eq!( + dao.get_url_interest_vector("https://espn.com/").unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://dogs.com/").unwrap(), + InterestVector { + animals: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://cars.com/").unwrap(), + InterestVector { + autos: 1, 
..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://www.vouge.com/") + .unwrap(), + InterestVector { + fashion: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://slashdot.org/") + .unwrap(), + InterestVector { + tech: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://www.nascar.com/") + .unwrap(), + InterestVector { + autos: 1, + sports: 1, + ..InterestVector::default() + } + ); + assert_eq!( + dao.get_url_interest_vector("https://unknown.url/").unwrap(), + InterestVector::default() + ); + Ok(()) + }) + .unwrap(); + } + + #[test] + fn test_variations_on_the_url() { + let db = RelevancyDb::open_for_test(); + ensure_interest_data_populated(&db).unwrap(); + db.read(|dao| { + // Different paths/queries should work + assert_eq!( + dao.get_url_interest_vector("https://espn.com/foo/bar/?baz") + .unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + // Different schemes should too + assert_eq!( + dao.get_url_interest_vector("http://espn.com/").unwrap(), + InterestVector { + sports: 1, + ..InterestVector::default() + } + ); + // But changes to the domain shouldn't + assert_eq!( + dao.get_url_interest_vector("http://www.espn.com/").unwrap(), + InterestVector::default() + ); + // However, extra components past the 3rd one in the domain are ignored + assert_eq!( + dao.get_url_interest_vector("https://foo.www.nascar.com/") + .unwrap(), + InterestVector { + autos: 1, + sports: 1, + ..InterestVector::default() + } + ); + Ok(()) + }) + .unwrap(); + } +} diff --git a/third_party/rust/relevancy/src/relevancy.udl b/third_party/rust/relevancy/src/relevancy.udl new file mode 100644 index 0000000000..e07243ec28 --- /dev/null +++ b/third_party/rust/relevancy/src/relevancy.udl @@ -0,0 +1,106 @@ +namespace relevancy { }; + +[Error] +interface RelevancyApiError { + Unexpected(string reason); +}; + +// Top-level class for the Relevancy 
component +interface RelevancyStore { + // Construct a new RelevancyStore + [Throws=RelevancyApiError] + constructor(string dbpath); + + // Ingest the top URLs by frecency to build up the user's interest vector + [Throws=RelevancyApiError] + void ingest(sequence<string> top_urls); + + // Calculate metrics for the user's interest vector in order to measure how strongly we're + // identifying interests. See the `InterestMetrics` struct for details. + [Throws=RelevancyApiError] + InterestMetrics calculate_metrics(); + + // Get the interest vector for the user. + // + // This is intended to be shown to the user in an `about:` page so that users can judge if it + // feels correct. + [Throws=RelevancyApiError] + InterestVector user_interest_vector(); +}; + +enum Interest { + "Animals", + "Arts", + "Autos", + "Business", + "Career", + "Education", + "Fashion", + "Finance", + "Food", + "Government", + "Health", + "Hobbies", + "Home", + "News", + "RealEstate", + "Society", + "Sports", + "Tech", + "Travel", + "Inconclusive", +}; + +// Interest metrics that we want to send to Glean as part of the validation process. These contain +// the cosine similarity when comparing the user's interest against various interest vectors that +// consumers may use. +// +// Cosine similarity was chosen because it seems easy to calculate. This was then matched against +// some semi-plausible real-world interest vectors that consumers might use. This is all up for +// debate and we may decide to switch to some other metrics. +// +// Similarity values are transformed to integers by multiplying the floating point value by 1000 and +// rounding. This is to make them compatible with Glean's distribution metrics. +dictionary InterestMetrics { + // Similarity between the user's interest vector and an interest vector where the element for + // the user's top interest is copied, but all other interests are set to zero. 
This measures + // the highest possible similarity with consumers that used interest vectors with a single + // interest set. + u32 top_single_interest_similarity; + + // The same as before, but the top 2 interests are copied. This measures the highest possible + // similarity with consumers that used interest vectors with two interests (note: this means + // they would need to choose the user's top two interests and have the exact same proportion + // between them as the user). + u32 top_2interest_similarity; + + // The same as before, but the top 3 interests are copied. + u32 top_3interest_similarity; +}; + +// Vector storing a count value for each interest +// +// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed +// number of elements. +dictionary InterestVector { + u32 animals; + u32 arts; + u32 autos; + u32 business; + u32 career; + u32 education; + u32 fashion; + u32 finance; + u32 food; + u32 government; + u32 health; + u32 hobbies; + u32 home; + u32 news; + u32 real_estate; + u32 society; + u32 sports; + u32 tech; + u32 travel; + u32 inconclusive; +}; diff --git a/third_party/rust/relevancy/src/schema.rs b/third_party/rust/relevancy/src/schema.rs new file mode 100644 index 0000000000..bcb2f260d9 --- /dev/null +++ b/third_party/rust/relevancy/src/schema.rs @@ -0,0 +1,53 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + */ + +use rusqlite::{Connection, Transaction}; +use sql_support::open_database::{self, ConnectionInitializer}; + +/// The current database schema version. +/// +/// For any changes to the schema [`SQL`], please make sure to: +/// +/// 1. Bump this version. +/// 2. Add a migration from the old version to the new version in +/// [`RelevancyConnectionInitializer::upgrade_from`]. +pub const VERSION: u32 = 13; + +/// The current database schema. 
+pub const SQL: &str = " + CREATE TABLE url_interest( + url_hash BLOB NOT NULL, + interest_code INTEGER NOT NULL, + PRIMARY KEY (url_hash, interest_code) + ) WITHOUT ROWID; +"; + +/// Initializes an SQLite connection to the Relevancy database, performing +/// migrations as needed. +pub struct RelevancyConnectionInitializer; + +impl ConnectionInitializer for RelevancyConnectionInitializer { + const NAME: &'static str = "relevancy db"; + const END_VERSION: u32 = VERSION; + + fn prepare(&self, conn: &Connection, _db_empty: bool) -> open_database::Result<()> { + let initial_pragmas = " + -- Use in-memory storage for TEMP tables. + PRAGMA temp_store = 2; + PRAGMA journal_mode = WAL; + PRAGMA foreign_keys = ON; + "; + conn.execute_batch(initial_pragmas)?; + Ok(()) + } + + fn init(&self, db: &Transaction<'_>) -> open_database::Result<()> { + Ok(db.execute_batch(SQL)?) + } + + fn upgrade_from(&self, _db: &Transaction<'_>, version: u32) -> open_database::Result<()> { + Err(open_database::Error::IncompatibleVersion(version)) + } +} diff --git a/third_party/rust/relevancy/src/url_hash.rs b/third_party/rust/relevancy/src/url_hash.rs new file mode 100644 index 0000000000..d31a45d06b --- /dev/null +++ b/third_party/rust/relevancy/src/url_hash.rs @@ -0,0 +1,63 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +use md5::{Digest, Md5}; +use url::{Host, Url}; + +pub type UrlHash = [u8; 16]; + +/// Given a URL, extract the part of it that we want to use to identify it. +/// +/// We currently use the final 3 components of the URL domain. +/// +/// TODO: decide whether 3 is the right number of components. 
+pub fn url_hash_source(url: &str) -> Option<String> { + let url = Url::parse(url).ok()?; + let domain = match url.host() { + Some(Host::Domain(d)) => d, + _ => return None, + }; + // This will store indexes of `.` chars as we search backwards. + let mut pos = domain.len(); + for _ in 0..3 { + match domain[0..pos].rfind('.') { + Some(p) => pos = p, + // The domain has fewer than 3 dots, return it all + None => return Some(domain.to_owned()), + } + } + Some(domain[pos + 1..].to_owned()) +} + +pub fn hash_url(url: &str) -> Option<UrlHash> { + url_hash_source(url).map(|hash_source| { + let mut hasher = Md5::new(); + hasher.update(hash_source); + let result = hasher.finalize(); + result.into() + }) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_url_hash_source() { + let table = [ + ("http://example.com/some-path", Some("example.com")), + ("http://foo.example.com/some-path", Some("foo.example.com")), + ( + "http://foo.bar.baz.example.com/some-path", + Some("baz.example.com"), + ), + ("http://foo.com.uk/some-path", Some("foo.com.uk")), + ("http://amazon.com/some-path", Some("amazon.com")), + ("http://192.168.0.1/some-path", None), + ]; + for (url, expected) in table { + assert_eq!(url_hash_source(url).as_deref(), expected) + } + } +} |