summaryrefslogtreecommitdiffstats
path: root/third_party/rust/relevancy/src
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-15 03:34:50 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-15 03:34:50 +0000
commitdef92d1b8e9d373e2f6f27c366d578d97d8960c6 (patch)
tree2ef34b9ad8bb9a9220e05d60352558b15f513894 /third_party/rust/relevancy/src
parentAdding debian version 125.0.3-1. (diff)
downloadfirefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.tar.xz
firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.zip
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/relevancy/src')
-rw-r--r--third_party/rust/relevancy/src/bin/generate-test-data.rs43
-rw-r--r--third_party/rust/relevancy/src/db.rs118
-rw-r--r--third_party/rust/relevancy/src/error.rs44
-rw-r--r--third_party/rust/relevancy/src/interest.rs167
-rw-r--r--third_party/rust/relevancy/src/lib.rs81
-rw-r--r--third_party/rust/relevancy/src/populate_interests.rs157
-rw-r--r--third_party/rust/relevancy/src/relevancy.udl106
-rw-r--r--third_party/rust/relevancy/src/schema.rs53
-rw-r--r--third_party/rust/relevancy/src/url_hash.rs63
9 files changed, 832 insertions, 0 deletions
diff --git a/third_party/rust/relevancy/src/bin/generate-test-data.rs b/third_party/rust/relevancy/src/bin/generate-test-data.rs
new file mode 100644
index 0000000000..04c5827275
--- /dev/null
+++ b/third_party/rust/relevancy/src/bin/generate-test-data.rs
@@ -0,0 +1,43 @@
+use relevancy::{
+ url_hash::{hash_url, UrlHash},
+ Interest,
+};
+use std::{collections::HashMap, fs::File, io::Write};
+
+// Generate a set of test data and output it to the `test-data` file.
+//
+// This is meant to be a placeholder until we can get this data stored in remote settings.
+
+const TEST_INTEREST_DATA: &[(&str, Interest)] = &[ // (URL, interest) pairs that main() hashes and writes to `test-data`
+ ("https://espn.com/", Interest::Sports),
+ ("https://dogs.com/", Interest::Animals),
+ ("https://cars.com/", Interest::Autos),
+ ("https://www.vouge.com/", Interest::Fashion), // NOTE(review): "vouge" looks like a misspelling of "vogue"; the tests in populate_interests.rs query the same spelling, so fixing it means regenerating `test-data`
+ ("https://slashdot.org/", Interest::Tech),
+ ("https://www.nascar.com/", Interest::Autos), // same URL appears twice: one URL can carry several interests
+ ("https://www.nascar.com/", Interest::Sports),
+];
+
+/// Hash every URL in `TEST_INTEREST_DATA` and write the result to the `test-data` file.
+///
+/// File layout: for each interest, in `Interest::all()` order, a little-endian `u32` count
+/// followed by that many 16-byte URL hashes.
+fn main() {
+    // Start with an empty bucket per interest so that every interest gets a count in the output,
+    // even when no URL maps to it.
+    let mut buckets: HashMap<Interest, Vec<UrlHash>> = Interest::all()
+        .into_iter()
+        .map(|interest| (interest, Vec::new()))
+        .collect();
+    for (url, interest) in TEST_INTEREST_DATA {
+        // URLs that cannot be hashed (no domain host) are silently left out.
+        if let Some(hash) = hash_url(url) {
+            buckets.get_mut(interest).unwrap().push(hash)
+        }
+    }
+
+    let mut out = File::create("test-data").expect("Error opening file");
+    for interest in Interest::all() {
+        let bucket = buckets.get(&interest).unwrap();
+        // Count first, then the hashes themselves.
+        let count = bucket.len() as u32;
+        out.write_all(&count.to_le_bytes())
+            .expect("Error writing file");
+        bucket
+            .iter()
+            .for_each(|hash| out.write_all(hash).expect("Error writing file"));
+    }
+}
diff --git a/third_party/rust/relevancy/src/db.rs b/third_party/rust/relevancy/src/db.rs
new file mode 100644
index 0000000000..08684c45af
--- /dev/null
+++ b/third_party/rust/relevancy/src/db.rs
@@ -0,0 +1,118 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+use crate::{
+ schema::RelevancyConnectionInitializer,
+ url_hash::{hash_url, UrlHash},
+ Interest, InterestVector, Result,
+};
+use parking_lot::Mutex;
+use rusqlite::{Connection, OpenFlags};
+use sql_support::{open_database::open_database_with_flags, ConnExt};
+use std::path::Path;
+
+/// Thread-safe handle to the Relevancy SQLite database.
+///
+/// The single [`Connection`] lives behind a mutex; [`RelevancyDb::read`] and
+/// [`RelevancyDb::read_write`] lock it for the duration of one transaction.
+pub struct RelevancyDb {
+    pub conn: Mutex<Connection>,
+}
+
+impl RelevancyDb {
+    /// Open the Relevancy database at `path`, creating it if it does not exist.
+    ///
+    /// `path` may be an SQLite URI. Schema setup and migrations are delegated to
+    /// [`RelevancyConnectionInitializer`].
+    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
+        let flags = OpenFlags::SQLITE_OPEN_URI
+            | OpenFlags::SQLITE_OPEN_NO_MUTEX
+            | OpenFlags::SQLITE_OPEN_CREATE
+            | OpenFlags::SQLITE_OPEN_READ_WRITE;
+        let connection = open_database_with_flags(path, flags, &RelevancyConnectionInitializer)?;
+        Ok(Self {
+            conn: Mutex::new(connection),
+        })
+    }
+
+    /// Open a fresh shared-cache in-memory database; each call uses a new file name so tests
+    /// do not see each other's data.
+    #[cfg(test)]
+    pub fn open_for_test() -> Self {
+        use std::sync::atomic::{AtomicU32, Ordering};
+        static COUNTER: AtomicU32 = AtomicU32::new(0);
+        let count = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let uri = format!("file:test{count}.sqlite?mode=memory&cache=shared");
+        Self::open(uri).unwrap()
+    }
+
+    /// Accesses the Relevancy database in a transaction for reading.
+    ///
+    /// The transaction is dropped without an explicit commit once `op` returns.
+    pub fn read<T>(&self, op: impl FnOnce(&RelevancyDao) -> Result<T>) -> Result<T> {
+        let mut guard = self.conn.lock();
+        let transaction = guard.transaction()?;
+        let dao = RelevancyDao::new(&transaction);
+        op(&dao)
+    }
+
+    /// Accesses the Relevancy database in a transaction for reading and writing.
+    ///
+    /// The transaction is committed only when `op` succeeds; if `op` fails, its error is
+    /// returned and no commit happens.
+    pub fn read_write<T>(&self, op: impl FnOnce(&mut RelevancyDao) -> Result<T>) -> Result<T> {
+        let mut guard = self.conn.lock();
+        let transaction = guard.transaction()?;
+        let mut dao = RelevancyDao::new(&transaction);
+        let outcome = op(&mut dao)?;
+        transaction.commit()?;
+        Ok(outcome)
+    }
+}
+
+/// Data access object (DAO) bound to one connection to the Relevancy database.
+///
+/// Read-only methods borrow the DAO immutably (`&self`); methods that write to the database
+/// require a mutable borrow (`&mut self`).
+pub struct RelevancyDao<'a> {
+    pub conn: &'a Connection,
+}
+
+impl<'a> RelevancyDao<'a> {
+    fn new(conn: &'a Connection) -> Self {
+        Self { conn }
+    }
+
+    /// Associate a URL with an interest
+    ///
+    /// The row is keyed by the URL hash plus the interest code, so one URL hash can be
+    /// associated with several interests.
+    pub fn add_url_interest(&mut self, url_hash: UrlHash, interest: Interest) -> Result<()> {
+        let insert_sql = "
+ INSERT OR REPLACE INTO url_interest(url_hash, interest_code)
+ VALUES (?, ?)
+ ";
+        let params = (url_hash, interest as u32);
+        self.conn.execute(insert_sql, params)?;
+        Ok(())
+    }
+
+    /// Get an interest vector for a URL
+    ///
+    /// Each interest stored for the URL's hash adds one to the matching element. A URL that
+    /// cannot be hashed, or that has no rows, yields the all-zero vector.
+    pub fn get_url_interest_vector(&self, url: &str) -> Result<InterestVector> {
+        let mut interest_vec = InterestVector::default();
+        if let Some(hash) = hash_url(url) {
+            let mut stmt = self.conn.prepare_cached(
+                "
+ SELECT interest_code
+ FROM url_interest
+ WHERE url_hash=?
+ ",
+            )?;
+            let rows = stmt.query_and_then((hash,), |row| -> Result<Interest> {
+                let code: u32 = row.get(0)?;
+                Ok(Interest::from(code))
+            })?;
+            for row in rows {
+                interest_vec[row?] += 1
+            }
+        }
+        Ok(interest_vec)
+    }
+
+    /// Do we need to load the interest data?
+    ///
+    /// True exactly when the `url_interest` table has no rows.
+    pub fn need_to_load_url_interests(&self) -> Result<bool> {
+        // TODO: we probably will need a better check than this.
+        let table_is_empty: bool = self
+            .conn
+            .query_one("SELECT NOT EXISTS (SELECT 1 FROM url_interest)")?;
+        Ok(table_is_empty)
+    }
+}
diff --git a/third_party/rust/relevancy/src/error.rs b/third_party/rust/relevancy/src/error.rs
new file mode 100644
index 0000000000..93ca7aabaa
--- /dev/null
+++ b/third_party/rust/relevancy/src/error.rs
@@ -0,0 +1,44 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+* License, v. 2.0. If a copy of the MPL was not distributed with this
+* file, You can obtain one at http://mozilla.org/MPL/2.0/.
+*/
+
+use error_support::{ErrorHandling, GetErrorHandling};
+
+/// Errors we return via the public interface (declared to consumers as the `RelevancyApiError` error in `relevancy.udl`).
+#[derive(Debug, thiserror::Error)]
+pub enum RelevancyApiError {
+ #[error("Unexpected Error: {reason}")]
+ Unexpected { reason: String }, // `reason` is filled with the Display text of the internal `Error`
+}
+
+/// Errors we use internally; they reach consumers only after conversion through the `GetErrorHandling` impl in this file.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+ #[error("Error opening database: {0}")]
+ OpenDatabase(#[from] sql_support::open_database::Error),
+
+ #[error("Sql error: {0}")]
+ SqlError(#[from] rusqlite::Error),
+
+ #[error("Error fetching interest data")]
+ FetchInterestDataError, // returned by `ensure_interest_data_populated` when the interest data cannot be read
+}
+
+/// Result type alias for the public API (error type: `RelevancyApiError`)
+pub type ApiResult<T> = std::result::Result<T, RelevancyApiError>;
+
+/// Result type alias for internal functions (error type: `Error`)
+pub type Result<T> = std::result::Result<T, Error>;
+
+// Define how our internal errors are handled and converted to external errors
+// See `support/error/README.md` for how this works, especially the warning about PII.
+impl GetErrorHandling for Error {
+ type ExternalError = RelevancyApiError;
+
+ fn get_error_handling(&self) -> ErrorHandling<Self::ExternalError> {
+ ErrorHandling::convert(RelevancyApiError::Unexpected {
+ reason: self.to_string(),
+ })
+ }
+}
diff --git a/third_party/rust/relevancy/src/interest.rs b/third_party/rust/relevancy/src/interest.rs
new file mode 100644
index 0000000000..0573c743fc
--- /dev/null
+++ b/third_party/rust/relevancy/src/interest.rs
@@ -0,0 +1,167 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/// List of possible interests for a domain. Domains can be associated with one or multiple
+/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify
+/// because there's no corresponding entry in the interest database. The `u32` discriminants (declaration order, starting at 0) are what the database stores as `interest_code`.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(u32)]
+pub enum Interest {
+ Animals,
+ Arts,
+ Autos,
+ Business,
+ Career,
+ Education,
+ Fashion,
+ Finance,
+ Food,
+ Government,
+ Health,
+ Hobbies,
+ Home,
+ News,
+ RealEstate,
+ Society,
+ Sports,
+ Tech,
+ Travel,
+ Inconclusive,
+}
+
+/// Numeric code of an interest: its `#[repr(u32)]` discriminant.
+impl From<Interest> for u32 {
+    fn from(value: Interest) -> u32 {
+        value as u32
+    }
+}
+
+/// Discriminant of an interest widened to `usize`, e.g. for use as an array index.
+impl From<Interest> for usize {
+    fn from(value: Interest) -> usize {
+        value as usize
+    }
+}
+
+impl From<u32> for Interest {
+    /// Convert a numeric interest code back into an [`Interest`].
+    ///
+    /// # Panics
+    ///
+    /// Panics if `code` is not the discriminant of an `Interest` variant, i.e. if
+    /// `code >= Interest::COUNT`.
+    fn from(code: u32) -> Self {
+        // Valid discriminants are `0..COUNT`, so `COUNT` itself has to be rejected as well;
+        // letting it through would hand `transmute` a value that is not a variant.
+        if code as usize >= Self::COUNT {
+            panic!("Invalid interest code: {code}")
+        }
+        // SAFETY: `Interest` is `#[repr(u32)]` with implicit discriminants `0..COUNT`, and the
+        // bounds check above guarantees `code` is one of them.
+        unsafe { std::mem::transmute(code) }
+    }
+}
+
+impl Interest {
+    /// Number of `Interest` variants.
+    const COUNT: usize = 20;
+
+    /// Every interest, listed in declaration (discriminant) order.
+    pub fn all() -> [Interest; Self::COUNT] {
+        use Interest::*;
+        [
+            Animals,
+            Arts,
+            Autos,
+            Business,
+            Career,
+            Education,
+            Fashion,
+            Finance,
+            Food,
+            Government,
+            Health,
+            Hobbies,
+            Home,
+            News,
+            RealEstate,
+            Society,
+            Sports,
+            Tech,
+            Travel,
+            Inconclusive,
+        ]
+    }
+}
+
+/// Vector storing a count value for each interest
+///
+/// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
+/// number of elements: one `u32` counter per [`Interest`] variant, addressable as `vector[interest]` through the `Index`/`IndexMut` impls.
+#[derive(Debug, Default, PartialEq, Eq)]
+pub struct InterestVector {
+ pub animals: u32,
+ pub arts: u32,
+ pub autos: u32,
+ pub business: u32,
+ pub career: u32,
+ pub education: u32,
+ pub fashion: u32,
+ pub finance: u32,
+ pub food: u32,
+ pub government: u32,
+ pub health: u32,
+ pub hobbies: u32,
+ pub home: u32,
+ pub news: u32,
+ pub real_estate: u32,
+ pub society: u32,
+ pub sports: u32,
+ pub tech: u32,
+ pub travel: u32,
+ pub inconclusive: u32,
+}
+
+impl std::ops::Index<Interest> for InterestVector {
+    type Output = u32;
+
+    /// Borrow the counter that belongs to `index`.
+    fn index(&self, index: Interest) -> &Self::Output {
+        use Interest::*;
+        match index {
+            Animals => &self.animals,
+            Arts => &self.arts,
+            Autos => &self.autos,
+            Business => &self.business,
+            Career => &self.career,
+            Education => &self.education,
+            Fashion => &self.fashion,
+            Finance => &self.finance,
+            Food => &self.food,
+            Government => &self.government,
+            Health => &self.health,
+            Hobbies => &self.hobbies,
+            Home => &self.home,
+            News => &self.news,
+            RealEstate => &self.real_estate,
+            Society => &self.society,
+            Sports => &self.sports,
+            Tech => &self.tech,
+            Travel => &self.travel,
+            Inconclusive => &self.inconclusive,
+        }
+    }
+}
+
+impl std::ops::IndexMut<Interest> for InterestVector {
+    /// Mutably borrow the counter that belongs to `index`.
+    fn index_mut(&mut self, index: Interest) -> &mut Self::Output {
+        use Interest::*;
+        match index {
+            Animals => &mut self.animals,
+            Arts => &mut self.arts,
+            Autos => &mut self.autos,
+            Business => &mut self.business,
+            Career => &mut self.career,
+            Education => &mut self.education,
+            Fashion => &mut self.fashion,
+            Finance => &mut self.finance,
+            Food => &mut self.food,
+            Government => &mut self.government,
+            Health => &mut self.health,
+            Hobbies => &mut self.hobbies,
+            Home => &mut self.home,
+            News => &mut self.news,
+            RealEstate => &mut self.real_estate,
+            Society => &mut self.society,
+            Sports => &mut self.sports,
+            Tech => &mut self.tech,
+            Travel => &mut self.travel,
+            Inconclusive => &mut self.inconclusive,
+        }
+    }
+}
diff --git a/third_party/rust/relevancy/src/lib.rs b/third_party/rust/relevancy/src/lib.rs
new file mode 100644
index 0000000000..157a26277e
--- /dev/null
+++ b/third_party/rust/relevancy/src/lib.rs
@@ -0,0 +1,81 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//! Proposed API for the relevancy component (validation phase)
+//!
+//! The goal here is to allow us to validate that we can reliably detect user interests from
+//! history data, without spending too much time building the API out. There's some hand-waving
+//! towards how we would use this data to rank search results, but we don't need to come to a final
+//! decision on that yet.
+
+mod db;
+mod error;
+mod interest;
+mod populate_interests;
+mod schema;
+pub mod url_hash;
+
+pub use db::RelevancyDb;
+pub use error::{ApiResult, Error, RelevancyApiError, Result};
+pub use interest::{Interest, InterestVector};
+
+use error_support::handle_error;
+
+pub struct RelevancyStore {
+ db: RelevancyDb,
+}
+
+/// Top-level API for the Relevancy component
+impl RelevancyStore {
+ #[handle_error(Error)]
+ pub fn new(db_path: String) -> ApiResult<Self> { // opens the database at `db_path` via `RelevancyDb::open`
+ Ok(Self {
+ db: RelevancyDb::open(db_path)?,
+ })
+ }
+
+ /// Ingest top URLs to build the user's interest vector.
+ ///
+ /// Consumer should pass a list of the user's top URLs by frecency to this method. It will
+ /// then:
+ ///
+ /// - Download the URL interest data from remote settings. Eventually this should be cached /
+ /// stored in the database, but for now it would be fine to download fresh data each time. (Currently `populate_interests` loads the data from the bundled `test-data` file instead.)
+ /// - Match the user's top URLs against the interest data to build up their interest vector.
+ /// - Store the user's interest vector in the database.
+ ///
+ /// This method may execute for a long time and should only be called from a worker thread.
+ #[handle_error(Error)]
+ pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> {
+ populate_interests::ensure_interest_data_populated(&self.db)?;
+ todo!() // matching and storing are not implemented yet: this panics once the interest data is populated
+ }
+
+ /// Calculate metrics for the validation phase
+ ///
+ /// This runs after [Self::ingest]. It takes the interest vector that ingest created and
+ /// calculates a set of metrics that we can report to glean.
+ #[handle_error(Error)]
+ pub fn calculate_metrics(&self) -> ApiResult<InterestMetrics> {
+ todo!() // not implemented yet: always panics
+ }
+
+ /// Get the user's interest vector directly.
+ ///
+ /// This runs after [Self::ingest]. It returns the interest vector directly so that the
+ /// consumer can show it in an `about:` page.
+ #[handle_error(Error)]
+ pub fn user_interest_vector(&self) -> ApiResult<InterestVector> {
+ todo!() // not implemented yet: always panics
+ }
+}
+
+/// Interest metric data. See `relevancy.udl` for details (per that file, each value is a cosine similarity multiplied by 1000 and rounded).
+pub struct InterestMetrics {
+ pub top_single_interest_similarity: u32,
+ pub top_2interest_similarity: u32,
+ pub top_3interest_similarity: u32,
+}
+
+uniffi::include_scaffolding!("relevancy");
diff --git a/third_party/rust/relevancy/src/populate_interests.rs b/third_party/rust/relevancy/src/populate_interests.rs
new file mode 100644
index 0000000000..e33b677dd6
--- /dev/null
+++ b/third_party/rust/relevancy/src/populate_interests.rs
@@ -0,0 +1,157 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result};
+use std::io::{Cursor, Read};
+
+/// Load the URL interest data into the database unless it is already there.
+///
+/// Returns `Error::FetchInterestDataError` (after logging the underlying I/O error) when the
+/// interest data cannot be read. All rows are inserted inside one read-write transaction.
+pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
+    let needs_load = db.read(|dao| dao.need_to_load_url_interests())?;
+    if !needs_load {
+        return Ok(());
+    }
+    let interest_data = fetch_interest_data().map_err(|e| {
+        log::warn!("error fetching interest data: {e}");
+        Error::FetchInterestDataError
+    })?;
+    db.read_write(move |dao| {
+        interest_data
+            .into_iter()
+            .try_for_each(|(url_hash, interest)| dao.add_url_interest(url_hash, interest))
+    })
+}
+
+/// Fetch the interest data
+///
+/// Decodes the bundled `test-data` file: for each interest, in `Interest::all()` order, a
+/// little-endian `u32` count followed by that many 16-byte URL hashes. Fails with an I/O error
+/// if the data ends early.
+fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> {
+    // TODO: this hack should be replaced with something that fetches from remote settings
+    let bytes = include_bytes!("../test-data");
+    let mut reader = Cursor::new(&bytes);
+    let mut pairs = Vec::new();
+
+    for interest in Interest::all() {
+        // Section header: how many hashes belong to this interest.
+        let mut count_bytes = [0u8; 4];
+        reader.read_exact(&mut count_bytes)?;
+        let hash_count = u32::from_le_bytes(count_bytes);
+
+        for _ in 0..hash_count {
+            let mut url_hash: UrlHash = [0u8; 16];
+            reader.read_exact(&mut url_hash)?;
+            pairs.push((url_hash, interest));
+        }
+    }
+    Ok(pairs)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::InterestVector;
+
+    /// Populate a fresh test database and check that each URL yields the expected vector.
+    fn assert_vectors(cases: &[(&str, InterestVector)]) {
+        let db = RelevancyDb::open_for_test();
+        ensure_interest_data_populated(&db).unwrap();
+        db.read(|dao| {
+            for (url, expected) in cases {
+                assert_eq!(&dao.get_url_interest_vector(url).unwrap(), expected);
+            }
+            Ok(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_interest_vectors() {
+        // Test that the interest data matches the values we started from in
+        // `bin/generate-test-data.rs`
+        assert_vectors(&[
+            (
+                "https://espn.com/",
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            (
+                "https://dogs.com/",
+                InterestVector {
+                    animals: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            (
+                "https://cars.com/",
+                InterestVector {
+                    autos: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            (
+                "https://www.vouge.com/",
+                InterestVector {
+                    fashion: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            (
+                "https://slashdot.org/",
+                InterestVector {
+                    tech: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            (
+                "https://www.nascar.com/",
+                InterestVector {
+                    autos: 1,
+                    sports: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            ("https://unknown.url/", InterestVector::default()),
+        ]);
+    }
+
+    #[test]
+    fn test_variations_on_the_url() {
+        assert_vectors(&[
+            // Different paths/queries should work
+            (
+                "https://espn.com/foo/bar/?baz",
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            // Different schemes should too
+            (
+                "http://espn.com/",
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                },
+            ),
+            // But changes to the domain shouldn't
+            ("http://www.espn.com/", InterestVector::default()),
+            // However, extra components past the 3rd one in the domain are ignored
+            (
+                "https://foo.www.nascar.com/",
+                InterestVector {
+                    autos: 1,
+                    sports: 1,
+                    ..InterestVector::default()
+                },
+            ),
+        ]);
+    }
+}
diff --git a/third_party/rust/relevancy/src/relevancy.udl b/third_party/rust/relevancy/src/relevancy.udl
new file mode 100644
index 0000000000..e07243ec28
--- /dev/null
+++ b/third_party/rust/relevancy/src/relevancy.udl
@@ -0,0 +1,106 @@
+namespace relevancy { };
+
+[Error]
+interface RelevancyApiError { // mirrors the `RelevancyApiError` enum in error.rs
+ Unexpected(string reason);
+};
+
+// Top-level class for the Relevancy component
+interface RelevancyStore {
+ // Construct a new RelevancyStore
+ [Throws=RelevancyApiError]
+ constructor(string dbpath);
+
+ // Ingest the top URLs by frecency to build up the user's interest vector
+ [Throws=RelevancyApiError]
+ void ingest(sequence<string> top_urls);
+
+ // Calculate metrics for the user's interest vector in order to measure how strongly we're
+ // identifying interests. See the `InterestMetrics` struct for details.
+ [Throws=RelevancyApiError]
+ InterestMetrics calculate_metrics();
+
+ // Get the interest vector for the user.
+ //
+ // This is intended to be shown to the user in an `about:` page so that users can judge if it
+ // feels correct.
+ [Throws=RelevancyApiError]
+ InterestVector user_interest_vector();
+};
+
+enum Interest { // same variants, in the same order, as the Rust `Interest` enum in interest.rs
+ "Animals",
+ "Arts",
+ "Autos",
+ "Business",
+ "Career",
+ "Education",
+ "Fashion",
+ "Finance",
+ "Food",
+ "Government",
+ "Health",
+ "Hobbies",
+ "Home",
+ "News",
+ "RealEstate",
+ "Society",
+ "Sports",
+ "Tech",
+ "Travel",
+ "Inconclusive",
+};
+
+// Interest metrics that we want to send to Glean as part of the validation process. These contain
+// the cosine similarity when comparing the user's interest against various interest vectors that
+// consumers may use.
+//
+// Cosine similarity was chosen because it seems easy to calculate. This was then matched against
+// some semi-plausible real-world interest vectors that consumers might use. This is all up for
+// debate and we may decide to switch to some other metrics.
+//
+// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
+// rounding. This is to make them compatible with Glean's distribution metrics.
+dictionary InterestMetrics {
+ // Similarity between the user's interest vector and an interest vector where the element for
+ // the user's top interest is copied, but all other interests are set to zero. This measures
+ // the highest possible similarity with consumers that used interest vectors with a single
+ // interest set.
+ u32 top_single_interest_similarity;
+
+ // The same as before, but the top 2 interests are copied. This measures the highest possible
+ // similarity with consumers that used interest vectors with two interests (note: this means
+ // they would need to choose the user's top two interests and have the exact same proportion
+ // between them as the user).
+ u32 top_2interest_similarity;
+
+ // The same as before, but the top 3 interests are copied.
+ u32 top_3interest_similarity;
+};
+
+// Vector storing a count value for each interest
+//
+// Here "vector" refers to the mathematical object, not a Rust `Vec`. It always has a fixed
+// number of elements. The fields match the Rust `InterestVector` struct in interest.rs (same names, same order).
+dictionary InterestVector {
+ u32 animals;
+ u32 arts;
+ u32 autos;
+ u32 business;
+ u32 career;
+ u32 education;
+ u32 fashion;
+ u32 finance;
+ u32 food;
+ u32 government;
+ u32 health;
+ u32 hobbies;
+ u32 home;
+ u32 news;
+ u32 real_estate;
+ u32 society;
+ u32 sports;
+ u32 tech;
+ u32 travel;
+ u32 inconclusive;
+};
diff --git a/third_party/rust/relevancy/src/schema.rs b/third_party/rust/relevancy/src/schema.rs
new file mode 100644
index 0000000000..bcb2f260d9
--- /dev/null
+++ b/third_party/rust/relevancy/src/schema.rs
@@ -0,0 +1,53 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+use rusqlite::{Connection, Transaction};
+use sql_support::open_database::{self, ConnectionInitializer};
+
+/// The current database schema version.
+///
+/// For any changes to the schema [`SQL`], please make sure to:
+///
+/// 1. Bump this version.
+/// 2. Add a migration from the old version to the new version in
+/// [`RelevancyConnectionInitializer::upgrade_from`] (which currently rejects every older version with `IncompatibleVersion`).
+pub const VERSION: u32 = 13;
+
+/// The current database schema: one `url_interest` table whose primary key is the pair (`url_hash`, `interest_code`), so a single URL hash can be stored with several interests.
+pub const SQL: &str = "
+ CREATE TABLE url_interest(
+ url_hash BLOB NOT NULL,
+ interest_code INTEGER NOT NULL,
+ PRIMARY KEY (url_hash, interest_code)
+ ) WITHOUT ROWID;
+";
+
+/// Connection initializer for the Relevancy database: applies pragmas, creates the schema on
+/// first open and handles (currently: rejects) migrations.
+pub struct RelevancyConnectionInitializer;
+
+impl ConnectionInitializer for RelevancyConnectionInitializer {
+    const NAME: &'static str = "relevancy db";
+    const END_VERSION: u32 = VERSION;
+
+    /// Apply the connection-level pragmas.
+    fn prepare(&self, conn: &Connection, _db_empty: bool) -> open_database::Result<()> {
+        let pragmas = "
+ -- Use in-memory storage for TEMP tables.
+ PRAGMA temp_store = 2;
+ PRAGMA journal_mode = WAL;
+ PRAGMA foreign_keys = ON;
+ ";
+        conn.execute_batch(pragmas)
+            .map_err(open_database::Error::from)
+    }
+
+    /// Create the schema described by [`SQL`].
+    fn init(&self, db: &Transaction<'_>) -> open_database::Result<()> {
+        db.execute_batch(SQL)?;
+        Ok(())
+    }
+
+    /// No migrations exist yet, so every upgrade attempt is reported as incompatible.
+    fn upgrade_from(&self, _db: &Transaction<'_>, version: u32) -> open_database::Result<()> {
+        let incompatible = open_database::Error::IncompatibleVersion(version);
+        Err(incompatible)
+    }
+}
diff --git a/third_party/rust/relevancy/src/url_hash.rs b/third_party/rust/relevancy/src/url_hash.rs
new file mode 100644
index 0000000000..d31a45d06b
--- /dev/null
+++ b/third_party/rust/relevancy/src/url_hash.rs
@@ -0,0 +1,63 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use md5::{Digest, Md5};
+use url::{Host, Url};
+
+pub type UrlHash = [u8; 16]; // holds the MD5 digest produced by `hash_url`
+
+/// Extract the part of a URL that identifies it for hashing.
+///
+/// This is the final 3 dot-separated components of the URL's domain, or the whole domain when
+/// it contains fewer than three dots. Returns `None` when the URL does not parse or its host
+/// is not a domain name (for example an IP address).
+///
+/// TODO: decide how many components to keep (the original note reads "3 or 3 components",
+/// presumably meaning 2 or 3 - confirm).
+pub fn url_hash_source(url: &str) -> Option<String> {
+    let parsed = Url::parse(url).ok()?;
+    let domain = match parsed.host() {
+        Some(Host::Domain(name)) => name,
+        _ => return None,
+    };
+    // Walk the dots right-to-left; the third one (if any) marks where the kept suffix starts.
+    let kept = match domain.rmatch_indices('.').nth(2) {
+        Some((dot, _)) => &domain[dot + 1..],
+        // The domain has less than 3 dots, return it all
+        None => domain,
+    };
+    Some(kept.to_owned())
+}
+
+/// MD5-hash the identifying part of `url` (see `url_hash_source`).
+///
+/// Returns `None` when `url_hash_source` yields nothing for the URL.
+pub fn hash_url(url: &str) -> Option<UrlHash> {
+    let source = url_hash_source(url)?;
+    let mut hasher = Md5::new();
+    hasher.update(source);
+    Some(hasher.finalize().into())
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_url_hash_source() {
+        // (input URL, expected hash source); `None` means the URL yields no hash source.
+        let cases: [(&str, Option<&str>); 6] = [
+            ("http://example.com/some-path", Some("example.com")),
+            ("http://foo.example.com/some-path", Some("foo.example.com")),
+            (
+                "http://foo.bar.baz.example.com/some-path",
+                Some("baz.example.com"),
+            ),
+            ("http://foo.com.uk/some-path", Some("foo.com.uk")),
+            ("http://amazon.com/some-path", Some("amazon.com")),
+            ("http://192.168.0.1/some-path", None),
+        ];
+        for (input, want) in cases {
+            let got = url_hash_source(input);
+            assert_eq!(got.as_deref(), want)
+        }
+    }
+}