summaryrefslogtreecommitdiffstats
path: root/third_party/rust/relevancy/src
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:29 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-06-12 05:35:29 +0000
commit59203c63bb777a3bacec32fb8830fba33540e809 (patch)
tree58298e711c0ff0575818c30485b44a2f21bf28a0 /third_party/rust/relevancy/src
parentAdding upstream version 126.0.1. (diff)
downloadfirefox-59203c63bb777a3bacec32fb8830fba33540e809.tar.xz
firefox-59203c63bb777a3bacec32fb8830fba33540e809.zip
Adding upstream version 127.0.upstream/127.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/rust/relevancy/src')
-rw-r--r--third_party/rust/relevancy/src/db.rs68
-rw-r--r--third_party/rust/relevancy/src/error.rs15
-rw-r--r--third_party/rust/relevancy/src/ingest.rs394
-rw-r--r--third_party/rust/relevancy/src/interest.rs152
-rw-r--r--third_party/rust/relevancy/src/lib.rs80
-rw-r--r--third_party/rust/relevancy/src/populate_interests.rs157
-rw-r--r--third_party/rust/relevancy/src/relevancy.udl17
-rw-r--r--third_party/rust/relevancy/src/rs.rs60
-rw-r--r--third_party/rust/relevancy/src/url_hash.rs15
9 files changed, 720 insertions, 238 deletions
diff --git a/third_party/rust/relevancy/src/db.rs b/third_party/rust/relevancy/src/db.rs
index 08684c45af..b2dc0b0c83 100644
--- a/third_party/rust/relevancy/src/db.rs
+++ b/third_party/rust/relevancy/src/db.rs
@@ -8,52 +8,66 @@ use crate::{
url_hash::{hash_url, UrlHash},
Interest, InterestVector, Result,
};
-use parking_lot::Mutex;
+use interrupt_support::SqlInterruptScope;
use rusqlite::{Connection, OpenFlags};
-use sql_support::{open_database::open_database_with_flags, ConnExt};
+use sql_support::{ConnExt, LazyDb};
use std::path::Path;
/// A thread-safe wrapper around an SQLite connection to the Relevancy database
pub struct RelevancyDb {
- pub conn: Mutex<Connection>,
+ reader: LazyDb<RelevancyConnectionInitializer>,
+ writer: LazyDb<RelevancyConnectionInitializer>,
}
impl RelevancyDb {
- pub fn open(path: impl AsRef<Path>) -> Result<Self> {
- let conn = open_database_with_flags(
- path,
- OpenFlags::SQLITE_OPEN_URI
- | OpenFlags::SQLITE_OPEN_NO_MUTEX
- | OpenFlags::SQLITE_OPEN_CREATE
- | OpenFlags::SQLITE_OPEN_READ_WRITE,
- &RelevancyConnectionInitializer,
- )?;
- Ok(Self {
- conn: Mutex::new(conn),
- })
+ pub fn new(path: impl AsRef<Path>) -> Self {
+ // Note: use `SQLITE_OPEN_READ_WRITE` for both read and write connections.
+ // Even if we're opening a read connection, we may need to do a write as part of the
+ // initialization process.
+ //
+ // The read-only nature of the connection is enforced by the fact that [RelevancyDb::read] uses a
+ // shared ref to the `RelevancyDao`.
+ let db_open_flags = OpenFlags::SQLITE_OPEN_URI
+ | OpenFlags::SQLITE_OPEN_NO_MUTEX
+ | OpenFlags::SQLITE_OPEN_CREATE
+ | OpenFlags::SQLITE_OPEN_READ_WRITE;
+ Self {
+ reader: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer),
+ writer: LazyDb::new(path.as_ref(), db_open_flags, RelevancyConnectionInitializer),
+ }
+ }
+
+ pub fn close(&self) {
+ self.reader.close(true);
+ self.writer.close(true);
+ }
+
+ pub fn interrupt(&self) {
+ self.reader.interrupt();
+ self.writer.interrupt();
}
#[cfg(test)]
- pub fn open_for_test() -> Self {
+ pub fn new_for_test() -> Self {
use std::sync::atomic::{AtomicU32, Ordering};
static COUNTER: AtomicU32 = AtomicU32::new(0);
let count = COUNTER.fetch_add(1, Ordering::Relaxed);
- Self::open(format!("file:test{count}.sqlite?mode=memory&cache=shared")).unwrap()
+ Self::new(format!("file:test{count}.sqlite?mode=memory&cache=shared"))
}
/// Accesses the Relevancy database in a transaction for reading.
pub fn read<T>(&self, op: impl FnOnce(&RelevancyDao) -> Result<T>) -> Result<T> {
- let mut conn = self.conn.lock();
+ let (mut conn, scope) = self.reader.lock()?;
let tx = conn.transaction()?;
- let dao = RelevancyDao::new(&tx);
+ let dao = RelevancyDao::new(&tx, scope);
op(&dao)
}
/// Accesses the Relevancy database in a transaction for reading and writing.
pub fn read_write<T>(&self, op: impl FnOnce(&mut RelevancyDao) -> Result<T>) -> Result<T> {
- let mut conn = self.conn.lock();
+ let (mut conn, scope) = self.writer.lock()?;
let tx = conn.transaction()?;
- let mut dao = RelevancyDao::new(&tx);
+ let mut dao = RelevancyDao::new(&tx, scope);
let result = op(&mut dao)?;
tx.commit()?;
Ok(result)
@@ -67,11 +81,17 @@ impl RelevancyDb {
/// reference (`&mut self`).
pub struct RelevancyDao<'a> {
pub conn: &'a Connection,
+ pub scope: SqlInterruptScope,
}
impl<'a> RelevancyDao<'a> {
- fn new(conn: &'a Connection) -> Self {
- Self { conn }
+ fn new(conn: &'a Connection, scope: SqlInterruptScope) -> Self {
+ Self { conn, scope }
+ }
+
+ /// Return Err(Interrupted) if we were interrupted
+ pub fn err_if_interrupted(&self) -> Result<()> {
+ Ok(self.scope.err_if_interrupted()?)
}
/// Associate a URL with an interest
@@ -98,7 +118,7 @@ impl<'a> RelevancyDao<'a> {
",
)?;
let interests = stmt.query_and_then((hash,), |row| -> Result<Interest> {
- Ok(row.get::<_, u32>(0)?.into())
+ row.get::<_, u32>(0)?.try_into()
})?;
let mut interest_vec = InterestVector::default();
diff --git a/third_party/rust/relevancy/src/error.rs b/third_party/rust/relevancy/src/error.rs
index 93ca7aabaa..1d42ff2c03 100644
--- a/third_party/rust/relevancy/src/error.rs
+++ b/third_party/rust/relevancy/src/error.rs
@@ -23,6 +23,21 @@ pub enum Error {
#[error("Error fetching interest data")]
FetchInterestDataError,
+
+ #[error("Interrupted")]
+ Interrupted(#[from] interrupt_support::Interrupted),
+
+ #[error("Invalid interest code: {0}")]
+ InvalidInterestCode(u32),
+
+ #[error("Remote Setting Error: {0}")]
+ RemoteSettingsError(#[from] remote_settings::RemoteSettingsError),
+
+ #[error("Serde Json Error: {0}")]
+ SerdeJsonError(#[from] serde_json::Error),
+
+ #[error("Base64 Decode Error: {0}")]
+ Base64DecodeError(String),
}
/// Result enum for the public API
diff --git a/third_party/rust/relevancy/src/ingest.rs b/third_party/rust/relevancy/src/ingest.rs
new file mode 100644
index 0000000000..dc01fbe019
--- /dev/null
+++ b/third_party/rust/relevancy/src/ingest.rs
@@ -0,0 +1,394 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use crate::db::RelevancyDao;
+use crate::rs::{
+ RelevancyAttachmentData, RelevancyRecord, RelevancyRemoteSettingsClient,
+ REMOTE_SETTINGS_COLLECTION,
+};
+use crate::url_hash::UrlHash;
+use crate::{Error, Interest, RelevancyDb, Result};
+use base64::{engine::general_purpose::STANDARD, Engine};
+use remote_settings::{Client, RemoteSettingsConfig, RemoteSettingsRecord, RemoteSettingsServer};
+
+// Number of rows to write when inserting interest data before checking for interruption
+const WRITE_CHUNK_SIZE: usize = 100;
+
+pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
+ if !db.read(|dao| dao.need_to_load_url_interests())? {
+ return Ok(());
+ }
+
+ match fetch_interest_data() {
+ Ok(data) => {
+ db.read_write(move |dao| insert_interest_data(data, dao))?;
+ }
+ Err(e) => {
+ log::warn!("error fetching interest data: {e}");
+ return Err(Error::FetchInterestDataError);
+ }
+ }
+ Ok(())
+}
+
+fn fetch_interest_data() -> Result<Vec<(Interest, UrlHash)>> {
+ let rs = Client::new(RemoteSettingsConfig {
+ collection_name: REMOTE_SETTINGS_COLLECTION.to_string(),
+ server: Some(RemoteSettingsServer::Prod),
+ server_url: None,
+ bucket_name: None,
+ })?;
+ fetch_interest_data_inner(rs)
+}
+
+/// Fetch the interest data
+fn fetch_interest_data_inner(
+ rs: impl RelevancyRemoteSettingsClient,
+) -> Result<Vec<(Interest, UrlHash)>> {
+ let remote_settings_response = rs.get_records()?;
+ let mut result = vec![];
+
+ for record in remote_settings_response.records {
+ let attachment_data = match &record.attachment {
+ None => return Err(Error::FetchInterestDataError),
+ Some(a) => rs.get_attachment(&a.location)?,
+ };
+ let interest = get_interest(&record)?;
+ let urls = get_hash_urls(attachment_data)?;
+ result.extend(std::iter::repeat(interest).zip(urls));
+ }
+ Ok(result)
+}
+
+fn get_hash_urls(attachment_data: Vec<u8>) -> Result<Vec<UrlHash>> {
+ let mut hash_urls = vec![];
+
+ let parsed_attachment_data =
+ serde_json::from_slice::<Vec<RelevancyAttachmentData>>(&attachment_data)?;
+
+ for attachment_data in parsed_attachment_data {
+ let hash_url = STANDARD
+ .decode(attachment_data.domain)
+ .map_err(|_| Error::Base64DecodeError("Invalid base64 error".to_string()))?;
+ let url_hash = hash_url.try_into().map_err(|_| {
+ Error::Base64DecodeError("Base64 string has wrong number of bytes".to_string())
+ })?;
+ hash_urls.push(url_hash);
+ }
+ Ok(hash_urls)
+}
+
+/// Extract Interest from the record info
+fn get_interest(record: &RemoteSettingsRecord) -> Result<Interest> {
+ let record_fields: RelevancyRecord =
+ serde_json::from_value(serde_json::Value::Object(record.fields.clone()))?;
+ let custom_details = record_fields.record_custom_details;
+ let category_code = custom_details.category_to_domains.category_code;
+ Interest::try_from(category_code as u32)
+}
+
+/// Insert Interests into Db
+fn insert_interest_data(data: Vec<(Interest, UrlHash)>, dao: &mut RelevancyDao) -> Result<()> {
+ for chunk in data.chunks(WRITE_CHUNK_SIZE) {
+ dao.err_if_interrupted()?;
+ for (interest, hash_url) in chunk {
+ dao.add_url_interest(*hash_url, *interest)?;
+ }
+ }
+
+ Ok(())
+}
+
+#[cfg(test)]
+mod test {
+
+ use std::{cell::RefCell, collections::HashMap};
+
+ use anyhow::Context;
+ use remote_settings::RemoteSettingsResponse;
+ use serde_json::json;
+
+ use super::*;
+ use crate::{rs::RelevancyRemoteSettingsClient, url_hash::hash_url, InterestVector};
+
+ /// A snapshot containing fake Remote Settings records and attachments for
+ /// the store to ingest. We use snapshots to test the store's behavior in a
+ /// data-driven way.
+ struct Snapshot {
+ records: Vec<RemoteSettingsRecord>,
+ attachments: HashMap<&'static str, Vec<u8>>,
+ }
+
+ impl Snapshot {
+ /// Creates a snapshot from a JSON value that represents a collection of
+ /// Relevancy Remote Settings records.
+ ///
+ /// You can use the [`serde_json::json!`] macro to construct the JSON
+ /// value, then pass it to this function. It's easier to use the
+ /// `Snapshot::with_records(json!(...))` idiom than to construct the
+ /// records by hand.
+ fn with_records(value: serde_json::Value) -> anyhow::Result<Self> {
+ Ok(Self {
+ records: serde_json::from_value(value)
+ .context("Couldn't create snapshot with Remote Settings records")?,
+ attachments: HashMap::new(),
+ })
+ }
+
+ /// Adds a data attachment to the snapshot.
+ fn with_data(
+ mut self,
+ location: &'static str,
+ value: serde_json::Value,
+ ) -> anyhow::Result<Self> {
+ self.attachments.insert(
+ location,
+ serde_json::to_vec(&value).context("Couldn't add data attachment to snapshot")?,
+ );
+ Ok(self)
+ }
+ }
+
+ /// A fake Remote Settings client that returns records and attachments from
+ /// a snapshot.
+ struct SnapshotSettingsClient {
+ /// The current snapshot. You can modify it using
+ /// [`RefCell::borrow_mut()`] to simulate remote updates in tests.
+ snapshot: RefCell<Snapshot>,
+ }
+
+ impl SnapshotSettingsClient {
+ /// Creates a client with an initial snapshot.
+ fn with_snapshot(snapshot: Snapshot) -> Self {
+ Self {
+ snapshot: RefCell::new(snapshot),
+ }
+ }
+ }
+
+ impl RelevancyRemoteSettingsClient for SnapshotSettingsClient {
+ fn get_records(&self) -> Result<RemoteSettingsResponse> {
+ let records = self.snapshot.borrow().records.clone();
+ let last_modified = records
+ .iter()
+ .map(|record: &RemoteSettingsRecord| record.last_modified)
+ .max()
+ .unwrap_or(0);
+ Ok(RemoteSettingsResponse {
+ records,
+ last_modified,
+ })
+ }
+
+ fn get_attachment(&self, location: &str) -> Result<Vec<u8>> {
+ Ok(self
+ .snapshot
+ .borrow()
+ .attachments
+ .get(location)
+ .unwrap_or_else(|| unreachable!("Unexpected request for attachment `{}`", location))
+ .clone())
+ }
+ }
+
+ #[test]
+ fn test_interest_vectors() {
+ let db = RelevancyDb::new_for_test();
+ db.read_write(|dao| {
+ // Seed the database with a few known URL/interest pairs, then test that each
+ // lookup returns the matching interest vector.
+
+ dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
+ dao.add_url_interest(hash_url("https://dogs.com").unwrap(), Interest::Animals)?;
+ dao.add_url_interest(hash_url("https://cars.com").unwrap(), Interest::Autos)?;
+ dao.add_url_interest(
+ hash_url("https://www.vouge.com").unwrap(),
+ Interest::Fashion,
+ )?;
+ dao.add_url_interest(hash_url("https://slashdot.org").unwrap(), Interest::Tech)?;
+ dao.add_url_interest(hash_url("https://www.nascar.com").unwrap(), Interest::Autos)?;
+ dao.add_url_interest(
+ hash_url("https://www.nascar.com").unwrap(),
+ Interest::Sports,
+ )?;
+ dao.add_url_interest(
+ hash_url("https://unknown.url").unwrap(),
+ Interest::Inconclusive,
+ )?;
+
+ assert_eq!(
+ dao.get_url_interest_vector("https://espn.com/").unwrap(),
+ InterestVector {
+ sports: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://dogs.com/").unwrap(),
+ InterestVector {
+ animals: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://cars.com/").unwrap(),
+ InterestVector {
+ autos: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://www.vouge.com/")
+ .unwrap(),
+ InterestVector {
+ fashion: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://slashdot.org/")
+ .unwrap(),
+ InterestVector {
+ tech: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://www.nascar.com/")
+ .unwrap(),
+ InterestVector {
+ autos: 1,
+ sports: 1,
+ ..InterestVector::default()
+ }
+ );
+ assert_eq!(
+ dao.get_url_interest_vector("https://unknown.url/").unwrap(),
+ InterestVector {
+ inconclusive: 1,
+ ..InterestVector::default()
+ }
+ );
+ Ok(())
+ })
+ .unwrap();
+ }
+
+ #[test]
+ fn test_variations_on_the_url() {
+ let db = RelevancyDb::new_for_test();
+ db.read_write(|dao| {
+ dao.add_url_interest(hash_url("https://espn.com").unwrap(), Interest::Sports)?;
+ dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Autos)?;
+ dao.add_url_interest(hash_url("https://nascar.com").unwrap(), Interest::Sports)?;
+
+ // Different paths/queries should work
+ assert_eq!(
+ dao.get_url_interest_vector("https://espn.com/foo/bar/?baz")
+ .unwrap(),
+ InterestVector {
+ sports: 1,
+ ..InterestVector::default()
+ }
+ );
+ // Different schemes should too
+ assert_eq!(
+ dao.get_url_interest_vector("http://espn.com/").unwrap(),
+ InterestVector {
+ sports: 1,
+ ..InterestVector::default()
+ }
+ );
+ // But changes to the domain shouldn't
+ assert_eq!(
+ dao.get_url_interest_vector("http://espn2.com/").unwrap(),
+ InterestVector::default()
+ );
+ // However, extra components past the 2nd one in the domain are ignored
+ assert_eq!(
+ dao.get_url_interest_vector("https://www.nascar.com/")
+ .unwrap(),
+ InterestVector {
+ autos: 1,
+ sports: 1,
+ ..InterestVector::default()
+ }
+ );
+ Ok(())
+ })
+ .unwrap();
+ }
+
+ #[test]
+ fn test_parse_records() -> anyhow::Result<()> {
+ let snapshot = Snapshot::with_records(json!([{
+ "id": "animals-0001",
+ "last_modified": 15,
+ "type": "category_to_domains",
+ "attachment": {
+ "filename": "data-1.json",
+ "mimetype": "application/json",
+ "location": "data-1.json",
+ "hash": "",
+ "size": 0
+ },
+ "record_custom_details": {
+ "category_to_domains": {
+ "category": "animals",
+ "category_code": 1,
+ "version": 1
+ }
+ }
+ }]))?
+ .with_data(
+ "data-1.json",
+ json!([
+ {"domain": "J2jtyjQtYQ/+/p//xhz43Q=="},
+ {"domain": "Zd4awCwGZLkat59nIWje3g=="}]),
+ )?;
+ let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
+ assert_eq!(
+ fetch_interest_data_inner(rs_client).unwrap(),
+ vec![
+ (Interest::Animals, hash_url("https://dogs.com").unwrap()),
+ (Interest::Animals, hash_url("https://cats.com").unwrap())
+ ]
+ );
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_parse_records_with_bad_domain_strings() -> anyhow::Result<()> {
+ let snapshot = Snapshot::with_records(json!([{
+ "id": "animals-0001",
+ "last_modified": 15,
+ "type": "category_to_domains",
+ "attachment": {
+ "filename": "data-1.json",
+ "mimetype": "application/json",
+ "location": "data-1.json",
+ "hash": "",
+ "size": 0
+ },
+ "record_custom_details": {
+ "category_to_domains": {
+ "category": "animals",
+ "category_code": 1,
+ "version": 1
+ }
+ }
+ }]))?
+ .with_data(
+ "data-1.json",
+ json!([
+ {"domain": "badString"},
+ {"domain": "notBase64"}]),
+ )?;
+ let rs_client = SnapshotSettingsClient::with_snapshot(snapshot);
+ fetch_interest_data_inner(rs_client).expect_err("Invalid base64 error");
+
+ Ok(())
+ }
+}
diff --git a/third_party/rust/relevancy/src/interest.rs b/third_party/rust/relevancy/src/interest.rs
index 0573c743fc..797df11236 100644
--- a/third_party/rust/relevancy/src/interest.rs
+++ b/third_party/rust/relevancy/src/interest.rs
@@ -2,32 +2,37 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+use crate::Error;
+
/// List of possible interests for a domain. Domains can be associated with one or multiple
/// interests. `Inconclusive` is used for domains in the user's top sites that we can't classify
/// because there's no corresponding entry in the interest database.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[repr(u32)]
pub enum Interest {
- Animals,
- Arts,
- Autos,
- Business,
- Career,
- Education,
- Fashion,
- Finance,
- Food,
- Government,
- Health,
- Hobbies,
- Home,
- News,
- RealEstate,
- Society,
- Sports,
- Tech,
- Travel,
- Inconclusive,
+ // Note: if you change these codes, make sure to update the `TryFrom<u32>` implementation and
+ // the `test_interest_code_conversion` test.
+ Inconclusive = 0,
+ Animals = 1,
+ Arts = 2,
+ Autos = 3,
+ Business = 4,
+ Career = 5,
+ Education = 6,
+ Fashion = 7,
+ Finance = 8,
+ Food = 9,
+ Government = 10,
+ //Disable this per policy consultation
+ // Health = 11,
+ Hobbies = 12,
+ Home = 13,
+ News = 14,
+ RealEstate = 15,
+ Society = 16,
+ Sports = 17,
+ Tech = 18,
+ Travel = 19,
}
impl From<Interest> for u32 {
@@ -42,22 +47,44 @@ impl From<Interest> for usize {
}
}
-impl From<u32> for Interest {
- fn from(code: u32) -> Self {
- if code as usize > Self::COUNT {
- panic!("Invalid interest code: {code}")
+impl TryFrom<u32> for Interest {
+ // On error, return the invalid code back
+ type Error = Error;
+
+ fn try_from(code: u32) -> Result<Self, Self::Error> {
+ match code {
+ 0 => Ok(Self::Inconclusive),
+ 1 => Ok(Self::Animals),
+ 2 => Ok(Self::Arts),
+ 3 => Ok(Self::Autos),
+ 4 => Ok(Self::Business),
+ 5 => Ok(Self::Career),
+ 6 => Ok(Self::Education),
+ 7 => Ok(Self::Fashion),
+ 8 => Ok(Self::Finance),
+ 9 => Ok(Self::Food),
+ 10 => Ok(Self::Government),
+ //Disable this per policy consultation
+ // 11 => Ok(Self::Health),
+ 12 => Ok(Self::Hobbies),
+ 13 => Ok(Self::Home),
+ 14 => Ok(Self::News),
+ 15 => Ok(Self::RealEstate),
+ 16 => Ok(Self::Society),
+ 17 => Ok(Self::Sports),
+ 18 => Ok(Self::Tech),
+ 19 => Ok(Self::Travel),
+ n => Err(Error::InvalidInterestCode(n)),
}
- // Safety: This is safe since Interest has a u32 representation and we've done a bounds
- // check
- unsafe { std::mem::transmute(code) }
}
}
impl Interest {
- const COUNT: usize = 20;
+ const COUNT: usize = 19;
pub fn all() -> [Interest; Self::COUNT] {
[
+ Self::Inconclusive,
Self::Animals,
Self::Arts,
Self::Autos,
@@ -68,7 +95,7 @@ impl Interest {
Self::Finance,
Self::Food,
Self::Government,
- Self::Health,
+ // Self::Health,
Self::Hobbies,
Self::Home,
Self::News,
@@ -77,7 +104,6 @@ impl Interest {
Self::Sports,
Self::Tech,
Self::Travel,
- Self::Inconclusive,
]
}
}
@@ -88,6 +114,7 @@ impl Interest {
/// number of elements.
#[derive(Debug, Default, PartialEq, Eq)]
pub struct InterestVector {
+ pub inconclusive: u32,
pub animals: u32,
pub arts: u32,
pub autos: u32,
@@ -98,7 +125,7 @@ pub struct InterestVector {
pub finance: u32,
pub food: u32,
pub government: u32,
- pub health: u32,
+ // pub health: u32,
pub hobbies: u32,
pub home: u32,
pub news: u32,
@@ -107,7 +134,34 @@ pub struct InterestVector {
pub sports: u32,
pub tech: u32,
pub travel: u32,
- pub inconclusive: u32,
+}
+
+impl std::ops::Add for InterestVector {
+ type Output = Self;
+
+ fn add(self, other: Self) -> Self {
+ Self {
+ inconclusive: self.inconclusive + other.inconclusive,
+ animals: self.animals + other.animals,
+ arts: self.arts + other.arts,
+ autos: self.autos + other.autos,
+ business: self.business + other.business,
+ career: self.career + other.career,
+ education: self.education + other.education,
+ fashion: self.fashion + other.fashion,
+ finance: self.finance + other.finance,
+ food: self.food + other.food,
+ government: self.government + other.government,
+ hobbies: self.hobbies + other.hobbies,
+ home: self.home + other.home,
+ news: self.news + other.news,
+ real_estate: self.real_estate + other.real_estate,
+ society: self.society + other.society,
+ sports: self.sports + other.sports,
+ tech: self.tech + other.tech,
+ travel: self.travel + other.travel,
+ }
+ }
}
impl std::ops::Index<Interest> for InterestVector {
@@ -115,6 +169,7 @@ impl std::ops::Index<Interest> for InterestVector {
fn index(&self, index: Interest) -> &u32 {
match index {
+ Interest::Inconclusive => &self.inconclusive,
Interest::Animals => &self.animals,
Interest::Arts => &self.arts,
Interest::Autos => &self.autos,
@@ -125,7 +180,7 @@ impl std::ops::Index<Interest> for InterestVector {
Interest::Finance => &self.finance,
Interest::Food => &self.food,
Interest::Government => &self.government,
- Interest::Health => &self.health,
+ // Interest::Health => &self.health,
Interest::Hobbies => &self.hobbies,
Interest::Home => &self.home,
Interest::News => &self.news,
@@ -134,7 +189,6 @@ impl std::ops::Index<Interest> for InterestVector {
Interest::Sports => &self.sports,
Interest::Tech => &self.tech,
Interest::Travel => &self.travel,
- Interest::Inconclusive => &self.inconclusive,
}
}
}
@@ -142,6 +196,7 @@ impl std::ops::Index<Interest> for InterestVector {
impl std::ops::IndexMut<Interest> for InterestVector {
fn index_mut(&mut self, index: Interest) -> &mut u32 {
match index {
+ Interest::Inconclusive => &mut self.inconclusive,
Interest::Animals => &mut self.animals,
Interest::Arts => &mut self.arts,
Interest::Autos => &mut self.autos,
@@ -152,7 +207,7 @@ impl std::ops::IndexMut<Interest> for InterestVector {
Interest::Finance => &mut self.finance,
Interest::Food => &mut self.food,
Interest::Government => &mut self.government,
- Interest::Health => &mut self.health,
+ // Interest::Health => &mut self.health,
Interest::Hobbies => &mut self.hobbies,
Interest::Home => &mut self.home,
Interest::News => &mut self.news,
@@ -161,7 +216,32 @@ impl std::ops::IndexMut<Interest> for InterestVector {
Interest::Sports => &mut self.sports,
Interest::Tech => &mut self.tech,
Interest::Travel => &mut self.travel,
- Interest::Inconclusive => &mut self.inconclusive,
}
}
}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn test_interest_code_conversion() {
+ for interest in Interest::all() {
+ assert_eq!(Interest::try_from(u32::from(interest)).unwrap(), interest)
+ }
+ // try_from() for out of bounds codes should return an error
+ assert!(matches!(
+ Interest::try_from(20),
+ Err(Error::InvalidInterestCode(20))
+ ));
+ assert!(matches!(
+ Interest::try_from(100),
+ Err(Error::InvalidInterestCode(100))
+ ));
+ // Health is currently disabled, so its code should return an `InvalidInterestCode` error for now
+ assert!(matches!(
+ Interest::try_from(11),
+ Err(Error::InvalidInterestCode(11))
+ ));
+ }
+}
diff --git a/third_party/rust/relevancy/src/lib.rs b/third_party/rust/relevancy/src/lib.rs
index 157a26277e..4bc774a261 100644
--- a/third_party/rust/relevancy/src/lib.rs
+++ b/third_party/rust/relevancy/src/lib.rs
@@ -11,8 +11,9 @@
mod db;
mod error;
+mod ingest;
mod interest;
-mod populate_interests;
+mod rs;
mod schema;
pub mod url_hash;
@@ -28,11 +29,18 @@ pub struct RelevancyStore {
/// Top-level API for the Relevancy component
impl RelevancyStore {
- #[handle_error(Error)]
- pub fn new(db_path: String) -> ApiResult<Self> {
- Ok(Self {
- db: RelevancyDb::open(db_path)?,
- })
+ pub fn new(db_path: String) -> Self {
+ Self {
+ db: RelevancyDb::new(db_path),
+ }
+ }
+
+ pub fn close(&self) {
+ self.db.close()
+ }
+
+ pub fn interrupt(&self) {
+ self.db.interrupt()
}
/// Ingest top URLs to build the user's interest vector.
@@ -47,9 +55,21 @@ impl RelevancyStore {
///
/// This method may execute for a long time and should only be called from a worker thread.
#[handle_error(Error)]
- pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> {
- populate_interests::ensure_interest_data_populated(&self.db)?;
- todo!()
+ pub fn ingest(&self, top_urls_by_frecency: Vec<String>) -> ApiResult<InterestVector> {
+ ingest::ensure_interest_data_populated(&self.db)?;
+ self.classify(top_urls_by_frecency)
+ }
+
+ pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
+ // For experimentation purposes we are going to return an interest vector.
+ // Eventually we would want to store this data in the DB and incrementally update it.
+ let mut interest_vector = InterestVector::default();
+ for url in top_urls_by_frecency {
+ let interest_count = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
+ interest_vector = interest_vector + interest_count;
+ }
+
+ Ok(interest_vector)
}
/// Calculate metrics for the validation phase
@@ -79,3 +99,45 @@ pub struct InterestMetrics {
}
uniffi::include_scaffolding!("relevancy");
+
+#[cfg(test)]
+mod test {
+ use crate::url_hash::hash_url;
+
+ use super::*;
+
+ #[test]
+ fn test_ingest() {
+ let top_urls = vec![
+ "https://food.com/".to_string(),
+ "https://hello.com".to_string(),
+ "https://pasta.com".to_string(),
+ "https://dog.com".to_string(),
+ ];
+ let relevancy_store =
+ RelevancyStore::new("file:test_store_data?mode=memory&cache=shared".to_owned());
+ relevancy_store
+ .db
+ .read_write(|dao| {
+ dao.add_url_interest(hash_url("https://food.com").unwrap(), Interest::Food)?;
+ dao.add_url_interest(
+ hash_url("https://hello.com").unwrap(),
+ Interest::Inconclusive,
+ )?;
+ dao.add_url_interest(hash_url("https://pasta.com").unwrap(), Interest::Food)?;
+ dao.add_url_interest(hash_url("https://dog.com").unwrap(), Interest::Animals)?;
+ Ok(())
+ })
+ .expect("Insert should succeed");
+
+ assert_eq!(
+ relevancy_store.ingest(top_urls).unwrap(),
+ InterestVector {
+ inconclusive: 1,
+ animals: 1,
+ food: 2,
+ ..InterestVector::default()
+ }
+ );
+ }
+}
diff --git a/third_party/rust/relevancy/src/populate_interests.rs b/third_party/rust/relevancy/src/populate_interests.rs
deleted file mode 100644
index e33b677dd6..0000000000
--- a/third_party/rust/relevancy/src/populate_interests.rs
+++ /dev/null
@@ -1,157 +0,0 @@
-/* This Source Code Form is subject to the terms of the Mozilla Public
- * License, v. 2.0. If a copy of the MPL was not distributed with this
- * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result};
-use std::io::{Cursor, Read};
-
-pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
- if !db.read(|dao| dao.need_to_load_url_interests())? {
- return Ok(());
- }
- let interest_data = match fetch_interest_data() {
- Ok(data) => data,
- Err(e) => {
- log::warn!("error fetching interest data: {e}");
- return Err(Error::FetchInterestDataError);
- }
- };
- db.read_write(move |dao| {
- for (url_hash, interest) in interest_data {
- dao.add_url_interest(url_hash, interest)?;
- }
- Ok(())
- })
-}
-
-/// Fetch the interest data
-fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> {
- // TODO: this hack should be replaced with something that fetches from remote settings
- let bytes = include_bytes!("../test-data");
- let mut reader = Cursor::new(&bytes);
- let mut data = vec![];
-
- // Loop over all possible interests
- for interest in Interest::all() {
- // read the count
- let mut buf = [0u8; 4];
- reader.read_exact(&mut buf)?;
- let count = u32::from_le_bytes(buf);
- for _ in 0..count {
- let mut url_hash: UrlHash = [0u8; 16];
- reader.read_exact(&mut url_hash)?;
- data.push((url_hash, interest));
- }
- }
- Ok(data)
-}
-
-#[cfg(test)]
-mod test {
- use super::*;
- use crate::InterestVector;
-
- #[test]
- fn test_interest_vectors() {
- let db = RelevancyDb::open_for_test();
- ensure_interest_data_populated(&db).unwrap();
- db.read(|dao| {
- // Test that the interest data matches the values we started from in
- // `bin/generate-test-data.rs`
- assert_eq!(
- dao.get_url_interest_vector("https://espn.com/").unwrap(),
- InterestVector {
- sports: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://dogs.com/").unwrap(),
- InterestVector {
- animals: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://cars.com/").unwrap(),
- InterestVector {
- autos: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://www.vouge.com/")
- .unwrap(),
- InterestVector {
- fashion: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://slashdot.org/")
- .unwrap(),
- InterestVector {
- tech: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://www.nascar.com/")
- .unwrap(),
- InterestVector {
- autos: 1,
- sports: 1,
- ..InterestVector::default()
- }
- );
- assert_eq!(
- dao.get_url_interest_vector("https://unknown.url/").unwrap(),
- InterestVector::default()
- );
- Ok(())
- })
- .unwrap();
- }
-
- #[test]
- fn test_variations_on_the_url() {
- let db = RelevancyDb::open_for_test();
- ensure_interest_data_populated(&db).unwrap();
- db.read(|dao| {
- // Different paths/queries should work
- assert_eq!(
- dao.get_url_interest_vector("https://espn.com/foo/bar/?baz")
- .unwrap(),
- InterestVector {
- sports: 1,
- ..InterestVector::default()
- }
- );
- // Different schemes should too
- assert_eq!(
- dao.get_url_interest_vector("http://espn.com/").unwrap(),
- InterestVector {
- sports: 1,
- ..InterestVector::default()
- }
- );
- // But changes to the domain shouldn't
- assert_eq!(
- dao.get_url_interest_vector("http://www.espn.com/").unwrap(),
- InterestVector::default()
- );
- // However, extra components past the 3rd one in the domain are ignored
- assert_eq!(
- dao.get_url_interest_vector("https://foo.www.nascar.com/")
- .unwrap(),
- InterestVector {
- autos: 1,
- sports: 1,
- ..InterestVector::default()
- }
- );
- Ok(())
- })
- .unwrap();
- }
-}
diff --git a/third_party/rust/relevancy/src/relevancy.udl b/third_party/rust/relevancy/src/relevancy.udl
index e07243ec28..ba9eb09969 100644
--- a/third_party/rust/relevancy/src/relevancy.udl
+++ b/third_party/rust/relevancy/src/relevancy.udl
@@ -8,12 +8,21 @@ interface RelevancyApiError {
// Top-level class for the Relevancy component
interface RelevancyStore {
// Construct a new RelevancyStore
- [Throws=RelevancyApiError]
+ //
+ // This is non-blocking since databases and other resources are lazily opened.
constructor(string dbpath);
+ // Close any open resources (for example databases)
+ //
+ // Calling `close` will interrupt any in-progress queries on other threads.
+ void close();
+
+ // Interrupt any current database queries
+ void interrupt();
+
// Ingest the top URLs by frecency to build up the user's interest vector
[Throws=RelevancyApiError]
- void ingest(sequence<string> top_urls);
+ InterestVector ingest(sequence<string> top_urls);
// Calculate metrics for the user's interest vector in order to measure how strongly we're
// identifying interests. See the `InterestMetrics` struct for details.
@@ -39,7 +48,7 @@ enum Interest {
"Finance",
"Food",
"Government",
- "Health",
+ // "Health",
"Hobbies",
"Home",
"News",
@@ -93,7 +102,7 @@ dictionary InterestVector {
u32 finance;
u32 food;
u32 government;
- u32 health;
+ // u32 health;
u32 hobbies;
u32 home;
u32 news;
diff --git a/third_party/rust/relevancy/src/rs.rs b/third_party/rust/relevancy/src/rs.rs
new file mode 100644
index 0000000000..bc8cc938e8
--- /dev/null
+++ b/third_party/rust/relevancy/src/rs.rs
@@ -0,0 +1,60 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+use crate::Result;
+use remote_settings::RemoteSettingsResponse;
+use serde::Deserialize;
+/// The Remote Settings collection name.
+pub(crate) const REMOTE_SETTINGS_COLLECTION: &str = "content-relevance";
+
+/// A trait for a client that downloads records from Remote Settings.
+///
+/// This trait lets tests use a mock client.
+pub(crate) trait RelevancyRemoteSettingsClient {
+ /// Fetches records from the Relevancy Remote Settings collection.
+ fn get_records(&self) -> Result<RemoteSettingsResponse>;
+
+ /// Fetches a record's attachment from the Relevancy Remote Settings
+ /// collection.
+ fn get_attachment(&self, location: &str) -> Result<Vec<u8>>;
+}
+
+impl RelevancyRemoteSettingsClient for remote_settings::Client {
+ fn get_records(&self) -> Result<RemoteSettingsResponse> {
+ Ok(remote_settings::Client::get_records(self)?)
+ }
+
+ fn get_attachment(&self, location: &str) -> Result<Vec<u8>> {
+ Ok(remote_settings::Client::get_attachment(self, location)?)
+ }
+}
+
+/// A record in the Relevancy Remote Settings collection.
+#[derive(Clone, Debug, Deserialize)]
+pub struct RelevancyRecord {
+ #[serde(rename = "type")]
+ pub record_type: String,
+ pub record_custom_details: RecordCustomDetails,
+}
+
+/// Custom details related to the category of the record.
+#[derive(Clone, Debug, Deserialize)]
+pub struct RecordCustomDetails {
+ pub category_to_domains: CategoryToDomains,
+}
+
+/// Category information related to the record.
+#[derive(Clone, Debug, Deserialize)]
+pub struct CategoryToDomains {
+ pub version: i32,
+ pub category: String,
+ pub category_code: i32,
+}
+
+/// A downloaded Remote Settings attachment that contains domain data.
+#[derive(Clone, Debug, Deserialize)]
+pub struct RelevancyAttachmentData {
+ pub domain: String,
+}
diff --git a/third_party/rust/relevancy/src/url_hash.rs b/third_party/rust/relevancy/src/url_hash.rs
index d31a45d06b..c010dcaf12 100644
--- a/third_party/rust/relevancy/src/url_hash.rs
+++ b/third_party/rust/relevancy/src/url_hash.rs
@@ -8,11 +8,10 @@ use url::{Host, Url};
pub type UrlHash = [u8; 16];
/// Given a URL, extract the part of it that we want to use to identify it.
-///
-/// We currently use the final 3 components of the URL domain.
-///
-/// TODO: decide if this should be 3 or 3 components.
pub fn url_hash_source(url: &str) -> Option<String> {
+ // We currently use the final 2 components of the URL domain.
+ const URL_COMPONENTS_TO_USE: usize = 2;
+
let url = Url::parse(url).ok()?;
let domain = match url.host() {
Some(Host::Domain(d)) => d,
@@ -20,7 +19,7 @@ pub fn url_hash_source(url: &str) -> Option<String> {
};
// This will store indexes of `.` chars as we search backwards.
let mut pos = domain.len();
- for _ in 0..3 {
+ for _ in 0..URL_COMPONENTS_TO_USE {
match domain[0..pos].rfind('.') {
Some(p) => pos = p,
// The domain has fewer than URL_COMPONENTS_TO_USE dots, return it all
@@ -47,12 +46,12 @@ mod test {
fn test_url_hash_source() {
let table = [
("http://example.com/some-path", Some("example.com")),
- ("http://foo.example.com/some-path", Some("foo.example.com")),
+ ("http://foo.example.com/some-path", Some("example.com")),
(
"http://foo.bar.baz.example.com/some-path",
- Some("baz.example.com"),
+ Some("example.com"),
),
- ("http://foo.com.uk/some-path", Some("foo.com.uk")),
+ ("http://foo.com.uk/some-path", Some("com.uk")),
("http://amazon.com/some-path", Some("amazon.com")),
("http://192.168.0.1/some-path", None),
];