Merging upstream version 126.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:35:49 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:35:49 +0000
commit: d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree: eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/rust/relevancy/src
parent: Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download: firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz
firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip
9 files changed, 832 insertions, 0 deletions
diff --git a/third_party/rust/relevancy/src/bin/generate-test-data.rs b/third_party/rust/relevancy/src/bin/generate-test-data.rs
new file mode 100644
index 0000000000..04c5827275
--- /dev/null
+++ b/third_party/rust/relevancy/src/bin/generate-test-data.rs
@@ -0,0 +1,43 @@
+use relevancy::{
+    url_hash::{hash_url, UrlHash},
+    Interest,
+};
+use std::{collections::HashMap, fs::File, io::Write};
+
+// Generate a set of test data and output it to the `test-data` file.
+//
+// This is meant to be a placeholder until we can get this data stored in remote settings.
+
+const TEST_INTEREST_DATA: &[(&str, Interest)] = &[
+    ("https://espn.com/", Interest::Sports),
+    ("https://dogs.com/", Interest::Animals),
+    ("https://cars.com/", Interest::Autos),
+    ("https://www.vouge.com/", Interest::Fashion),
+    ("https://slashdot.org/", Interest::Tech),
+    ("https://www.nascar.com/", Interest::Autos),
+    ("https://www.nascar.com/", Interest::Sports),
+];
+
+fn main() {
+    let mut interest_map: HashMap<Interest, Vec<UrlHash>> =
+        HashMap::from_iter(Interest::all().into_iter().map(|i| (i, vec![])));
+    for (url, interest) in TEST_INTEREST_DATA {
+        if let Some(hash) = hash_url(url) {
+            interest_map.get_mut(interest).unwrap().push(hash)
+        }
+    }
+
+    let mut f = File::create("test-data").expect("Error opening file");
+    // Loop over all possible interests
+    for interest in Interest::all() {
+        // Get the list of URL hashes for that interest
+        let hashes = interest_map.get(&interest).unwrap();
+        // Write the count
+        f.write_all(&(hashes.len() as u32).to_le_bytes())
+            .expect("Error writing file");
+        // Write the hashes
+        for hash in hashes {
+            f.write_all(hash).expect("Error writing file");
+        }
+    }
+}
diff --git a/third_party/rust/relevancy/src/db.rs b/third_party/rust/relevancy/src/db.rs
new file mode 100644
index 0000000000..08684c45af
--- /dev/null
+++ b/third_party/rust/relevancy/src/db.rs
@@ -0,0 +1,118 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+use crate::{
+    schema::RelevancyConnectionInitializer,
+    url_hash::{hash_url, UrlHash},
+    Interest, InterestVector, Result,
+};
+use parking_lot::Mutex;
+use rusqlite::{Connection, OpenFlags};
+use sql_support::{open_database::open_database_with_flags, ConnExt};
+use std::path::Path;
+
+/// A thread-safe wrapper around an SQLite connection to the Relevancy database
+pub struct RelevancyDb {
+    pub conn: Mutex<Connection>,
+}
+
+impl RelevancyDb {
+    pub fn open(path: impl AsRef<Path>) -> Result<Self> {
+        let conn = open_database_with_flags(
+            path,
+            OpenFlags::SQLITE_OPEN_URI
+                | OpenFlags::SQLITE_OPEN_NO_MUTEX
+                | OpenFlags::SQLITE_OPEN_CREATE
+                | OpenFlags::SQLITE_OPEN_READ_WRITE,
+            &RelevancyConnectionInitializer,
+        )?;
+        Ok(Self {
+            conn: Mutex::new(conn),
+        })
+    }
+
+    #[cfg(test)]
+    pub fn open_for_test() -> Self {
+        use std::sync::atomic::{AtomicU32, Ordering};
+        static COUNTER: AtomicU32 = AtomicU32::new(0);
+        let count = COUNTER.fetch_add(1, Ordering::Relaxed);
+        Self::open(format!("file:test{count}.sqlite?mode=memory&cache=shared")).unwrap()
+    }
+
+    /// Accesses the Relevancy database in a transaction for reading.
+    pub fn read<T>(&self, op: impl FnOnce(&RelevancyDao) -> Result<T>) -> Result<T> {
+        let mut conn = self.conn.lock();
+        let tx = conn.transaction()?;
+        let dao = RelevancyDao::new(&tx);
+        op(&dao)
+    }
+
+    /// Accesses the Relevancy database in a transaction for reading and writing.
+    pub fn read_write<T>(&self, op: impl FnOnce(&mut RelevancyDao) -> Result<T>) -> Result<T> {
+        let mut conn = self.conn.lock();
+        let tx = conn.transaction()?;
+        let mut dao = RelevancyDao::new(&tx);
+        let result = op(&mut dao)?;
+        tx.commit()?;
+        Ok(result)
+    }
+}
+
+/// A data access object (DAO) that wraps a connection to the Relevancy database
+///
+/// Methods that only read from the database take an immutable reference to
+/// `self` (`&self`), and methods that write to the database take a mutable
+/// reference (`&mut self`).
+pub struct RelevancyDao<'a> {
+    pub conn: &'a Connection,
+}
+
+impl<'a> RelevancyDao<'a> {
+    fn new(conn: &'a Connection) -> Self {
+        Self { conn }
+    }
+
+    /// Associate a URL with an interest
+    pub fn add_url_interest(&mut self, url_hash: UrlHash, interest: Interest) -> Result<()> {
+        let sql = "
+            INSERT OR REPLACE INTO url_interest(url_hash, interest_code)
+            VALUES (?, ?)
+        ";
+        self.conn.execute(sql, (url_hash, interest as u32))?;
+        Ok(())
+    }
+
+    /// Get an interest vector for a URL
+    pub fn get_url_interest_vector(&self, url: &str) -> Result<InterestVector> {
+        let hash = match hash_url(url) {
+            Some(u) => u,
+            None => return Ok(InterestVector::default()),
+        };
+        let mut stmt = self.conn.prepare_cached(
+            "
+            SELECT interest_code
+            FROM url_interest
+            WHERE url_hash=?
+        ",
+        )?;
+        let interests = stmt.query_and_then((hash,), |row| -> Result<Interest> {
+            Ok(row.get::<_, u32>(0)?.into())
+        })?;
+
+        let mut interest_vec = InterestVector::default();
+        for interest in interests {
+            interest_vec[interest?] += 1
+        }
+        Ok(interest_vec)
+    }
+
+    /// Do we need to load the interest data?
+    pub fn need_to_load_url_interests(&self) -> Result<bool> {
+        // TODO: we probably will need a better check than this.
+        Ok(self
+            .conn
+            .query_one("SELECT NOT EXISTS (SELECT 1 FROM url_interest)")?)
+    }
+}
diff --git a/third_party/rust/relevancy/src/error.rs b/third_party/rust/relevancy/src/error.rs
new file mode 100644
index 0000000000..93ca7aabaa
--- /dev/null
+++ b/third_party/rust/relevancy/src/error.rs
@@ -0,0 +1,44 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+* License, v. 2.0. If a copy of the MPL was not distributed with this
+* file, You can obtain one at http://mozilla.org/MPL/2.0/.
+*/
+
+use error_support::{ErrorHandling, GetErrorHandling};
+
+/// Errors we return via the public interface.
+#[derive(Debug, thiserror::Error)]
+pub enum RelevancyApiError {
+    #[error("Unexpected Error: {reason}")]
+    Unexpected { reason: String },
+}
+
+/// Errors we use internally
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    #[error("Error opening database: {0}")]
+    OpenDatabase(#[from] sql_support::open_database::Error),
+
+    #[error("Sql error: {0}")]
+    SqlError(#[from] rusqlite::Error),
+
+    #[error("Error fetching interest data")]
+    FetchInterestDataError,
+}
+
+/// Result enum for the public API
+pub type ApiResult<T> = std::result::Result<T, RelevancyApiError>;
+
+/// Result enum for internal functions
+pub type Result<T> = std::result::Result<T, Error>;
+
+// Define how our internal errors are handled and converted to external errors
+// See `support/error/README.md` for how this works, especially the warning about PII.
+impl GetErrorHandling for Error {
+    type ExternalError = RelevancyApiError;
+
+    fn get_error_handling(&self) -> ErrorHandling<Self::ExternalError> {
+        ErrorHandling::convert(RelevancyApiError::Unexpected {
+            reason: self.to_string(),
+        })
+    }
+}
diff --git a/third_party/rust/relevancy/src/interest.rs b/third_party/rust/relevancy/src/interest.rs
new file mode 100644
index 0000000000..0573c743fc
--- /dev/null
+++ b/third_party/rust/relevancy/src/interest.rs
@@ -0,0 +1,167 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/// List of possible interests for a domain.  Domains can be associated with one or multiple
+/// interests.  `Inconclusive` is used for domains in the user's top sites that we can't classify
+/// because there's no corresponding entry in the interest database.
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+#[repr(u32)]
+pub enum Interest {
+    Animals,
+    Arts,
+    Autos,
+    Business,
+    Career,
+    Education,
+    Fashion,
+    Finance,
+    Food,
+    Government,
+    Health,
+    Hobbies,
+    Home,
+    News,
+    RealEstate,
+    Society,
+    Sports,
+    Tech,
+    Travel,
+    Inconclusive,
+}
+
+impl From<Interest> for u32 {
+    fn from(interest: Interest) -> Self {
+        interest as u32
+    }
+}
+
+impl From<Interest> for usize {
+    fn from(interest: Interest) -> Self {
+        interest as usize
+    }
+}
+
+impl From<u32> for Interest {
+    fn from(code: u32) -> Self {
+        if code as usize >= Self::COUNT {
+            panic!("Invalid interest code: {code}")
+        }
+        // SAFETY: `Interest` is `#[repr(u32)]` with variants numbered `0..COUNT`; the guard above
+        // panics for any `code >= COUNT`, so only valid discriminants reach the transmute.
+        unsafe { std::mem::transmute(code) }
+    }
+}
+
+impl Interest {
+    const COUNT: usize = 20;
+
+    pub fn all() -> [Interest; Self::COUNT] {
+        [
+            Self::Animals,
+            Self::Arts,
+            Self::Autos,
+            Self::Business,
+            Self::Career,
+            Self::Education,
+            Self::Fashion,
+            Self::Finance,
+            Self::Food,
+            Self::Government,
+            Self::Health,
+            Self::Hobbies,
+            Self::Home,
+            Self::News,
+            Self::RealEstate,
+            Self::Society,
+            Self::Sports,
+            Self::Tech,
+            Self::Travel,
+            Self::Inconclusive,
+        ]
+    }
+}
+
+/// Vector storing a count value for each interest
+///
+/// Here "vector" refers to the mathematical object, not a Rust `Vec`.  It always has a fixed
+/// number of elements.
+#[derive(Debug, Default, PartialEq, Eq)]
+pub struct InterestVector {
+    pub animals: u32,
+    pub arts: u32,
+    pub autos: u32,
+    pub business: u32,
+    pub career: u32,
+    pub education: u32,
+    pub fashion: u32,
+    pub finance: u32,
+    pub food: u32,
+    pub government: u32,
+    pub health: u32,
+    pub hobbies: u32,
+    pub home: u32,
+    pub news: u32,
+    pub real_estate: u32,
+    pub society: u32,
+    pub sports: u32,
+    pub tech: u32,
+    pub travel: u32,
+    pub inconclusive: u32,
+}
+
+impl std::ops::Index<Interest> for InterestVector {
+    type Output = u32;
+
+    fn index(&self, index: Interest) -> &u32 {
+        match index {
+            Interest::Animals => &self.animals,
+            Interest::Arts => &self.arts,
+            Interest::Autos => &self.autos,
+            Interest::Business => &self.business,
+            Interest::Career => &self.career,
+            Interest::Education => &self.education,
+            Interest::Fashion => &self.fashion,
+            Interest::Finance => &self.finance,
+            Interest::Food => &self.food,
+            Interest::Government => &self.government,
+            Interest::Health => &self.health,
+            Interest::Hobbies => &self.hobbies,
+            Interest::Home => &self.home,
+            Interest::News => &self.news,
+            Interest::RealEstate => &self.real_estate,
+            Interest::Society => &self.society,
+            Interest::Sports => &self.sports,
+            Interest::Tech => &self.tech,
+            Interest::Travel => &self.travel,
+            Interest::Inconclusive => &self.inconclusive,
+        }
+    }
+}
+
+impl std::ops::IndexMut<Interest> for InterestVector {
+    fn index_mut(&mut self, index: Interest) -> &mut u32 {
+        match index {
+            Interest::Animals => &mut self.animals,
+            Interest::Arts => &mut self.arts,
+            Interest::Autos => &mut self.autos,
+            Interest::Business => &mut self.business,
+            Interest::Career => &mut self.career,
+            Interest::Education => &mut self.education,
+            Interest::Fashion => &mut self.fashion,
+            Interest::Finance => &mut self.finance,
+            Interest::Food => &mut self.food,
+            Interest::Government => &mut self.government,
+            Interest::Health => &mut self.health,
+            Interest::Hobbies => &mut self.hobbies,
+            Interest::Home => &mut self.home,
+            Interest::News => &mut self.news,
+            Interest::RealEstate => &mut self.real_estate,
+            Interest::Society => &mut self.society,
+            Interest::Sports => &mut self.sports,
+            Interest::Tech => &mut self.tech,
+            Interest::Travel => &mut self.travel,
+            Interest::Inconclusive => &mut self.inconclusive,
+        }
+    }
+}
diff --git a/third_party/rust/relevancy/src/lib.rs b/third_party/rust/relevancy/src/lib.rs
new file mode 100644
index 0000000000..157a26277e
--- /dev/null
+++ b/third_party/rust/relevancy/src/lib.rs
@@ -0,0 +1,81 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+//! Proposed API for the relevancy component (validation phase)
+//!
+//! The goal here is to allow us to validate that we can reliably detect user interests from
+//! history data, without spending too much time building the API out.  There's some hand-waving
+//! towards how we would use this data to rank search results, but we don't need to come to a final
+//! decision on that yet.
+
+mod db;
+mod error;
+mod interest;
+mod populate_interests;
+mod schema;
+pub mod url_hash;
+
+pub use db::RelevancyDb;
+pub use error::{ApiResult, Error, RelevancyApiError, Result};
+pub use interest::{Interest, InterestVector};
+
+use error_support::handle_error;
+
+pub struct RelevancyStore {
+    db: RelevancyDb,
+}
+
+/// Top-level API for the Relevancy component
+impl RelevancyStore {
+    #[handle_error(Error)]
+    pub fn new(db_path: String) -> ApiResult<Self> {
+        Ok(Self {
+            db: RelevancyDb::open(db_path)?,
+        })
+    }
+
+    /// Ingest top URLs to build the user's interest vector.
+    ///
+    /// Consumer should pass a list of the user's top URLs by frecency to this method.  It will
+    /// then:
+    ///
+    ///  - Download the URL interest data from remote settings.  Eventually this should be cached /
+    ///    stored in the database, but for now it would be fine to download fresh data each time.
+    ///  - Match the user's top URLs against the interest data to build up their interest vector.
+    ///  - Store the user's interest vector in the database.
+    ///
+    ///  This method may execute for a long time and should only be called from a worker thread.
+    #[handle_error(Error)]
+    pub fn ingest(&self, _top_urls_by_frecency: Vec<String>) -> ApiResult<()> {
+        populate_interests::ensure_interest_data_populated(&self.db)?;
+        todo!()
+    }
+
+    /// Calculate metrics for the validation phase
+    ///
+    /// This runs after [Self::ingest].  It takes the interest vector that ingest created and
+    /// calculates a set of metrics that we can report to glean.
+    #[handle_error(Error)]
+    pub fn calculate_metrics(&self) -> ApiResult<InterestMetrics> {
+        todo!()
+    }
+
+    /// Get the user's interest vector directly.
+    ///
+    /// This runs after [Self::ingest].  It returns the interest vector directly so that the
+    /// consumer can show it in an `about:` page.
+    #[handle_error(Error)]
+    pub fn user_interest_vector(&self) -> ApiResult<InterestVector> {
+        todo!()
+    }
+}
+
+/// Interest metric data.  See `relevancy.udl` for details.
+pub struct InterestMetrics {
+    pub top_single_interest_similarity: u32,
+    pub top_2interest_similarity: u32,
+    pub top_3interest_similarity: u32,
+}
+
+uniffi::include_scaffolding!("relevancy");
diff --git a/third_party/rust/relevancy/src/populate_interests.rs b/third_party/rust/relevancy/src/populate_interests.rs
new file mode 100644
index 0000000000..e33b677dd6
--- /dev/null
+++ b/third_party/rust/relevancy/src/populate_interests.rs
@@ -0,0 +1,157 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use crate::{url_hash::UrlHash, Error, Interest, RelevancyDb, Result};
+use std::io::{Cursor, Read};
+
+pub fn ensure_interest_data_populated(db: &RelevancyDb) -> Result<()> {
+    if !db.read(|dao| dao.need_to_load_url_interests())? {
+        return Ok(());
+    }
+    let interest_data = match fetch_interest_data() {
+        Ok(data) => data,
+        Err(e) => {
+            log::warn!("error fetching interest data: {e}");
+            return Err(Error::FetchInterestDataError);
+        }
+    };
+    db.read_write(move |dao| {
+        for (url_hash, interest) in interest_data {
+            dao.add_url_interest(url_hash, interest)?;
+        }
+        Ok(())
+    })
+}
+
+/// Fetch the interest data
+fn fetch_interest_data() -> std::io::Result<Vec<(UrlHash, Interest)>> {
+    // TODO: this hack should be replaced with something that fetches from remote settings
+    let bytes = include_bytes!("../test-data");
+    let mut reader = Cursor::new(&bytes);
+    let mut data = vec![];
+
+    // Loop over all possible interests
+    for interest in Interest::all() {
+        // read the count
+        let mut buf = [0u8; 4];
+        reader.read_exact(&mut buf)?;
+        let count = u32::from_le_bytes(buf);
+        for _ in 0..count {
+            let mut url_hash: UrlHash = [0u8; 16];
+            reader.read_exact(&mut url_hash)?;
+            data.push((url_hash, interest));
+        }
+    }
+    Ok(data)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+    use crate::InterestVector;
+
+    #[test]
+    fn test_interest_vectors() {
+        let db = RelevancyDb::open_for_test();
+        ensure_interest_data_populated(&db).unwrap();
+        db.read(|dao| {
+            // Test that the interest data matches the values we started from in
+            // `bin/generate-test-data.rs`
+            assert_eq!(
+                dao.get_url_interest_vector("https://espn.com/").unwrap(),
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://dogs.com/").unwrap(),
+                InterestVector {
+                    animals: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://cars.com/").unwrap(),
+                InterestVector {
+                    autos: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://www.vouge.com/")
+                    .unwrap(),
+                InterestVector {
+                    fashion: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://slashdot.org/")
+                    .unwrap(),
+                InterestVector {
+                    tech: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://www.nascar.com/")
+                    .unwrap(),
+                InterestVector {
+                    autos: 1,
+                    sports: 1,
+                    ..InterestVector::default()
+                }
+            );
+            assert_eq!(
+                dao.get_url_interest_vector("https://unknown.url/").unwrap(),
+                InterestVector::default()
+            );
+            Ok(())
+        })
+        .unwrap();
+    }
+
+    #[test]
+    fn test_variations_on_the_url() {
+        let db = RelevancyDb::open_for_test();
+        ensure_interest_data_populated(&db).unwrap();
+        db.read(|dao| {
+            // Different paths/queries should work
+            assert_eq!(
+                dao.get_url_interest_vector("https://espn.com/foo/bar/?baz")
+                    .unwrap(),
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                }
+            );
+            // Different schemes should too
+            assert_eq!(
+                dao.get_url_interest_vector("http://espn.com/").unwrap(),
+                InterestVector {
+                    sports: 1,
+                    ..InterestVector::default()
+                }
+            );
+            // But changes to the domain shouldn't
+            assert_eq!(
+                dao.get_url_interest_vector("http://www.espn.com/").unwrap(),
+                InterestVector::default()
+            );
+            // However, extra components past the 3rd one in the domain are ignored
+            assert_eq!(
+                dao.get_url_interest_vector("https://foo.www.nascar.com/")
+                    .unwrap(),
+                InterestVector {
+                    autos: 1,
+                    sports: 1,
+                    ..InterestVector::default()
+                }
+            );
+            Ok(())
+        })
+        .unwrap();
+    }
+}
diff --git a/third_party/rust/relevancy/src/relevancy.udl b/third_party/rust/relevancy/src/relevancy.udl
new file mode 100644
index 0000000000..e07243ec28
--- /dev/null
+++ b/third_party/rust/relevancy/src/relevancy.udl
@@ -0,0 +1,106 @@
+namespace relevancy { };
+
+[Error]
+interface RelevancyApiError {
+    Unexpected(string reason);
+};
+
+// Top-level class for the Relevancy component
+interface RelevancyStore {
+    // Construct a new RelevancyStore
+    [Throws=RelevancyApiError]
+    constructor(string dbpath);
+
+    // Ingest the top URLs by frecency to build up the user's interest vector
+    [Throws=RelevancyApiError]
+    void ingest(sequence<string> top_urls);
+
+    // Calculate metrics for the user's interest vector in order to measure how strongly we're
+    // identifying interests.  See the `InterestMetrics` struct for details.
+    [Throws=RelevancyApiError]
+    InterestMetrics calculate_metrics();
+
+    // Get the interest vector for the user.
+    //
+    // This is intended to be shown to the user in an `about:` page so that users can judge if it
+    // feels correct.
+    [Throws=RelevancyApiError]
+    InterestVector user_interest_vector();
+};
+
+enum Interest {
+    "Animals",
+    "Arts",
+    "Autos",
+    "Business",
+    "Career",
+    "Education",
+    "Fashion",
+    "Finance",
+    "Food",
+    "Government",
+    "Health",
+    "Hobbies",
+    "Home",
+    "News",
+    "RealEstate",
+    "Society",
+    "Sports",
+    "Tech",
+    "Travel",
+    "Inconclusive",
+};
+
+// Interest metrics that we want to send to Glean as part of the validation process.  These contain
+// the cosine similarity when comparing the user's interest against various interest vectors that
+// consumers may use.
+//
+// Cosine similarity was chosen because it seems easy to calculate.  This was then matched against
+// some semi-plausible real-world interest vectors that consumers might use.  This is all up for
+// debate and we may decide to switch to some other metrics.
+//
+// Similarity values are transformed to integers by multiplying the floating point value by 1000 and
+// rounding.  This is to make them compatible with Glean's distribution metrics.
+dictionary InterestMetrics {
+    // Similarity between the user's interest vector and an interest vector where the element for
+    // the user's top interest is copied, but all other interests are set to zero.  This measures
+    // the highest possible similarity with consumers that used interest vectors with a single
+    // interest set.
+    u32 top_single_interest_similarity;
+
+    // The same as before, but the top 2 interests are copied. This measures the highest possible
+    // similarity with consumers that used interest vectors with two interests (note: this means
+    // they would need to choose the user's top two interests and have the exact same proportion
+    // between them as the user).
+    u32 top_2interest_similarity;
+
+    // The same as before, but the top 3 interests are copied.
+    u32 top_3interest_similarity;
+};
+
+// Vector storing a count value for each interest
+//
+// Here "vector" refers to the mathematical object, not a Rust `Vec`.  It always has a fixed
+// number of elements.
+dictionary InterestVector {
+    u32 animals;
+    u32 arts;
+    u32 autos;
+    u32 business;
+    u32 career;
+    u32 education;
+    u32 fashion;
+    u32 finance;
+    u32 food;
+    u32 government;
+    u32 health;
+    u32 hobbies;
+    u32 home;
+    u32 news;
+    u32 real_estate;
+    u32 society;
+    u32 sports;
+    u32 tech;
+    u32 travel;
+    u32 inconclusive;
+};
diff --git a/third_party/rust/relevancy/src/schema.rs b/third_party/rust/relevancy/src/schema.rs
new file mode 100644
index 0000000000..bcb2f260d9
--- /dev/null
+++ b/third_party/rust/relevancy/src/schema.rs
@@ -0,0 +1,53 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+use rusqlite::{Connection, Transaction};
+use sql_support::open_database::{self, ConnectionInitializer};
+
+/// The current database schema version.
+///
+/// For any changes to the schema [`SQL`], please make sure to:
+///
+///  1. Bump this version.
+///  2. Add a migration from the old version to the new version in
+///     [`RelevancyConnectionInitializer::upgrade_from`].
+pub const VERSION: u32 = 13;
+
+/// The current database schema.
+pub const SQL: &str = "
+    CREATE TABLE url_interest(
+        url_hash BLOB NOT NULL,
+        interest_code INTEGER NOT NULL,
+        PRIMARY KEY (url_hash, interest_code)
+    ) WITHOUT ROWID;
+";
+
+/// Initializes an SQLite connection to the Relevancy database, performing
+/// migrations as needed.
+pub struct RelevancyConnectionInitializer;
+
+impl ConnectionInitializer for RelevancyConnectionInitializer {
+    const NAME: &'static str = "relevancy db";
+    const END_VERSION: u32 = VERSION;
+
+    fn prepare(&self, conn: &Connection, _db_empty: bool) -> open_database::Result<()> {
+        let initial_pragmas = "
+            -- Use in-memory storage for TEMP tables.
+            PRAGMA temp_store = 2;
+            PRAGMA journal_mode = WAL;
+            PRAGMA foreign_keys = ON;
+        ";
+        conn.execute_batch(initial_pragmas)?;
+        Ok(())
+    }
+
+    fn init(&self, db: &Transaction<'_>) -> open_database::Result<()> {
+        Ok(db.execute_batch(SQL)?)
+    }
+
+    fn upgrade_from(&self, _db: &Transaction<'_>, version: u32) -> open_database::Result<()> {
+        Err(open_database::Error::IncompatibleVersion(version))
+    }
+}
diff --git a/third_party/rust/relevancy/src/url_hash.rs b/third_party/rust/relevancy/src/url_hash.rs
new file mode 100644
index 0000000000..d31a45d06b
--- /dev/null
+++ b/third_party/rust/relevancy/src/url_hash.rs
@@ -0,0 +1,63 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+use md5::{Digest, Md5};
+use url::{Host, Url};
+
+pub type UrlHash = [u8; 16];
+
+/// Given a URL, extract the part of it that we want to use to identify it.
+///
+/// We currently use the final 3 components of the URL domain.
+///
+/// TODO: decide if this should be 2 or 3 components.
+pub fn url_hash_source(url: &str) -> Option<String> {
+    let url = Url::parse(url).ok()?;
+    let domain = match url.host() {
+        Some(Host::Domain(d)) => d,
+        _ => return None,
+    };
+    // This will store indexes of `.` chars as we search backwards.
+    let mut pos = domain.len();
+    for _ in 0..3 {
+        match domain[0..pos].rfind('.') {
+            Some(p) => pos = p,
+            // The domain has less than 3 dots, return it all
+            None => return Some(domain.to_owned()),
+        }
+    }
+    Some(domain[pos + 1..].to_owned())
+}
+
+pub fn hash_url(url: &str) -> Option<UrlHash> {
+    url_hash_source(url).map(|hash_source| {
+        let mut hasher = Md5::new();
+        hasher.update(hash_source);
+        let result = hasher.finalize();
+        result.into()
+    })
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_url_hash_source() {
+        let table = [
+            ("http://example.com/some-path", Some("example.com")),
+            ("http://foo.example.com/some-path", Some("foo.example.com")),
+            (
+                "http://foo.bar.baz.example.com/some-path",
+                Some("baz.example.com"),
+            ),
+            ("http://foo.com.uk/some-path", Some("foo.com.uk")),
+            ("http://amazon.com/some-path", Some("amazon.com")),
+            ("http://192.168.0.1/some-path", None),
+        ];
+        for (url, expected) in table {
+            assert_eq!(url_hash_source(url).as_deref(), expected)
+        }
+    }
+}
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-15 03:35:49 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-15 03:35:49 +0000
commit	d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree	eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/rust/relevancy/src
parent	Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download	firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip