path: root/third_party/rust/relevancy/src/lib.rs
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

//! Proposed API for the relevancy component (validation phase)
//!
//! The goal here is to allow us to validate that we can reliably detect user interests from
//! history data, without spending too much time building the API out.  There's some hand-waving
//! towards how we would use this data to rank search results, but we don't need to come to a final
//! decision on that yet.

mod db;
mod error;
mod ingest;
mod interest;
mod rs;
mod schema;
pub mod url_hash;

pub use db::RelevancyDb;
pub use error::{ApiResult, Error, RelevancyApiError, Result};
pub use interest::{Interest, InterestVector};

use error_support::handle_error;

pub struct RelevancyStore {
    db: RelevancyDb,
}

/// Top-level API for the Relevancy component
impl RelevancyStore {
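    /// Construct a new [RelevancyStore] backed by the database at `db_path`.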
    pub fn new(db_path: String) -> Self {
        Self {
            db: RelevancyDb::new(db_path),
        }
    }

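    /// Close the underlying database.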
    pub fn close(&self) {
        self.db.close()
    }

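    /// Interrupt any in-progress database operations.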
    pub fn interrupt(&self) {
        self.db.interrupt()
    }

    /// Ingest top URLs to build the user's interest vector.
    ///
    /// Consumers should pass a list of the user's top URLs by frecency to this method.  It will
    /// then:
    ///
    ///  - Download the URL interest data from remote settings.  Eventually this should be cached /
    ///    stored in the database, but for now it would be fine to download fresh data each time.
    ///  - Match the user's top URLs against the interest data to build up their interest vector.
    ///  - Store the user's interest vector in the database.
    ///
    /// This method may execute for a long time and should only be called from a worker thread.
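    ///
    /// # Example
    ///
    /// A rough sketch of how a consumer might call this from a worker thread (assuming the
    /// crate is named `relevancy`; the database path is illustrative):
    ///
    /// ```no_run
    /// # use relevancy::RelevancyStore;
    /// let store = RelevancyStore::new("relevancy.sqlite".to_string());
    /// let top_urls = vec!["https://example.com/".to_string()];
    /// // Build the user's interest vector from their top URLs by frecency.
    /// let interest_vector = store.ingest(top_urls);
    /// ```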
    #[handle_error(Error)]
    pub fn ingest(&self, top_urls_by_frecency: Vec<String>) -> ApiResult<InterestVector> {
        ingest::ensure_interest_data_populated(&self.db)?;
        self.classify(top_urls_by_frecency)
    }

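    /// Build an interest vector for the given top URLs.
    ///
    /// Looks up each URL's interest counts in the database and sums them into a single
    /// [InterestVector].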
    pub fn classify(&self, top_urls_by_frecency: Vec<String>) -> Result<InterestVector> {
        // For experimentation purposes we compute the interest vector from scratch on each call.
        // Eventually we would want to store this data in the DB and incrementally update it.
        let mut interest_vector = InterestVector::default();
        for url in top_urls_by_frecency {
            let interest_count = self.db.read(|dao| dao.get_url_interest_vector(&url))?;
            interest_vector = interest_vector + interest_count;
        }

        Ok(interest_vector)
    }

    /// Calculate metrics for the validation phase
    ///
    /// This runs after [Self::ingest].  It takes the interest vector that ingest created and
    /// calculates a set of metrics that we can report to glean.
    #[handle_error(Error)]
    pub fn calculate_metrics(&self) -> ApiResult<InterestMetrics> {
        todo!()
    }

    /// Get the user's interest vector directly.
    ///
    /// This runs after [Self::ingest].  It returns the interest vector directly so that the
    /// consumer can show it in an `about:` page.
    #[handle_error(Error)]
    pub fn user_interest_vector(&self) -> ApiResult<InterestVector> {
        todo!()
    }
}

/// Interest metric data.  See `relevancy.udl` for details.
pub struct InterestMetrics {
    pub top_single_interest_similarity: u32,
    pub top_2interest_similarity: u32,
    pub top_3interest_similarity: u32,
}

uniffi::include_scaffolding!("relevancy");

#[cfg(test)]
mod test {
    use crate::url_hash::hash_url;

    use super::*;

    #[test]
    fn test_ingest() {
        let top_urls = vec![
            "https://food.com/".to_string(),
            "https://hello.com".to_string(),
            "https://pasta.com".to_string(),
            "https://dog.com".to_string(),
        ];
        let relevancy_store =
            RelevancyStore::new("file:test_store_data?mode=memory&cache=shared".to_owned());
        relevancy_store
            .db
            .read_write(|dao| {
                dao.add_url_interest(hash_url("https://food.com").unwrap(), Interest::Food)?;
                dao.add_url_interest(
                    hash_url("https://hello.com").unwrap(),
                    Interest::Inconclusive,
                )?;
                dao.add_url_interest(hash_url("https://pasta.com").unwrap(), Interest::Food)?;
                dao.add_url_interest(hash_url("https://dog.com").unwrap(), Interest::Animals)?;
                Ok(())
            })
            .expect("Insert should succeed");

        assert_eq!(
            relevancy_store.ingest(top_urls).unwrap(),
            InterestVector {
                inconclusive: 1,
                animals: 1,
                food: 2,
                ..InterestVector::default()
            }
        );
    }
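
    // Sketch of an additional check: with no input URLs, `classify` should return a default
    // (all-zero) interest vector without consulting any URL interest data.  The in-memory
    // database name used here is arbitrary.
    #[test]
    fn test_classify_empty_input() {
        let relevancy_store =
            RelevancyStore::new("file:test_classify_empty?mode=memory&cache=shared".to_owned());
        assert_eq!(
            relevancy_store.classify(vec![]).unwrap(),
            InterestVector::default()
        );
    }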
}