diff options
Diffstat (limited to 'toolkit/components/contentrelevancy')
9 files changed, 464 insertions, 47 deletions
diff --git a/toolkit/components/contentrelevancy/ContentRelevancyManager.sys.mjs b/toolkit/components/contentrelevancy/ContentRelevancyManager.sys.mjs index ea3f2a78a2..aec1ba30db 100644 --- a/toolkit/components/contentrelevancy/ContentRelevancyManager.sys.mjs +++ b/toolkit/components/contentrelevancy/ContentRelevancyManager.sys.mjs @@ -11,6 +11,7 @@ ChromeUtils.defineESModuleGetters(lazy, { "resource://gre/modules/contentrelevancy/private/InputUtils.sys.mjs", NimbusFeatures: "resource://nimbus/ExperimentAPI.sys.mjs", RelevancyStore: "resource://gre/modules/RustRelevancy.sys.mjs", + InterestVector: "resource://gre/modules/RustRelevancy.sys.mjs", }); XPCOMUtils.defineLazyServiceGetter( @@ -40,6 +41,7 @@ const NIMBUS_VARIABLE_ENABLED = "enabled"; const NIMBUS_VARIABLE_MAX_INPUT_URLS = "maxInputUrls"; const NIMBUS_VARIABLE_MIN_INPUT_URLS = "minInputUrls"; const NIMBUS_VARIABLE_TIMER_INTERVAL = "timerInterval"; +const NIMBUS_VARIABLE_INGEST_ENABLED = "ingestEnabled"; ChromeUtils.defineLazyGetter(lazy, "log", () => { return console.createInstance({ @@ -66,7 +68,7 @@ class RelevancyManager { * Note that this should be called once only. `#enable` and `#disable` can be * used to toggle the feature once the manager is initialized. */ - async init() { + init() { if (this.initialized) { return; } @@ -74,7 +76,7 @@ class RelevancyManager { lazy.log.info("Initializing the manager"); if (this.shouldEnable) { - await this.#enable(); + this.#enable(); } this._nimbusUpdateCallback = this.#onNimbusUpdate.bind(this); @@ -143,14 +145,14 @@ class RelevancyManager { ); } - async #enable() { + #enable() { if (!this.#_store) { // Init the relevancy store. 
const path = this.#storePath; lazy.log.info(`Initializing RelevancyStore: ${path}`); try { - this.#_store = await lazy.RelevancyStore.init(path); + this.#_store = lazy.RelevancyStore.init(path); } catch (error) { lazy.log.error(`Error initializing RelevancyStore: ${error}`); return; @@ -166,13 +168,16 @@ class RelevancyManager { * called. */ #disable() { - this.#_store = null; + if (this._isStoreReady) { + this.#_store.close(); + this.#_store = null; + } lazy.timerManager.unregisterTimer(TIMER_ID); } - async #toggleFeature() { + #toggleFeature() { if (this.shouldEnable) { - await this.#enable(); + this.#enable(); } else { this.#disable(); } @@ -199,8 +204,11 @@ class RelevancyManager { * * The classification will not be performed if the total number of input URLs * is less than `DEFAULT_MIN_URLS` (or the corresponding Nimbus value). + * + * @param {object} options + * options.minUrlsForTest {number} A minimal URL count used only for testing. */ - async #doClassification() { + async #doClassification(options = {}) { if (this.isInProgress) { lazy.log.info( "Another classification is in progress, aborting interest classification" @@ -212,6 +220,8 @@ class RelevancyManager { // exit points & success. this.#isInProgress = true; + let timerId; + try { lazy.log.info("Fetching input data for interest classification"); @@ -222,21 +232,47 @@ class RelevancyManager { const minUrls = lazy.NimbusFeatures.contentRelevancy.getVariable( NIMBUS_VARIABLE_MIN_INPUT_URLS - ) ?? DEFAULT_MIN_URLS; + ) ?? + options.minUrlsForTest ?? 
+ DEFAULT_MIN_URLS; const urls = await lazy.getFrecentRecentCombinedUrls(maxUrls); if (urls.length < minUrls) { lazy.log.info("Aborting interest classification: insufficient input"); + Glean.relevancyClassify.fail.record({ reason: "insufficient-input" }); return; } lazy.log.info("Starting interest classification"); - await this.#doClassificationHelper(urls); + timerId = Glean.relevancyClassify.duration.start(); + + const interestVector = await this.#doClassificationHelper(urls); + const sortedVector = Object.entries(interestVector).sort( + ([, a], [, b]) => b - a // descending + ); + lazy.log.info(`Classification results: ${JSON.stringify(sortedVector)}`); + + Glean.relevancyClassify.duration.stopAndAccumulate(timerId); + Glean.relevancyClassify.succeed.record({ + input_size: urls.length, + input_classified_size: sortedVector.reduce((acc, [, v]) => acc + v, 0), + input_inconclusive_size: interestVector.inconclusive, + output_interest_size: sortedVector.filter(([, v]) => v != 0).length, + interest_top_1_hits: sortedVector[0][1], + interest_top_2_hits: sortedVector[1][1], + interest_top_3_hits: sortedVector[2][1], + }); } catch (error) { + let reason; + if (error instanceof StoreNotAvailableError) { lazy.log.error("#store became null, aborting interest classification"); + reason = "store-not-ready"; } else { lazy.log.error("Classification error: " + (error.reason ?? error)); + reason = "component-errors"; } + Glean.relevancyClassify.fail.record({ reason }); + Glean.relevancyClassify.duration.cancel(timerId); // No error is recorded if `start` was not called. } finally { this.#isInProgress = false; } @@ -245,6 +281,13 @@ class RelevancyManager { } /** + * Exposed for testing. + */ + async _test_doClassification(options = {}) { + await this.#doClassification(options); + } + + /** * Classification helper. Use the getter `this.#store` rather than `#_store` * to access the store so that when it becomes null, a `StoreNotAvailableError` * will be raised. 
Likewise, other store related errors should be propagated @@ -252,28 +295,48 @@ class RelevancyManager { * * @param {Array} urls * An array of URLs. + * @returns {InterestVector} + * An interest vector. * @throws {StoreNotAvailableError} * Thrown when the store became unavailable (i.e. set to null elsewhere). * @throws {RelevancyAPIError} * Thrown for other API errors on the store. */ async #doClassificationHelper(urls) { - // The following logs are unnecessary, only used to suppress the linting error. - // TODO(nanj): delete me once the following TODO is done. - if (!this.#store) { - lazy.log.error("#store became null, aborting interest classification"); - } lazy.log.info("Classification input: " + urls); - // TODO(nanj): uncomment the following once `ingest()` is implemented. - // await this.#store.ingest(urls); - } + let interestVector = new lazy.InterestVector({ + animals: 0, + arts: 0, + autos: 0, + business: 0, + career: 0, + education: 0, + fashion: 0, + finance: 0, + food: 0, + government: 0, + hobbies: 0, + home: 0, + news: 0, + realEstate: 0, + society: 0, + sports: 0, + tech: 0, + travel: 0, + inconclusive: 0, + }); + + if ( + lazy.NimbusFeatures.contentRelevancy.getVariable( + NIMBUS_VARIABLE_INGEST_ENABLED + ) ?? + false + ) { + interestVector = await this.#store.ingest(urls); + } - /** - * Exposed for testing. - */ - async _test_doClassification(urls) { - await this.#doClassificationHelper(urls); + return interestVector; } /** diff --git a/toolkit/components/contentrelevancy/docs/index.md b/toolkit/components/contentrelevancy/docs/index.md index bd377d68dc..e7f6802495 100644 --- a/toolkit/components/contentrelevancy/docs/index.md +++ b/toolkit/components/contentrelevancy/docs/index.md @@ -1,3 +1,120 @@ # Content Relevancy -This is the home for the project: Interest-based Content Relevance Ranking & Personalization for Firefox, a client-based privacy preserving approach to enhancing content experience of Firefox. 
+This is the home for the project: Interest-based Content Relevance Ranking & Personalization for Firefox, +a client-based privacy preserving approach to enhancing content experience of Firefox. + +```{toctree} +:titlesonly: +:maxdepth: 1 +:glob: + +telemetry.md +``` + +## Overview + +The following diagram illustrates the system overview of the component. +The system consists of three main parts: the relevancy manager, the relevancy store, +and the internal & external data sources. +The cross-platform component [`relevancy`][relevancy-component] serves as the backbone +that is responsible for interest classification, persistence, and ranking. + +```{mermaid} +graph TB +subgraph Firefox + direction LR + subgraph rmgr[Relevancy Manager] + subgraph rcmp[Rust Component] + classify[Perform Interest Classification] + manage[Manage] + query[Query & Rank] + end + iic[Initiate Interest Classification] + rtuc[Respond to User Commands] + irt[Input Retriever] + end + + subgraph places[Places] + direction TB + tfv[(Top Frecent Visits)] + mrv[(Most Recent Visits)] + others[(Other Inputs)] + end + + subgraph rstore[Relevancy Store] + direction TB + iui[Inferred User Interests] + icm[Ingested Classifier Metadata] + end +end + +subgraph settings[Remote Settings] + rs[("content-relevancy")] +end + +iic --> |call| classify +classify --> |write| rstore +query --> |"query (read-only)"| rstore +manage --> |update| rstore +places --> |input| irt --> iic +rs --- |fetch<br/> classification<br/> data| classify +rtuc --> |call| manage +``` + +### Relevancy Manager + +The relevancy manager manages the following tasks for the relevancy component: +- Start a browser update timer to periodically perform interest classifications +- Listen and respond to user commands such as enable / disable the component, + update / reset the relevancy store upon Places changes, etc. +- Nimbus integration +- Telemetry + +The interest classification is fulfilled by the `relevancy` Rust component. 
+The `relevancy` component downloads & ingests interest data (e.g. "small classifiers") +from Remote Settings and then applies those classifiers to the chosen inputs. +Classification results are persisted in the relevancy store for future uses. + +### Relevancy Store + +The relevancy store, an SQLite database, serves as the storage for both +the user interests and the ingested classifier metadata. It's managed by the +`relevancy` Rust component, which provides consumers (e.g. the relevancy manager) +with a series of APIs for ingestion, querying, ranking, and updating. + +### Data Sources + +Interest classification relies on various internal & external data sources to function. +- Interest classifiers and metadata are prepared offline and served through + Remote Settings. The download and ingestion are handled by the Rust component +- Classification inputs (e.g. top frecent visits and most recent visits) are retrieved + from Places. An input utility module is provided to facilitate the input retrieval + +## Working on the Code + +### Component Structure + +- `ContentRelevancyManager.sys.mjs` implements the relevancy manager. A singleton + relevancy manager is initialized as a background task in `BrowserGlue.sys.mjs` +- Modules defined in `private` should be treated as internal artifacts for the + component and should not be consumed by other browser components. + `InputUtils.sys.mjs` defines several utility functions for input retrieval + +### Telemetry + +Please reference the [telemetry](./telemetry.md) section. + +### Testing + +Since the component is relatively small and does not have any user facing interfaces, +XPCShell tests are recommended for general testing. Mochitests can be used if you're +testing functionalities that would require a more complete browser environment. Only +the Nimbus tests are written as Mochitests at the moment. 
+ +Notes: +- The component requires a browser profile for both Places and the relevancy store; + `do_get_profile()` is called in `head.js` +- Most XPCShell tests are skipped for Android as some test dependencies (`PlacesTestUtils`) + are not available on Android + +[relevancy-component]: https://github.com/mozilla/application-services/tree/main/components/relevancy diff --git a/toolkit/components/contentrelevancy/docs/telemetry.md b/toolkit/components/contentrelevancy/docs/telemetry.md new file mode 100644 index 0000000000..90baa28cae --- /dev/null +++ b/toolkit/components/contentrelevancy/docs/telemetry.md @@ -0,0 +1,62 @@ +# Telemetry + +This document serves as a complementary doc for all the telemetry we collect for `contentrelevancy`. +Note that we use FoG ([Firefox on Glean][FoG]) to record telemetry; +all the metrics described below follow the standard Glean metric types. +You can reference `metrics.yaml` or [Glean Dictionary][glean-dictionary] for more details on each metric. + +## Classification Metrics - `relevancyClassify` + +When the `contentrelevancy` feature is enabled, it will periodically perform classifications in the background. +The following metrics will be recorded for each classification. + +### Events + +#### `succeed` + +This is recorded when a classification is successfully performed. +Various counters are recorded in the `extra_keys` to measure the classification results. 
+ +Example: + +``` +"extra_keys": { + "input_size": 100, // The input size was 100 items + "input_classified_size": 50, // 50 items (out of 100) were classified with at least one conclusive interest + "input_inconclusive_size": 10, // 10 items were classified with the inconclusive interest + "output_interest_size": 4, // The total number of unique interests classified + "interest_top_1_hits": 20, // 20 items were classified with the interest with the most hits + "interest_top_2_hits": 15, // 15 items were classified with the interest with the 2nd most hits + "interest_top_3_hits": 10, // 10 items were classified with the interest with the 3rd most hits +} +``` + +#### `fail` + +This is recorded when a classification fails or is aborted. +The `reason` of the failure is recorded in the `extra_keys`. + +``` +"extra_keys": { + "reason": "insufficient-input" // The classification was aborted due to insufficient input. + // `store-not-ready` indicates the store is null. + // `component-errors` indicates an error in the Rust component. +} +``` + +### Timing Distribution + +#### `duration` + +This records the time duration (in milliseconds) of a successful classification. +The durations of unsuccessful classifications are not measured. + +## Changelog + +### 2024-04 + +* [Bug 1889404][bug-1889404]: Added basic metrics for relevancy manager + +[FoG]: https://firefox-source-docs.mozilla.org/toolkit/components/glean/index.html +[glean-dictionary]: https://dictionary.telemetry.mozilla.org/ +[bug-1889404]: https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 diff --git a/toolkit/components/contentrelevancy/metrics.yaml b/toolkit/components/contentrelevancy/metrics.yaml new file mode 100644 index 0000000000..a17e40bab4 --- /dev/null +++ b/toolkit/components/contentrelevancy/metrics.yaml @@ -0,0 +1,103 @@ +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. 
If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# Adding a new metric? We have docs for that! +# https://firefox-source-docs.mozilla.org/toolkit/components/glean/user/new_definitions_file.html + +--- +$schema: moz://mozilla.org/schemas/glean/metrics/2-0-0 +$tags: + - "Application Services :: Relevancy" + +relevancy.classify: + succeed: + type: event + description: > + Record an event of a successful user interest classification. + bugs: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_reviews: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_sensitivity: + - interaction + notification_emails: + - disco-team@mozilla.com + - najiang@mozilla.com + expires: never + extra_keys: + input_size: + description: > + The total number of input elements used for classification. + type: quantity + input_classified_size: + description: > + The total number of input elements classified with at least one conclusive interest. + type: quantity + input_inconclusive_size: + description: > + The total number of input elements classified with the inconclusive interest. + type: quantity + output_interest_size: + description: > + The total number of output interests for classification. + type: quantity + interest_top_1_hits: + description: > + The total number of input elements for the interest with the most hits. + Defaults to 0. This, along with `input_size` and `input_classified_size`, + can be used to calculate the top-N classification percentages and top-N + hit percentages. + type: quantity + interest_top_2_hits: + description: > + The total number of input elements for the interest with the 2nd most hits. + Defaults to 0. This, along with `input_size` and `input_classified_size`, + can be used to calculate the top-N classification percentages and top-N + hit percentages. 
+ type: quantity + interest_top_3_hits: + description: > + The total number of input elements for the interest with the 3rd most hits. + Defaults to 0. This, along with `input_size` and `input_classified_size`, + can be used to calculate the top-N classification percentages and top-N + hit percentages. + type: quantity + fail: + type: event + description: > + Record an event of a failed / aborted user interest classification. + bugs: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_reviews: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_sensitivity: + - interaction + notification_emails: + - disco-team@mozilla.com + - najiang@mozilla.com + expires: never + extra_keys: + reason: + description: > + The reason of the failed / aborted classification. + One of + * `insufficient-input` + * `component-errors` + * `store-not-ready` + type: string + duration: + type: timing_distribution + time_unit: millisecond + description: > + Record the duration (in ms) of a successful classification. 
+ bugs: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_reviews: + - https://bugzilla.mozilla.org/show_bug.cgi?id=1889404 + data_sensitivity: + - interaction + notification_emails: + - disco-team@mozilla.com + - najiang@mozilla.com + expires: never diff --git a/toolkit/components/contentrelevancy/tests/browser/browser.toml b/toolkit/components/contentrelevancy/tests/browser/browser.toml index ec1d3a3e66..8de1bd7c46 100644 --- a/toolkit/components/contentrelevancy/tests/browser/browser.toml +++ b/toolkit/components/contentrelevancy/tests/browser/browser.toml @@ -4,3 +4,4 @@ prefs = [ ] ["browser_contentrelevancy_nimbus.js"] +lineno = "6" diff --git a/toolkit/components/contentrelevancy/tests/browser/browser_contentrelevancy_nimbus.js b/toolkit/components/contentrelevancy/tests/browser/browser_contentrelevancy_nimbus.js index 47d54c2a87..5147b3792f 100644 --- a/toolkit/components/contentrelevancy/tests/browser/browser_contentrelevancy_nimbus.js +++ b/toolkit/components/contentrelevancy/tests/browser/browser_contentrelevancy_nimbus.js @@ -41,6 +41,7 @@ add_task(async function test_NimbusIntegration_enable() { maxInputUrls: 3, // Set the timer interval to 0 will trigger the timer right away. timerInterval: 0, + ingestEnabled: false, }, }); @@ -73,6 +74,7 @@ add_task(async function test_NimbusIntegration_disable() { maxInputUrls: 3, // Set the timer interval to 0 will trigger the timer right away. 
timerInterval: 0, + ingestEnabled: false, }, }); diff --git a/toolkit/components/contentrelevancy/tests/xpcshell/test_ContentRelevancyManager.js b/toolkit/components/contentrelevancy/tests/xpcshell/test_ContentRelevancyManager.js index 633f9fc49b..7f89c7d615 100644 --- a/toolkit/components/contentrelevancy/tests/xpcshell/test_ContentRelevancyManager.js +++ b/toolkit/components/contentrelevancy/tests/xpcshell/test_ContentRelevancyManager.js @@ -23,11 +23,11 @@ const CATEGORY_UPDATE_TIMER = "update-timer"; let gSandbox; -add_setup(async () => { +add_setup(() => { gSandbox = sinon.createSandbox(); initUpdateTimerManager(); Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, true); - await ContentRelevancyManager.init(); + ContentRelevancyManager.init(); registerCleanupFunction(() => { Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED); @@ -35,11 +35,11 @@ add_setup(async () => { }); }); -add_task(async function test_init() { +add_task(function test_init() { Assert.ok(ContentRelevancyManager.initialized, "Init should succeed"); }); -add_task(async function test_uninit() { +add_task(function test_uninit() { ContentRelevancyManager.uninit(); Assert.ok(!ContentRelevancyManager.initialized, "Uninit should succeed"); @@ -50,7 +50,7 @@ add_task(async function test_timer() { Services.prefs.setIntPref(PREF_TIMER_INTERVAL, 0); gSandbox.spy(ContentRelevancyManager, "notify"); - await ContentRelevancyManager.init(); + ContentRelevancyManager.init(); await TestUtils.waitForCondition( () => ContentRelevancyManager.notify.called, @@ -100,24 +100,6 @@ add_task(async function test_call_disable_twice() { Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED); }); -add_task(async function test_doClassification() { - Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, true); - await TestUtils.waitForCondition(() => ContentRelevancyManager._isStoreReady); - await ContentRelevancyManager._test_doClassification([]); - - // Disable it to reset the store. 
- Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, false); - await TestUtils.waitForTick(); - - await Assert.rejects( - ContentRelevancyManager._test_doClassification([]), - /Store is not available/, - "Should throw with an unset store" - ); - - Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED); -}); - /** * Sets up the update timer manager for testing: makes it fire more often, * removes all existing timers, and initializes it for testing. The body of this diff --git a/toolkit/components/contentrelevancy/tests/xpcshell/test_Telemetry.js b/toolkit/components/contentrelevancy/tests/xpcshell/test_Telemetry.js new file mode 100644 index 0000000000..9deef6bad5 --- /dev/null +++ b/toolkit/components/contentrelevancy/tests/xpcshell/test_Telemetry.js @@ -0,0 +1,81 @@ +/* Any copyright is dedicated to the Public Domain. + http://creativecommons.org/publicdomain/zero/1.0/ */ + +"use strict"; + +const { ContentRelevancyManager } = ChromeUtils.importESModule( + "resource://gre/modules/ContentRelevancyManager.sys.mjs" +); + +const PREF_CONTENT_RELEVANCY_ENABLED = "toolkit.contentRelevancy.enabled"; + +add_setup(async function setup() { + // FOG needs a profile directory to put its data in. + do_get_profile(); + + // FOG needs to be initialized in order for data to flow. + Services.fog.initializeFOG(); + + Services.prefs.setBoolPref(PREF_CONTENT_RELEVANCY_ENABLED, true); + ContentRelevancyManager.init(); + + registerCleanupFunction(() => { + Services.prefs.clearUserPref(PREF_CONTENT_RELEVANCY_ENABLED); + }); +}); + +/** + * Test classification metrics - succeed. 
+ */ +add_task(async function test_classify_succeed() { + Services.fog.testResetFOG(); + + Assert.equal(null, Glean.relevancyClassify.succeed.testGetValue()); + Assert.equal(null, Glean.relevancyClassify.duration.testGetValue()); + + await ContentRelevancyManager._test_doClassification(); + + Assert.deepEqual( + { + input_size: 0, + input_classified_size: 0, + input_inconclusive_size: 0, + output_interest_size: 0, + interest_top_1_hits: 0, + interest_top_2_hits: 0, + interest_top_3_hits: 0, + }, + Glean.relevancyClassify.succeed.testGetValue()[0].extra, + "Should record the succeed event" + ); + Assert.ok( + Glean.relevancyClassify.duration.testGetValue().sum > 0, + "Should record the duration" + ); +}); + +/** + * Test classification metrics - fail - insufficient-input. + */ +add_task(async function test_classify_fail_case1() { + Services.fog.testResetFOG(); + + Assert.equal(null, Glean.relevancyClassify.fail.testGetValue()); + Assert.equal(null, Glean.relevancyClassify.duration.testGetValue()); + + // Require at least 1 input to trigger the failure. 
+ await ContentRelevancyManager._test_doClassification({ minUrlsForTest: 1 }); + + Assert.deepEqual( + { + reason: "insufficient-input", + }, + Glean.relevancyClassify.fail.testGetValue()[0].extra, + "Should record the fail event" + ); + Assert.equal( + null, + Glean.relevancyClassify.duration.testGetValue(), + "Should not record the duration" + ); +}); diff --git a/toolkit/components/contentrelevancy/tests/xpcshell/xpcshell.toml b/toolkit/components/contentrelevancy/tests/xpcshell/xpcshell.toml index 70f6d45c2d..0d76584e6c 100644 --- a/toolkit/components/contentrelevancy/tests/xpcshell/xpcshell.toml +++ b/toolkit/components/contentrelevancy/tests/xpcshell/xpcshell.toml @@ -4,6 +4,12 @@ firefox-appdir = "browser" ["test_ContentRelevancyManager.js"] skip-if = ["os == 'android'"] # bug 1886601 +lineno = "5" ["test_InputUtils.js"] skip-if = ["os == 'android'"] # bug 1886601 +lineno = "9" + +["test_Telemetry.js"] +skip-if = ["os == 'android'"] # bug 1886601 +run-sequentially = "concurrent runs would interfere with each other" |