summaryrefslogtreecommitdiffstats
path: root/browser/components/pagedata/PageDataSchema.sys.mjs
blob: 307b906fdd59a9360755da917237db7423c91927 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

import { XPCOMUtils } from "resource://gre/modules/XPCOMUtils.sys.mjs";

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  JsonSchemaValidator:
    "resource://gre/modules/components-utils/JsonSchemaValidator.sys.mjs",
  OpenGraphPageData: "resource:///modules/pagedata/OpenGraphPageData.sys.mjs",
  SchemaOrgPageData: "resource:///modules/pagedata/SchemaOrgPageData.sys.mjs",
  TwitterPageData: "resource:///modules/pagedata/TwitterPageData.sys.mjs",
});

// Console instance used for all PageData logging. It is created on first use;
// the `browser.pagedata.log` pref selects "Debug" output, otherwise only
// warnings and above are shown.
XPCOMUtils.defineLazyGetter(lazy, "logConsole", () => {
  const verbose = Services.prefs.getBoolPref("browser.pagedata.log", false);
  const maxLogLevel = verbose ? "Debug" : "Warn";

  return console.createInstance({ prefix: "PageData", maxLogLevel });
});

/**
 * The page data collectors, ordered from most to least specific. When two
 * collectors supply the same piece of data, the one listed earlier wins.
 *
 * Each collector must expose a `collect` function. It receives the document
 * object and returns a PageData structure; it may be asynchronous.
 *
 * Collectors are not required to return valid data. They should return
 * whatever they can find; anything invalid is dropped after the results from
 * all collectors have been joined.
 */
XPCOMUtils.defineLazyGetter(lazy, "DATA_COLLECTORS", () => {
  const { SchemaOrgPageData, OpenGraphPageData, TwitterPageData } = lazy;
  return [SchemaOrgPageData, OpenGraphPageData, TwitterPageData];
});

// Cache of parsed schemas, keyed by the lower-cased schema name.
const SCHEMAS = new Map();

/**
 * Loads the schema for the given name, fetching and parsing it on first use
 * and serving it from the in-memory cache afterwards.
 *
 * @param {string} schemaName
 *   The name of the schema to load. Matching is case-insensitive.
 * @returns {Promise<object>}
 *   The parsed JSON schema.
 * @throws {Error}
 *   If the schema file could not be fetched successfully.
 */
async function loadSchema(schemaName) {
  // Schema file names are plain ASCII, so the locale-independent conversion is
  // required here: `toLocaleLowerCase()` maps "I" to a dotless "ı" under a
  // Turkish or Azeri locale, which would request e.g. "artıcle.schema.json".
  // The normalized name is also the cache key so that "ARTICLE" and "article"
  // share a single entry.
  const key = schemaName.toLowerCase();
  if (SCHEMAS.has(key)) {
    return SCHEMAS.get(key);
  }

  const url = `chrome://browser/content/pagedata/schemas/${key}.schema.json`;
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to load schema: ${response.statusText}`);
  }

  const schema = await response.json();
  SCHEMAS.set(key, schema);
  return schema;
}

/**
 * Validates the data using the schema with the given name.
 *
 * @param {string} schemaName
 *   The name of the schema to validate against. It is lower-cased before the
 *   schema is looked up.
 * @param {object} data
 *   The data to validate.
 * @returns {Promise<void>}
 *   Resolves when the data is valid.
 * @throws
 *   The validator's error when the data does not match the schema. Also
 *   rejects if the schema itself cannot be loaded.
 */
async function validateData(schemaName, data) {
  // Use the locale-independent `toLowerCase()`: schema names are ASCII, and
  // `toLocaleLowerCase()` would turn "ARTICLE" into "artıcle" under a Turkish
  // or Azeri locale, so the schema lookup would fail.
  let schema = await loadSchema(schemaName.toLowerCase());

  let result = lazy.JsonSchemaValidator.validate(data, schema, {
    allowExplicitUndefinedProperties: true,
    // Allowed for future expansion of the schema.
    allowExtraProperties: true,
  });

  if (!result.valid) {
    throw result.error;
  }
}

/**
 * A shared API that can be used in parent or child processes
 */
export const PageDataSchema = {
  // Enumeration of data types. The keys must match the schema name.
  DATA_TYPE: Object.freeze({
    // Note that 1 and 2 were used as types in earlier versions and should not be used here.
    PRODUCT: 3,
    DOCUMENT: 4,
    ARTICLE: 5,
    AUDIO: 6,
    VIDEO: 7,
  }),

  /**
   * Gets the data type name.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration. A string form of the
   *   number (e.g. "3") is accepted as well.
   *
   * @returns {string | null} The name for the type or null if not found.
   */
  nameForType(type) {
    for (let [name, value] of Object.entries(this.DATA_TYPE)) {
      // Loose equality is deliberate: validatePageData passes the type as an
      // object key, which is always a string such as "3".
      if (value == type) {
        return name;
      }
    }

    return null;
  },

  /**
   * Asynchronously validates some page data against the expected schema. Throws
   * an exception if validation fails.
   *
   * @param {DATA_TYPE} type
   *   The data type from the DATA_TYPE enumeration
   * @param {object} data
   *   The page data
   *
   * @returns {Promise<void>} Resolves if the data is valid.
   * @throws {Error} If the type is not part of the DATA_TYPE enumeration, or
   *   whatever the schema validation throws for invalid data.
   */
  async validateData(type, data) {
    let name = this.nameForType(type);

    if (!name) {
      throw new Error(`Unknown data type ${type}`);
    }

    // This calls the module-level validateData helper, not this method.
    return validateData(name, data);
  },

  /**
   * Asynchronously validates an entire PageData structure. Any invalid or
   * unknown data types are dropped. The general (non type-specific) part must
   * be valid; if it is not, the returned promise rejects.
   *
   * @param {PageData} pageData
   *   The page data
   *
   * @returns {Promise<PageData>} The validated page data structure
   */
  async validatePageData(pageData) {
    // Split the general properties from the map of type-specific data.
    let { data: dataMap = {}, ...general } = pageData;

    await validateData("general", general);

    let validData = {};

    for (let [type, data] of Object.entries(dataMap)) {
      let name = this.nameForType(type);
      // Ignore unknown types here.
      if (!name) {
        continue;
      }

      try {
        await validateData(name, data);

        validData[type] = data;
      } catch (e) {
        // Invalid data is dropped, but leave a trace for debugging.
        lazy.logConsole.debug(`Dropping invalid ${name} page data`, e);
      }
    }

    return {
      ...general,
      data: validData,
    };
  },

  /**
   * Adds new page data into an existing data set. Any existing data is not
   * overwritten. Neither argument is modified.
   *
   * @param {PageData} existingPageData
   *   The existing page data
   * @param {PageData | null} newPageData
   *   The new page data. A null or undefined value (e.g. the result of a
   *   collector that failed) is treated as empty.
   *
   * @returns {PageData} The joined data.
   */
  coalescePageData(existingPageData, newPageData) {
    // Split out the general data from the map of specific data. Missing
    // inputs are treated as empty rather than throwing on destructuring.
    let { data: existingMap = {}, ...existingGeneral } = existingPageData || {};
    let { data: newMap = {}, ...newGeneral } = newPageData || {};

    // newGeneral is a fresh copy, so existing values can be layered on top of
    // it to make them win.
    Object.assign(newGeneral, existingGeneral);

    let dataMap = {};
    for (let [type, data] of Object.entries(existingMap)) {
      if (type in newMap) {
        // Both sides have this type: merge per property, existing wins.
        dataMap[type] = Object.assign({}, newMap[type], data);
      } else {
        dataMap[type] = data;
      }
    }

    // Add the types that only the new data provides.
    for (let [type, data] of Object.entries(newMap)) {
      if (!(type in dataMap)) {
        dataMap[type] = data;
      }
    }

    return {
      ...newGeneral,
      data: dataMap,
    };
  },

  /**
   * Collects page data from a DOM document.
   *
   * @param {Document} document
   *   The DOM document to collect data from
   *
   * @returns {Promise<PageData | null>} The data collected or null in case of
   *   error.
   */
  async collectPageData(document) {
    lazy.logConsole.debug("Starting collection", document.documentURI);

    // Run every collector; a failing collector yields null instead of
    // aborting the others.
    let pending = lazy.DATA_COLLECTORS.map(async collector => {
      try {
        return await collector.collect(document);
      } catch (e) {
        lazy.logConsole.error("Error collecting page data", e);
        return null;
      }
    });

    let pageDataList = await Promise.all(pending);

    try {
      // Earlier collectors take precedence. The explicit arrow keeps reduce's
      // extra (index, array) arguments away from coalescePageData, which also
      // skips the null entries left by failed collectors.
      let pageData = pageDataList.reduce(
        (joined, collected) => this.coalescePageData(joined, collected),
        {
          date: Date.now(),
          url: document.documentURI,
        }
      );

      // The await is required so that a validation failure is caught below
      // instead of escaping as a rejected promise.
      return await this.validatePageData(pageData);
    } catch (e) {
      lazy.logConsole.error("Failed to collect valid page data", e);
      return null;
    }
  },
};