firefox/browser/components/genai/LinkPreviewChild.sys.mjs

/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

const lazy = {};

ChromeUtils.defineESModuleGetters(lazy, {
  NetUtil: "resource://gre/modules/NetUtil.sys.mjs",
  ReaderMode: "moz-src:///toolkit/components/reader/ReaderMode.sys.mjs",
  Readerable: "resource://gre/modules/Readerable.sys.mjs",
  isProbablyReaderable: "resource://gre/modules/Readerable.sys.mjs",
});

/**
 * Represents a child actor for handling link previews in the browser.
 * Interacts with content windows and handles events related to link previews.
 *
 * @class LinkPreviewChild
 * @augments {JSWindowActorChild}
 */
export class LinkPreviewChild extends JSWindowActorChild {
  /**
   * Handles incoming messages from the parent actor.
   *
   * @param {object} message - The message object containing name and data.
   * @param {string} message.name - The name of the message.
   * @param {object} message.data - Data associated with the message.
   * @returns {Promise<object>|undefined} The result of fetchPageData if applicable.
   */
  async receiveMessage({ name, data }) {
    if (name === "LinkPreview:FetchPageData") {
      return this.fetchPageData(data.url);
    }
    // Return explicitly to satisfy ESLint's consistent-return rule.
    return undefined;
  }
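
  // A minimal sketch of how the parent side is expected to drive this actor,
  // assuming a matching LinkPreviewParent is registered under the actor name
  // "LinkPreview" (the registration is not part of this file):
  //
  //   const actor = browsingContext.currentWindowGlobal.getActor("LinkPreview");
  //   const pageData = await actor.sendQuery("LinkPreview:FetchPageData", {
  //     url: "https://example.com/article",
  //   });
  //   console.log(pageData.meta.title, pageData.meta.description);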

  /**
   * Fetches the HTML content from the given URL.
   *
   * @param {string} url - The URL to fetch.
   * @returns {Promise<string>} The HTML content as a string.
   * @throws {Error} If the fetch fails or the content type is invalid.
   */
  fetchHTML(url) {
    const uri = lazy.NetUtil.newURI(url);
    if (!uri.schemeIs("https")) {
      throw Components.Exception(
        "Only handling https",
        Cr.NS_ERROR_UNKNOWN_PROTOCOL
      );
    }
    // Make requests with a channel to automatically get safe browsing checks.
    // Use null principals in combination with anonymous for now ahead of
    // fetching content with cookies to handle sites requiring login.
    const principal = Services.scriptSecurityManager.createNullPrincipal({});
    const channel = lazy.NetUtil.newChannel({
      contentPolicyType: Ci.nsIContentPolicy.TYPE_DOCUMENT,
      loadingPrincipal: principal,
      securityFlags: Ci.nsILoadInfo.SEC_ALLOW_CROSS_ORIGIN_INHERITS_SEC_CONTEXT,
      triggeringPrincipal: principal,
      uri,
    }).QueryInterface(Ci.nsIHttpChannel);
    channel.loadFlags = Ci.nsIRequest.LOAD_ANONYMOUS;
    // Specially identify this request, e.g., for publishers to opt out.
    channel.setRequestHeader("x-firefox-ai", "1", false);

    // Promise.withResolvers() exposes resolve/reject so the stream listener
    // below can settle the promise from its callbacks.
    const { promise, resolve, reject } = Promise.withResolvers();
    const MAX_CONTENT_LENGTH = 5 * 1024 * 1024; // 5 MB limit
    let charset = "utf-8";
    const byteChunks = [];
    let totalLength = 0;
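    // Stream the response with a manual listener so we can cap the size,
    // reject non-HTML responses, and sniff the charset before decoding.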
    channel.asyncOpen({
      onDataAvailable(request, stream, offset, count) {
        totalLength += count;
        if (totalLength > MAX_CONTENT_LENGTH) {
          request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
        } else {
          byteChunks.push(lazy.NetUtil.readInputStream(stream, count));
        }
      },
      onStartRequest(request) {
        const http = request.QueryInterface(Ci.nsIHttpChannel);
        // Enforce text/html if provided by server.
        let contentType = "";
        try {
          contentType = http.getResponseHeader("content-type");
        } catch (ex) {
          // No content-type header; leave the default empty string.
        }
        if (contentType && !contentType.startsWith("text/html")) {
          request.cancel(Cr.NS_ERROR_FILE_UNKNOWN_TYPE);
        }
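        // The charset parameter may arrive quoted or padded, e.g.
        // 'text/html; charset="ISO-8859-1"' should yield "ISO-8859-1".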
        // Save charset without quotes or spaces for TextDecoder.
        const match = contentType.match(/charset=["' ]*([^;"' ]+)/i);
        if (match) {
          charset = match[1];
        }
        // Enforce max length if provided by server. getResponseHeader returns
        // a string, which the comparison coerces to a number.
        try {
          if (http.getResponseHeader("content-length") > MAX_CONTENT_LENGTH) {
            request.cancel(Cr.NS_ERROR_FILE_TOO_BIG);
          }
        } catch (ex) {
          // No content-length header; the per-chunk check in onDataAvailable
          // still enforces the limit.
        }
      },
      onStopRequest(_request, status) {
        if (Components.isSuccessCode(status)) {
          // Stitch the buffered chunks into one array and decode with the
          // charset detected above.
          const bytes = new Uint8Array(totalLength);
          let offset = 0;
          for (const chunk of byteChunks) {
            bytes.set(new Uint8Array(chunk), offset);
            offset += chunk.byteLength;
          }
          const decoder = new TextDecoder(charset);
          resolve(decoder.decode(bytes));
        } else {
          reject(Components.Exception("Failed to fetch HTML", status));
        }
      },
    });
    return promise;
  }

  /**
   * Fetches HTML content from a URL and parses its meta tags and page text.
   *
   * @param {string} url - The URL to fetch and parse.
   * @returns {Promise<object>} An object containing normalized metadata, raw
   *   meta tag values, URL components, and reader-mode article data.
   */
  async fetchPageData(url) {
    const ret = {
      article: {},
      rawMetaInfo: {},
      url,
    };
    try {
      const htmlCode = await this.fetchHTML(url);
      ret.urlComponents = this.extractUrlComponents(url);
      const parser = new DOMParser();
      const doc = parser.parseFromString(htmlCode, "text/html");
      ret.rawMetaInfo = this.parseMetaTagsFromDoc(doc);
      if (
        !lazy.Readerable.shouldCheckUri(lazy.NetUtil.newURI(url)) ||
        !lazy.isProbablyReaderable(doc)
      ) {
        // Add normalized metadata even if the document isn't reader-able.
        ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo);
        return ret;
      }
      ret.article = await this.getArticleDataFromDoc(doc);
      ret.meta = this.extractNormalizedMetadata(ret.rawMetaInfo, ret.article);
    } catch (error) {
      console.error(`Failed to fetch and parse page data: ${error}`);
      ret.error = { message: error.message, result: error.result };
      // Add empty normalized metadata in case of error.
      ret.meta = this.extractNormalizedMetadata();
    }
    return ret;
  }
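
  // For reference, a successful fetchPageData() call resolves to roughly this
  // shape (values illustrative only):
  //
  //   {
  //     url: "https://example.com/post",
  //     urlComponents: { domain: "example.com", filename: "post" },
  //     rawMetaInfo: { "og:title": "...", "html:title": "..." },
  //     article: { title: "...", excerpt: "...", textContent: "..." },
  //     meta: { title: "...", description: "...", imageUrl: "" },
  //   }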

  /**
   * Extracts and normalizes metadata from the page's meta tags and article content.
   *
   * @param {object} metaData - Metadata extracted from the page's meta tags (Open Graph, Twitter, HTML)
   * @param {object} articleData - Data extracted from the article content using ReaderMode
   * @returns {object} Normalized metadata containing:
   * - title: Page title, prioritizing Open Graph, then Twitter, then HTML title
   * - description: Article excerpt or meta description from various sources
   * - imageUrl: HTTPS-only URL of the page's primary image
   */
  extractNormalizedMetadata(metaData = {}, articleData = {}) {
    const title =
      metaData["og:title"] ||
      metaData["twitter:title"] ||
      metaData["html:title"] ||
      "";
    const description =
      articleData.excerpt ||
      metaData["og:description"] ||
      metaData["twitter:description"] ||
      metaData.description ||
      "";
    // Only allow https image URLs; anything else is dropped.
    let imageUrl = metaData["og:image"] || metaData["twitter:image:src"] || "";
    if (!imageUrl.startsWith("https://")) {
      imageUrl = "";
    }
    return {
      title,
      description,
      imageUrl,
    };
  }
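
  // e.g. metaData { "twitter:title": "T", description: "D" } combined with
  // articleData { excerpt: "E" } yields
  // { title: "T", description: "E", imageUrl: "" }.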

  /**
   * Extracts URL components including domain and filename.
   *
   * @param {string} url - The URL to extract information from.
   * @returns {object} Object containing domain and filename.
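   *
   * For illustration: "https://example.com/blog/my-post/" yields
   * { domain: "example.com", filename: "my-post" }, while a bare
   * "https://example.com/" falls back to the domain for both fields.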
   */
  extractUrlComponents(url) {
    try {
      const urlObj = new URL(url);
      const domain = urlObj.hostname;
      // Extract the filename (last part of pathname).
      let pathname = urlObj.pathname;
      // Remove trailing slash if present.
      if (pathname.endsWith("/")) {
        pathname = pathname.slice(0, -1);
      }
      // Get last segment of path.
      const pathParts = pathname.split("/");
      const filename = pathParts[pathParts.length - 1] || domain;
      return { domain, filename };
    } catch (e) {
      // Return both properties with the same fallback value if the URL is invalid.
      return { domain: url, filename: url };
    }
  }

  /**
   * Parses meta tags from the provided Document into a key-value object.
   * Also extracts the title if available.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {object} An object containing meta tag key-value pairs.
   */
  parseMetaTagsFromDoc(doc) {
    const metaTags = doc.querySelectorAll("meta");
    const metaInfo = {};
    // The meta tags we are interested in.
    const desiredMetaNames = [
      "description",
      "og:image",
      "title",
      "og:title",
      "twitter:title",
      "og:description",
      "twitter:description",
      "twitter:image:src",
    ];
    metaTags.forEach(tag => {
      const name = tag.getAttribute("name") || tag.getAttribute("property");
      const content = tag.getAttribute("content");
      if (name && content && desiredMetaNames.includes(name.toLowerCase())) {
        // Store the lowercased name so lookups like metaData["og:title"]
        // also match mixed-case tags.
        metaInfo[name.toLowerCase()] = content;
      }
    });
    const title = doc.querySelector("title")?.textContent;
    if (title) {
      metaInfo["html:title"] = title;
    }
    return metaInfo;
  }
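
  // e.g. <meta property="og:title" content="Hello"> plus <title>Page</title>
  // produce { "og:title": "Hello", "html:title": "Page" }.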

  /**
   * Extracts article data from the provided Document using ReaderMode.
   *
   * @param {Document} doc - The parsed HTML document.
   * @returns {Promise<object>} The extracted article data, or an empty object
   *   if parsing fails.
   */
  async getArticleDataFromDoc(doc) {
    try {
      const article = await lazy.ReaderMode.parseDocument(doc);
      if (article) {
        const {
          title,
          byline,
          content,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsSlow,
          readingTimeMinsFast,
        } = article;
        // parseDocument returns a `textContent` that strips structure and
        // newlines, but the model needs that structure, so we convert the
        // HTML `content` to plain text ourselves, preserving formatting and
        // newlines.
        const textContent = Cc["@mozilla.org/parserutils;1"]
          .getService(Ci.nsIParserUtils)
          .convertToPlainText(
            content,
            null, // default conversion flags
            0 // no line-wrapping
          );
        return {
          title,
          byline,
          textContent,
          detectedLanguage,
          length,
          siteName,
          excerpt,
          readingTimeMinsFast,
          readingTimeMinsSlow,
        };
      }
    } catch (error) {
      console.error("Error parsing document with ReaderMode:", error);
    }
    return {};
  }
}