Source: pageText.js

/**
 * This module enables analyzing the text content of webpages, including with
 * natural language processing methods. The module uses Mozilla Readability
 * in a content script to parse document title and content when possible.
 * 
 * ## Training, Testing, and Deploying Natural Language Processing Models
 * A motivating use case for this module is applying natural language
 * processing methods to webpage text. The module provides infrastructure for
 * NLP models, but leaves implementation and evaluation of models to study
 * authors. We recommend using existing toolkits for NLP feature generation
 * (e.g., Natural or NLP.js) and for working with models (e.g., TensorFlow.js,
 * ONNX.js, WebDNN, or sklearn-porter). We also recommend using the same
 * codebase for collecting data (e.g., with web crawls), constructing models,
 * evaluating models, and deploying models in browser-based studies. When
 * maintaining multiple NLP codebases for a browser-based study, subtle
 * inconsistencies are easy to introduce and can call into question NLP model
 * performance.
 * 
 * ## Web Crawls to Collect Natural Language Processing Training Data
 * Because WebScience integrates with ordinary browser extensions, you can
 * use this module in a web crawl to collect page text content as NLP training
 * data. All the major browser automation toolkits (e.g., Selenium, Puppeteer,
 * Playwright, and WebdriverIO) support running web crawls with browser
 * extensions installed. We recommend running an online crawl to collect NLP
 * data, using this module to extract webpage text, then training and testing
 * models offline. If you use web crawl data to construct an NLP model for a
 * browser-based study, be sure to carefully consider how the distribution
 * of pages in the crawl compares to the distribution of pages that a user in
 * the study might visit. If a crawl is not representative of user browsing,
 * NLP model performance on crawl data might significantly differ from
 * performance when deployed in a browser-based study.
 * 
 * ## Implementing Natural Language Processing in Web Workers
 * Because natural language processing methods can be computationally
 * expensive, it is very important to offload NLP tasks from an extension's
 * main thread. We recommend pairing this module with the `workers` module to 
 * implement NLP tasks inside of Web Workers, which run in separate threads
 * and will not block the extension's main thread. Some NLP toolkits support
 * additional optimizations, such as WebAssembly or WebGL, and we recommend
 * enabling all available optimizations to minimize the possibility of impact
 * on the user's browsing experience. 
 * 
 * @see {@link https://github.com/mozilla/readability}
 * @see {@link https://github.com/NaturalNode/natural}
 * @see {@link https://github.com/axa-group/nlp.js}
 * @see {@link https://www.tensorflow.org/js}
 * @see {@link https://github.com/microsoft/onnxjs}
 * @see {@link https://mil-tokyo.github.io/webdnn/}
 * @see {@link https://github.com/nok/sklearn-porter}
 * @module pageText
 */

import * as messaging from "./messaging.js";
import * as matching from "./matching.js";
import * as events from "./events.js";
import * as pageManager from "./pageManager.js";
import pageTextContentScript from "include:./content-scripts/pageText.content.js";

/**
 * A listener for the `onTextParsed` event.
 * @callback textParsedListener
 * @memberof module:pageText.onTextParsed
 * @param {Object} details - Additional information about the page data event.
 * @param {string} details.pageId - The ID for the page, unique across browsing sessions.
 * @param {string} details.url - The URL of the page, without any hash.
 * @param {string} details.title - The title of the document, parsed by Readability.
 * @param {string} details.content - The document text content as an HTML string, parsed by Readability.
 * @param {string} details.textContent - The document text content with HTML tags removed, parsed by Readability.
 * @param {boolean} details.privateWindow - Whether the page loaded in a private window.
 */

/**
 * @typedef {Object} TextParsedListenerRecord
 * @property {matching.MatchPatternSet} matchPatternSet - The match patterns for the listener.
 * @property {boolean} privateWindows - Whether to notify the listener about pages in private windows.
 * @property {string} contentScriptId - The ID of the registered content script
 * associated with the listener.
 * @private
 */

/**
 * A map where each key is a listener and each value is a record for that listener.
 * Entries are added by `addListener` and deleted by `removeListener`. The content
 * script message handler and the `tabs.onUpdated` handler iterate over this map to
 * decide which listeners (and which tabs) a page is relevant to.
 * @constant {Map<textParsedListener, TextParsedListenerRecord>}
 * @private
 */
const textParsedListeners = new Map();

/**
 * Add a listener for the `onTextParsed` event.
 * @function addListener
 * @memberof module:pageText.onTextParsed
 * @param {textParsedListener} listener - The listener to add.
 * @param {Object} options - Options for the listener.
 * @param {string[]} options.matchPatterns - The webpages where the listener should be notified about page text.
 * @param {boolean} [options.privateWindows=false] - Whether to notify the listener about pages in private windows.
 */

/**
 * Remove a listener for the `onTextParsed` event.
 * @function removeListener
 * @memberof module:pageText.onTextParsed
 * @param {textParsedListener} listener - The listener to remove.
 */

/**
 * Whether a specified listener has been added for the `onTextParsed` event.
 * @function hasListener
 * @memberof module:pageText.onTextParsed
 * @param {textParsedListener} listener - The listener to check.
 * @returns {boolean} Whether the listener has been added for the event.
 */

/**
 * Whether the `onTextParsed` event has any listeners.
 * @function hasAnyListeners
 * @memberof module:pageText.onTextParsed
 * @returns {boolean} Whether the event has any listeners.
 */

/**
 * An event that fires when a page's text content has been parsed with Readability. The event
 * does not fire for pages whose text content is not parseable.
 * @namespace
 */
export const onTextParsed = events.createEvent({
    name: "webScience.pageText.onTextParsed",
    // Listener bookkeeping (match patterns, content script) is handled by this module
    addListenerCallback: addListener,
    removeListenerCallback: removeListener,
    // Always decline notification through the event object
    notifyListenersCallback: () => false
});

/**
 * Whether the module has started its one-time initialization. The flag is set to
 * `true` by the first `addListener` call, before that call awaits
 * `pageManager.initialize()`, so that later calls do not repeat the setup.
 * @type {boolean}
 * @private
 */
let initialized = false;

/**
 * A callback function for adding a text parsed listener. The options for this private function must
 * be kept in sync with the options for the public `onTextParsed.addListener` function.
 *
 * On the first call, the function performs one-time setup: it initializes `pageManager`, starts
 * listening for `webScience.pageText.parsedText` messages from the content script, and starts
 * notifying matching tabs with `webScience.pageText.isArticle` messages on `tabs.onUpdated`.
 *
 * All listeners share a single registered content script (ID `"pageText"`). If that content script
 * is already registered, any of the new listener's match patterns that it does not yet cover are
 * merged into the registration, so that every listener's pages receive the content script.
 * @param {textParsedListener} listener - The listener being added.
 * @param {Object} options - Options for the listener.
 * @param {string[]} options.matchPatterns - The match patterns for pages where the listener should
 * be notified.
 * @param {boolean} [options.privateWindows=false] - Whether the listener should be notified for
 * pages in private windows.
 * @private
 */
async function addListener(listener, {
    matchPatterns,
    privateWindows = false
}) {
    // Initialization
    if (!initialized) {
        // Set the flag before awaiting so that later calls do not repeat the setup
        initialized = true;
        await pageManager.initialize();

        // Listen for content script messages
        messaging.onMessage.addListener(textParsedDetails => {
            // Remove the type string from the content script message
            delete textParsedDetails.type;

            // Notify listeners when the private window and match pattern requirements are met
            for (const [registeredListener, listenerRecord] of textParsedListeners) {
                if ((!textParsedDetails.privateWindow || listenerRecord.privateWindows)
                    && (listenerRecord.matchPatternSet.matches(textParsedDetails.url))) {
                    registeredListener(textParsedDetails);
                }
            }
        },
        {
            type: "webScience.pageText.parsedText",
            schema: {
                pageId: "string",
                url: "string",
                title: "string",
                content: "string",
                textContent: "string",
                privateWindow: "boolean"
            }
        });

        // Notify the content script when the page URL matches at least one listener.
        // Readability status will be checked in the content script.
        messaging.registerSchema("webScience.pageText.isArticle", {});
        browser.tabs.onUpdated.addListener((tabId, _changeInfo, tab) => {
            if ("url" in tab) {
                // Test match patterns here rather than in the tabs.onUpdated
                // listener options so we don't have to manage multiple listeners
                // or remove and add the listener while events might be queued
                for (const listenerRecord of textParsedListeners.values()) {
                    if (listenerRecord.matchPatternSet.matches(tab.url)) {
                        messaging.sendMessageToTab(tabId, {
                            type: "webScience.pageText.isArticle"
                        });
                        break;
                    }
                }
            }
        });
    }

    // Compile the match patterns for the listener
    const matchPatternSet = matching.createMatchPatternSet(matchPatterns);

    // Firefox only supports this as of version 105, remove this check when that version of Firefox ships.
    let persistAcrossSessions = true;
    const browserInfo = browser.runtime && browser.runtime.getBrowserInfo && await browser.runtime.getBrowserInfo();
    if (browserInfo && browserInfo.name === "Firefox") {
        persistAcrossSessions = false;
    }

    // Register the shared content script, or extend its match patterns if it is
    // already registered (by another listener or by a previous session)
    const contentScriptId = "pageText";
    const [registeredScript] = await browser.scripting.getRegisteredContentScripts({
        ids: [contentScriptId],
    });

    if (registeredScript === undefined) {
        await browser.scripting.registerContentScripts([{
            id: contentScriptId,
            js: ["dist/browser-polyfill.min.js", pageTextContentScript],
            matches: matchPatterns,
            persistAcrossSessions,
            runAt: "document_idle"
        }]);
    }
    else {
        const registeredMatches = registeredScript.matches || [];
        const coveredPatterns = new Set(registeredMatches);
        const missingPatterns = [...new Set(matchPatterns)]
            .filter(matchPattern => !coveredPatterns.has(matchPattern));
        // Only touch the registration when this listener adds new match patterns
        if (missingPatterns.length > 0) {
            await browser.scripting.updateContentScripts([{
                id: contentScriptId,
                matches: [...registeredMatches, ...missingPatterns]
            }]);
        }
    }

    // Store a record for the listener
    textParsedListeners.set(listener, {
        matchPatternSet,
        contentScriptId,
        privateWindows
    });
}

/**
 * A callback function for removing a text parsed listener. The listener's record is deleted
 * first, so the listener stops receiving notifications immediately. The content script named
 * in the record is only unregistered when no remaining listener record references the same
 * content script ID, because a content script can be shared between listeners.
 * @param {textParsedListener} listener - The listener that is being removed.
 * @private
 */
async function removeListener(listener) {
    // Nothing to do if the listener was never added (or was already removed)
    const listenerRecord = textParsedListeners.get(listener);
    if (listenerRecord === undefined) {
        return;
    }
    const { contentScriptId } = listenerRecord;
    textParsedListeners.delete(listener);

    // Keep the content script registered while another listener still depends on it
    for (const remainingRecord of textParsedListeners.values()) {
        if (remainingRecord.contentScriptId === contentScriptId) {
            return;
        }
    }

    await browser.scripting.unregisterContentScripts({
        ids: [contentScriptId]
    });
}