/**
* This module provides utilities for matching URLs against criteria.
*
* ## Matching Criteria
* The module supports two types of criteria:
* * Match Patterns (preferred) - a syntax used in the WebExtensions API for expressing possible URL matches.
* See: {@link https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/Match_patterns}.
* * Domains - a simple list of domain names, which are converted into match patterns.
*
* ## Matching Output
* The module supports three types of output for matching URLs:
* * Match Pattern Sets (preferred) - optimized objects that compare a URL against the criteria.
* * Regular Expressions - `RegExp` objects that compare a URL against the criteria.
* * Regular Expression Strings - strings expressing regular expressions for comparing a URL against the criteria.
*
* ## Implementation Notes
* We use Rollup pure annotations (`@__PURE__` comments) because Rollup assumes that iterators might have side
* effects (including subtle cases of iteration like `Array.map` and `Array.join`). Without the annotations, Rollup
* would mark arguments for many of this module's functions (which might be large string arrays) as tainted by side
* effects and always include those arguments in bundled output. The pure annotations are associated with either
* iteration functions or class instantiation to provide clarity about why they're needed.
*
* @see {@link https://github.com/rollup/rollup/issues/3127}
*
* @module matching
*/
/**
* A RegExp for validating WebExtensions match patterns, using the same regular expressions for manifest
* validation as Firefox.
* @see {@link https://searchfox.org/mozilla-central/source/toolkit/components/extensions/schemas/manifest.json}
* @constant
* @type {RegExp}
* @private
*/
const matchPatternValidationRegExp = new RegExp("(^<all_urls>$)|(^(https?|wss?|file|ftp|\\*)://(\\*|\\*\\.[^*/]+|[^*/]+)/.*$)|(^file:///.*$)|(^resource://(\\*|\\*\\.[^*/]+|[^*/]+)/.*$|^about:|^data:)", "i");
/**
* A Set of URL schemes permitted in WebExtensions match patterns.
* @see {@link https://searchfox.org/mozilla-central/source/toolkit/components/extensions/MatchPattern.cpp}
* @constant {Set<string>}
* @private
*/
const permittedMatchPatternSchemes = new Set(["*", "http", "https", "ws", "wss", "file", "ftp", "data", "file"]);
/**
* A Set of URL schemes that require a host locator (i.e., are followed by `://` rather than `:`).
* @see {@link https://searchfox.org/mozilla-central/source/toolkit/components/extensions/MatchPattern.cpp}
* @constant {Set<string>}
* @private
*/
const hostLocatorMatchPatternSchemes = new Set(["*", "http", "https", "ws", "wss", "file", "ftp", "moz-extension", "chrome", "resource", "moz", "moz-icon", "moz-gio"]);
/**
* A regular expression string for the special "<all_urls>" wildcard match pattern, which matches
* "http", "https", "ws", "wss", "ftp", "file", and "data" schemes with any hostname and path.
* This regular expression includes a little sanity checking: hostnames are limited to alphanumerics,
* hyphen, period, and brackets at the start and end (for IPv6 literals).
* @constant {string}
* @private
*/
const allUrlsRegExpString = "^(?:(?:(?:https?)|(?:wss?)|(?:ftp))://[?[a-zA-Z0-9\\-\\.]+\\]?(?::[0-9]+)?(?:(?:)|(?:/.*)))|(?:file://[?[a-zA-Z0-9\\-\\.]*\\]?/.*)|(?:data:.*)$";
/**
* An internal object that represents a parsed match pattern.
* @typedef {Object} ParsedMatchPattern
* @property {boolean} allUrls - Whether the match pattern is the special all URLs match pattern.
* @property {string} scheme - The scheme for the match pattern. Must be one of: "http", "https", "ws",
* wss", "file", "ftp", "data", "file", or "*". The special wildcard value "*" matches "http", "https",
* "ws", or "wss".
* @property {boolean} matchSubdomains - If this scheme involves a hostname, and the hostname is not the
* special wildcard value, whether to match any subdomains of the domain.
* @property {boolean} host - If this scheme involves a hostname, either the hostname for the match pattern
* or the special wildcard value "*" that matches all domains.
* @property {string} path - The path for the match pattern. The special wildcard value "/*" matches all
* paths.
* @see {@link https://developer.mozilla.org/en-US/docs/Mozilla/Add-ons/WebExtensions/Match_patterns}
* @private
*/
/**
* Parses a match pattern string into an object that represents the match pattern. We use this internal,
* intermediate representation to enable constructing efficient matching objects. The parsing logic is
* nearly identical to the parsing logic in Firefox.
* @throws Throws an error if the match pattern is not valid.
* @param {string} matchPattern - The match pattern string.
* @returns {ParsedMatchPattern} - The parsed match pattern.
* @see {@link https://searchfox.org/mozilla-central/source/toolkit/components/extensions/MatchPattern.cpp}
* @private
*/
function parseMatchPattern(matchPattern) {
if(!matchPatternValidationRegExp.test(matchPattern))
throw new Error(`Invalid match pattern, failed validation: ${matchPattern}`);
const parsedMatchPattern = {
allUrls: false,
scheme: "",
matchSubdomains: false,
host: "",
path: ""
};
let tail = matchPattern.repeat(1);
if(matchPattern === "<all_urls>") {
parsedMatchPattern.allUrls = true;
return parsedMatchPattern;
}
// Parse the scheme
let index = matchPattern.indexOf(":");
if(index <= 0) {
throw new Error(`Invalid match pattern, missing : after scheme: ${matchPattern}`);
}
const scheme = matchPattern.substr(0, index);
if(!permittedMatchPatternSchemes.has(scheme)) {
throw new Error(`Invalid match pattern, unsupported scheme: ${matchPattern}`);
}
const hostLocatorScheme = hostLocatorMatchPatternSchemes.has(scheme);
parsedMatchPattern.scheme = scheme;
// Parse the host
let offset = index + 1;
tail = matchPattern.substr(offset);
if(hostLocatorScheme) {
if(!tail.startsWith("//")) {
throw new Error(`Invalid match pattern, missing // required by scheme: ${matchPattern}`);
}
offset += 2;
tail = matchPattern.substr(offset);
index = tail.indexOf("/");
if(index < 0) {
index = tail.length;
}
let host = tail.substring(0, index);
if((host === "") && (scheme !== "file")) {
throw new Error(`Invalid match pattern, missing host required by scheme: ${matchPattern}`);
}
offset += index;
tail = matchPattern.substring(offset);
if(host !== "*") {
if(host.startsWith("*.")) {
host = host.substring(2);
if(host === "*") {
throw new Error(`Invalid match pattern, subdomain wildcard with host wildcard: ${matchPattern}`);
}
parsedMatchPattern.matchSubdomains = true;
}
}
parsedMatchPattern.host = host;
}
// Parse the path
const path = tail;
if(path === "") {
throw new Error(`Invalid match pattern, missing path: ${matchPattern}`);
}
parsedMatchPattern.path = path;
return parsedMatchPattern;
}
/**
* Create a new MatchPatternSet for matching a set of match patterns.
* @param {string[]} matchPatterns - An array of match pattern strings.
* @returns {MatchPatternSet} - The new MatchPatternSet.
*/
export function createMatchPatternSet(matchPatterns) {
return /*@__PURE__*/new MatchPatternSet(matchPatterns);
}
/**
* Restore a MatchPatternSet that was serialized to an object with
* the `export` function.
* @param {Object} exportedMatchPatternSet - A serialized MatchPatternSet.
* @returns {MatchPatternSet} - The new MatchPatternSet.
* @example <caption>Example usage of import.</caption>
* const matchPatternSet1 = webScience.matching.createMatchPatternSet([ "*://example.com/*" ]);
* const exportedMatchPatternSet = matchPatternSet.export();
* const matchPatternSet2 = webScience.matching.importMatchPatternSet(exportedMatchPatternSet);
*/
export function importMatchPatternSet(exportedMatchPatternSet) {
const matchPatternSet = new MatchPatternSet([]);
matchPatternSet.import(exportedMatchPatternSet);
return matchPatternSet;
}
/**
* An optimized object for matching against match patterns. A `MatchPatternSet` can provide
* a significant performance improvement in comparison to `RegExp`s, in some instances
* greater than 100x. A `MatchPatternSet` can also be exported to an object that uses only
* built-in types, so it can be persisted or passed to content scripts in extension storage.
*
* There are several key optimizations in `MatchPatternSet`:
* * URLs are parsed with the `URL` class, which has native implementation.
* * Match patterns are indexed by hostname in a hash map. Lookups are much faster than
* iteratively advancing and backtracking through a complex regular expression, which
* is how domain matching currently occurs with the `Irregexp` regular expression
* engine in Firefox and Chrome.
* * Match patterns with identical scheme, subdomain matching, and host (i.e., that
* differ only in path) are combined.
* * The only remaining use of regular expressions is in path matching, where expressions
* can be (relatively) uncomplicated.
*
* Future performance improvements could include:
* * Replacing the path matching implementation to eliminate regular expressions entirely.
* * Replacing the match pattern index, such as by implementing a trie.
* @hideconstructor
*/
class MatchPatternSet {
/**
* Creates a match pattern set from an array of match patterns.
* @param {string[]} matchPatterns - The match patterns for the set.
*/
constructor(matchPatterns) {
// Defining the special sets of `<all_url>` and wildcard schemes inside the class so
// keeping content scripts in sync with this implementation will be easier
this.allUrls = false;
this.allUrlsSchemeSet = new Set(["http", "https", "ws", "wss", "ftp", "file", "data"]);
this.wildcardSchemeSet = new Set(["http", "https", "ws", "wss"]);
this.patternsByHost = { };
for(const matchPattern of matchPatterns) {
const parsedMatchPattern = parseMatchPattern(matchPattern);
if(parsedMatchPattern.allUrls)
this.allUrls = true;
else {
let hostPatterns = this.patternsByHost[parsedMatchPattern.host];
if(hostPatterns === undefined) {
hostPatterns = [ ];
this.patternsByHost[parsedMatchPattern.host] = hostPatterns;
}
let addedToHostPattern = false;
for(const hostPattern of hostPatterns) {
if((hostPattern.scheme === parsedMatchPattern.scheme) && (hostPattern.matchSubdomains === parsedMatchPattern.matchSubdomains)) {
addedToHostPattern = true;
hostPattern.paths.push(parsedMatchPattern.path);
break;
}
}
if(!addedToHostPattern)
hostPatterns.push({
scheme: parsedMatchPattern.scheme,
matchSubdomains: parsedMatchPattern.matchSubdomains,
host: parsedMatchPattern.host,
paths: [ parsedMatchPattern.path ]
});
}
}
for(const host of Object.keys(this.patternsByHost)) {
const hostPatterns = this.patternsByHost[host];
for(const hostPattern of hostPatterns) {
let wildcardPath = false;
const pathRegExps = hostPattern.paths.map(path => {
if(path === "/")
return "/";
else if(path === "/*") {
wildcardPath = true;
return "/.*";
}
else {
// Including regular expression special character escaping in
// the constructor so keeping content scripts in sync with this
// implementation will be easier
const escapedPathArray = [ ];
for(const c of path) {
if(c === "*")
escapedPathArray.push(".*");
else
escapedPathArray.push(c.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
}
return escapedPathArray.join("");
}
});
if(wildcardPath) {
hostPattern.wildcardPath = true;
}
else {
hostPattern.wildcardPath = false;
hostPattern.pathRegExp = new RegExp("^(?:" + pathRegExps.join("|") + ")$");
}
delete hostPattern.paths;
}
}
}
/**
* Compares a URL string to the match patterns in the set.
* @param {string} url - The URL string to compare.
* @returns {boolean} Whether the URL string matches a pattern in the set.
*/
matches(url) {
let parsedUrl;
try {
parsedUrl = new URL(url);
} catch {
// If the target isn't a true URL, it certainly doesn't match
return false;
}
// Remove the trailing : from the parsed protocol
const scheme = parsedUrl.protocol.substring(0, parsedUrl.protocol.length - 1);
const host = parsedUrl.hostname;
const path = parsedUrl.pathname;
// Check the special `<all_urls>` match pattern
if(this.allUrls && this.allUrlsSchemeSet.has(scheme)) {
return true;
}
// Identify candidate match patterns
let candidatePatterns = [ ];
// Check each component suffix of the hostname for candidate match patterns
const hostComponents = parsedUrl.hostname.split(".");
let hostSuffix = "";
for(let i = hostComponents.length - 1; i >= 0; i--) {
hostSuffix = hostComponents[i] + (i < hostComponents.length - 1 ? "." : "") + hostSuffix;
const hostSuffixPatterns = this.patternsByHost[hostSuffix];
if(hostSuffixPatterns !== undefined) {
candidatePatterns = candidatePatterns.concat(hostSuffixPatterns);
}
}
// Add match patterns with a wildcard host to the set of candidates
const hostWildcardPatterns = this.patternsByHost["*"];
if(hostWildcardPatterns !== undefined)
candidatePatterns = candidatePatterns.concat(hostWildcardPatterns);
// Check the scheme, then the host, then the path for a match
for(const candidatePattern of candidatePatterns) {
if((candidatePattern.scheme === scheme) ||
((candidatePattern.scheme === "*") && this.wildcardSchemeSet.has(scheme))) {
if(candidatePattern.matchSubdomains ||
(candidatePattern.host === "*") ||
(candidatePattern.host === host)) {
if(candidatePattern.wildcardPath ||
candidatePattern.pathRegExp.test(path)) {
return true;
}
}
}
}
return false;
}
/**
* Serializes the internals of the match pattern set to a serializable object, for purposes
* of saving to extension local storage or messaging across contexts.
* @returns {Object} - An opaque serializable object representing the match pattern set
* internals.
*/
export() {
return {
allUrls: this.allUrls,
patternsByHost: this.patternsByHost
};
}
/**
* Imports the match pattern set from an opaque object previously generated by `export`.
* @param {exportedInternals} - The previously exported internals for the match pattern set.
* @private
*/
import(exportedInternals) {
this.allUrls = exportedInternals.allUrls;
this.patternsByHost = exportedInternals.patternsByHost;
}
}
/**
* Escapes regular expression special characters in a string.
* @param {string} string - The input string.
* @returns {string} The input string with regular expression special characters escaped.
* @see {@link https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions}
*/
export function escapeRegExpString(string) {
return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
/**
* Converts a parsed match pattern into a regular expression string.
* @param {ParsedMatchPattern} parsedMatchPattern - The parsed match pattern object.
* @returns {string} - The regular expression string.
* @private
*/
function parsedMatchPatternToRegExpString(parsedMatchPattern) {
if(parsedMatchPattern.allUrls)
return allUrlsRegExpString.repeat(1);
// Scheme
const hostLocatorScheme = hostLocatorMatchPatternSchemes.has(parsedMatchPattern.scheme);
let schemeRegExpString = parsedMatchPattern.scheme;
// The special "*" wildcard scheme should match the "http", "https", "ws", and "wss" schemes
if(parsedMatchPattern.scheme === "*")
schemeRegExpString = "(?:https?|wss?)";
// Host
let hostRegExpString = "";
if(hostLocatorScheme) {
// The special "*" wildcard host should match any valid hostname
// This isn't a robust check, just limiting to permitted characters and IPv6 literal brackets
if(parsedMatchPattern.host === "*")
hostRegExpString = "\\[?[a-zA-Z0-9\\-\\.]+\\]?";
else {
hostRegExpString = escapeRegExpString(parsedMatchPattern.host);
// The check for subdomains also isn't robust, limiting to permitted characters, no repeated
// periods, and ending in a period
if(parsedMatchPattern.matchSubdomains)
hostRegExpString = "(?:[a-zA-Z0-9\\-]+\\.)*" + hostRegExpString;
// If this is a scheme that requires "://" and isn't "file", there might be a port specified
if(parsedMatchPattern.scheme !== "file")
hostRegExpString = hostRegExpString + "(?::[0-9]+)?";
}
}
// Path
let pathRegExpString = "";
// If the path is / or /*, allow a URL with no path specified to match
if(parsedMatchPattern.path === "/" )
pathRegExpString = "/?";
else if(parsedMatchPattern.path === "/*")
pathRegExpString = "(?:/.*)?";
else {
const escapedPathArray = [ ];
for(const c of parsedMatchPattern.path) {
if(c === "*")
escapedPathArray.push(".*");
else
escapedPathArray.push(escapeRegExpString(c))
}
pathRegExpString = escapedPathArray.join("");
}
// Allow arbitrary parameters or an arbitrary fragment identifier
pathRegExpString += "(?:\\?.*)?(?:#.*)?";
return "^" + schemeRegExpString + (hostLocatorScheme ? "://" : ":") + hostRegExpString + pathRegExpString + "$";
}
/**
* Converts a match pattern into a regular expression string.
* @throws Throws an error if the match pattern is not valid.
* @param {string} matchPattern - The match pattern.
* @returns {string} The regular expression.
* @private
*/
function matchPatternToRegExpString(matchPattern) {
return parsedMatchPatternToRegExpString(parseMatchPattern(matchPattern));
}
/**
* Combines an array of regular expression strings into one regular expression string, encapsulated as
* a non-capturing group, where each input string is an alternative.
* @param {string[]} regExpStrings - An array of regular expression strings.
* @private
*/
function combineRegExpStrings(regExpStrings) {
return "(?:" + /*@__PURE__*/(/*@__PURE__*/regExpStrings.map((regExpString) => { return regExpStrings.length > 1 ? `(?:${regExpString})` : regExpString; })).join("|") + ")";
}
/**
* Converts an array of match patterns into a regular expression string.
* @throws Throws an error if the match pattern is not valid.
* @param {string[]} matchPatterns - The match patterns.
* @returns {string} The regular expression string.
*/
export function matchPatternsToRegExpString(matchPatterns) {
return combineRegExpStrings(/*@__PURE__*/matchPatterns.map(matchPattern => { return matchPatternToRegExpString(matchPattern); }));
}
/**
* Converts an array of match patterns into a RegExp object.
* @throws Throws an error if the match pattern is not valid.
* @param {string[]} matchPatterns - The match patterns.
* @returns {RegExp} The regular expression RegExp object.
*/
export function matchPatternsToRegExp(matchPatterns) {
// Set the entire regular expression to case insensitive, because JavaScript regular expressions
// do not (currently) support partial case insensitivity
return new RegExp(matchPatternsToRegExpString(matchPatterns), "i");
}
/**
* Generates a set of match patterns for a set of domains. The match patterns will use the special
* "\*" wildcard scheme (matching "http", "https", "ws", and "wss") and the special "/*" wildcard
* path (matching any path).
* @param {string[]} domains - The set of domains to match against.
* @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
* @returns {string[]} Match patterns for the domains in the set.
*/
export function domainsToMatchPatterns(domains, matchSubdomains = true) {
return /*@__PURE__*/domains.map(domain => { return `*://${matchSubdomains ? "*." : ""}${domain}/*` });
}
/**
* Generates a regular expression string for a set of domains. The regular expression is based on
* match patterns generated by `domainsToMatchPatterns` and has the same matching properties.
* @param {string[]} domains - The set of domains to match against.
* @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
* @returns {string} A regular expression string for matching a URL against the set of domains.
*/
export function domainsToRegExpString(domains, matchSubdomains = true) {
return matchPatternsToRegExpString(domainsToMatchPatterns(domains, matchSubdomains));
}
/**
* Generates a RegExp object for matching a URL against a set of domains. The regular expression
* is based on match patterns generated by `domainsToMatchPatterns` and has the same matching
* properties.
* @param {string[]} domains - The set of domains to match against.
* @param {boolean} [matchSubdomains=true] - Whether to match subdomains of domains in the set.
* @returns {RegExp} A RegExp object for matching a URL against the set of domains.
*/
export function domainsToRegExp(domains, matchSubdomains = true) {
// Set the entire regular expression to case insensitive, because JavaScript regular expressions
// do not (currently) support partial case insensitivity
return new RegExp(domainsToRegExpString(domains, matchSubdomains), "i");
}
/**
* Normalizes a URL string for subsequent comparison. Normalization includes the following steps:
* * Parse the string as a `URL` object, which will (among other normalization) lowercase the
* scheme and hostname.
* * Remove the port number, if any. For example, https://www.mozilla.org:443/ becomes https://www.mozilla.org/.
* * Remove query parameters, if any. For example, https://www.mozilla.org/?foo becomes https://www.mozilla.org/.
* * Remove the fragment identifier, if any. For example, https://www.mozilla.org/#foo becomes https://www.mozilla.org/.
* @param {string} url - The URL string to normalize.
* @return {string} The normalized URL string.
* @throws Throws an error if the match pattern is not valid, absolute URL.
*/
export function normalizeUrl(url) {
const urlObj = new URL(url);
urlObj.port = "";
urlObj.search = "";
urlObj.hash = "";
return urlObj.href;
}