import * as regex from './regex.js';
import { inherit } from './utils.js';
import * as EXT from "./compiler_extensions.js";
import { beforeMatchExt } from "./exts/before_match.js";
import { compileKeywords } from "./compile_keywords.js";
import { MultiClass } from "./ext/multi_class.js";

/**
@typedef {import('highlight.js').Mode} Mode
@typedef {import('highlight.js').CompiledMode} CompiledMode
@typedef {import('highlight.js').Language} Language
@typedef {import('highlight.js').HLJSPlugin} HLJSPlugin
@typedef {import('highlight.js').CompiledLanguage} CompiledLanguage
*/

// compilation

/**
 * Compiles a language definition result
 *
 * Given the raw result of a language definition (Language), compiles this so
 * that it is ready for highlighting code.
 *
 * Note: compilation mutates the passed `language` object in place
 * (`compilerExtensions` and `classNameAliases` are assigned, and every mode
 * reachable through `contains` / `starts` is compiled in place).
 *
 * @param {Language} language
 * @returns {CompiledLanguage}
 * @throws {Error} if the top-level `contains` array includes `'self'`
 */
export function compileLanguage(language) {
  /**
   * Builds a regex with the case sensitivity of the current language
   *
   * The `m` flag is always set; `i` is added when the language is
   * `case_insensitive`, `u` when it sets `unicodeRegex`, and `g` when
   * `global` is truthy.
   *
   * @param {RegExp | string} value
   * @param {boolean} [global]
   * @returns {RegExp}
   */
  function langRe(value, global) {
    return new RegExp(
      regex.source(value),
      'm'
      + (language.case_insensitive ? 'i' : '')
      + (language.unicodeRegex ? 'u' : '')
      + (global ? 'g' : '')
    );
  }

  /**
    Stores multiple regular expressions and allows you to quickly search for
    them all in a string simultaneously - returning the first match.  It does
    this by creating a huge (a|b|c) regex - each individual item wrapped with ()
    and joined by `|` - using match groups to track position.  When a match is
    found checking which position in the array has content allows us to figure
    out which of the original regexes / match groups triggered the match.

    The match object itself (the result of `Regex.exec`) is returned but also
    enhanced by merging in any meta-data that was registered with the regex.
    This is how we keep track of which mode matched, and what type of rule
    (`illegal`, `begin`, end, etc).
  */
  class MultiRegex {
    constructor() {
      // maps the capture-group index at which a rule starts -> that rule's opts
      this.matchIndexes = {};
      // list of [opts, re] pairs, in the order the rules were added
      // @ts-ignore
      this.regexes = [];
      // capture-group index the next added rule will occupy (group 0 is the whole match)
      this.matchAt = 1;
      // ordinal handed out to the next added rule
      this.position = 0;
    }

    /**
     * Registers a rule with this matcher.
     *
     * NOTE: this writes `position` onto the passed `opts` object itself, so
     * an `opts` object shared between matchers carries the position assigned
     * by the most recent `addRule` call.
     */
    // @ts-ignore
    addRule(re, opts) {
      opts.position = this.position++;
      // @ts-ignore
      this.matchIndexes[this.matchAt] = opts;
      this.regexes.push([opts, re]);
      // skip past this rule's own wrapping group plus any groups inside it
      this.matchAt += regex.countMatchGroups(re) + 1;
    }

    /**
     * Builds the combined global regex (`matcherRe`) from all added rules and
     * resets `lastIndex` to 0.
     */
    compile() {
      if (this.regexes.length === 0) {
        // avoids the need to check length every time exec is called
        // @ts-ignore
        this.exec = () => null;
      }
      const terminators = this.regexes.map(el => el[1]);
      this.matcherRe = langRe(regex._rewriteBackreferences(terminators, { joinWith: '|' }), true);
      this.lastIndex = 0;
    }

    /**
     * Runs the combined regex against `s` starting at `this.lastIndex`.
     *
     * @param {string} s
     * @returns the match array (trimmed to start at the matching rule's group
     *   and merged with that rule's opts), or `null` if nothing matched
     */
    exec(s) {
      this.matcherRe.lastIndex = this.lastIndex;
      const match = this.matcherRe.exec(s);
      if (!match) { return null; }

      // the first defined capture group (after group 0) identifies the rule
      // eslint-disable-next-line no-undefined
      const i = match.findIndex((el, i) => i > 0 && el !== undefined);
      // @ts-ignore
      const matchData = this.matchIndexes[i];
      // trim off any earlier non-relevant match groups (ie, the other regex
      // match groups that make up the multi-matcher)
      match.splice(0, i);

      return Object.assign(match, matchData);
    }
  }

  /*
    Created to solve the key deficiency with MultiRegex - there is no way to
    test for multiple matches at a single location.  Why would we need to do
    that?  In the future a more dynamic engine will allow certain matches to be
    ignored.  An example: if we matched say the 3rd regex in a large group but
    decided to ignore it - we'd need to start testing again at the 4th
    regex... but MultiRegex itself gives us no real way to do that.

    So what this class does is create MultiRegexes on the fly for whatever
    search position they are needed.

    NOTE: These additional MultiRegex objects are created dynamically.  For most
    grammars most of the time we will never actually need anything more than the
    first MultiRegex - so this shouldn't have too much overhead.

    Say this is our search group, and we match regex3, but wish to ignore it.

      regex1 | regex2 | regex3 | regex4 | regex5    ' ie, startAt = 0

    What we need is a new MultiRegex that only includes the remaining
    possibilities:

      regex4 | regex5                               ' ie, startAt = 3

    This class wraps all that complexity up in a simple API... `startAt` decides
    where in the array of expressions to start doing the matching. It
    auto-increments, so if a match is found at position 2, then startAt will be
    set to 3.  If the end is reached startAt will return to 0.

    MOST of the time the parser will be setting startAt manually to 0.
  */
  class ResumableMultiRegex {
    constructor() {
      // list of [re, opts] pairs, in the order the rules were added
      // @ts-ignore
      this.rules = [];
      // cache of MultiRegex instances, keyed by the rule index they start at
      // @ts-ignore
      this.multiRegexes = [];
      // number of rules added with type "begin"
      this.count = 0;

      // position in the string at which the next exec() starts scanning
      this.lastIndex = 0;
      // index of the first rule to consider on the next exec()
      this.regexIndex = 0;
    }

    /**
     * Returns the (lazily built and cached) MultiRegex covering the rules
     * from `index` onward.
     */
    // @ts-ignore
    getMatcher(index) {
      if (this.multiRegexes[index]) return this.multiRegexes[index];

      const matcher = new MultiRegex();
      this.rules.slice(index).forEach(([re, opts]) => matcher.addRule(re, opts));
      matcher.compile();
      this.multiRegexes[index] = matcher;
      return matcher;
    }

    /** @returns {boolean} true when the next exec() will skip one or more leading rules */
    resumingScanAtSamePosition() {
      return this.regexIndex !== 0;
    }

    /** Resets the matcher so the next exec() considers every rule again. */
    considerAll() {
      this.regexIndex = 0;
    }

    /** Registers a rule; rules with `opts.type === "begin"` are also counted. */
    // @ts-ignore
    addRule(re, opts) {
      this.rules.push([re, opts]);
      if (opts.type === "begin") this.count++;
    }

    /**
     * Finds the next match in `s`, starting at `this.lastIndex` and
     * considering the rules from `this.regexIndex` onward.
     *
     * @param {string} s
     * @returns the enhanced match (see `MultiRegex#exec`) or `null`
     */
    exec(s) {
      const m = this.getMatcher(this.regexIndex);
      m.lastIndex = this.lastIndex;
      let result = m.exec(s);

      // The following is because we have no easy way to say "resume scanning at the
      // existing position but also skip the current rule ONLY". What happens is
      // all prior rules are also skipped which can result in matching the wrong
      // thing. Example of matching "booger":

      // our matcher is [string, "booger", number]
      //
      // ....booger....

      // if "booger" is ignored then we'd really need a regex to scan from the
      // SAME position for only: [string, number] but ignoring "booger" (if it
      // was the first match), a simple resume would scan ahead who knows how
      // far looking only for "number", ignoring potential string matches (or
      // future "booger" matches that might be valid.)

      // So what we do: We execute two matchers, one resuming at the same
      // position, but the second full matcher starting at the position after:

      //     /--- resume first regex match here (for [number])
      //     |/---- full match here for [string, "booger", number]
      //     vv
      //     ....booger....

      // Whichever results in a match first is then used. So this 3-4 step
      // process essentially allows us to say "match at this position, excluding
      // a prior rule that was ignored".
      //
      // 1. Match "booger" first, ignore. Also proves that [string] does not match.
      // 2. Resume matching for [number]
      // 3. Match at index + 1 for [string, "booger", number]
      // 4. If #2 and #3 result in matches, which came first?
      if (this.resumingScanAtSamePosition()) {
        if (result && result.index === this.lastIndex) {
          // result is position +0 and therefore a valid
          // "resume" match so result stays result
        } else { // use the second matcher result
          const m2 = this.getMatcher(0);
          m2.lastIndex = this.lastIndex + 1;
          result = m2.exec(s);
        }
      }

      if (result) {
        // next time, start with the rule after the one that just matched
        this.regexIndex += result.position + 1;
        if (this.regexIndex === this.count) {
          // wrap-around to considering all matches again
          this.considerAll();
        }
      }

      return result;
    }
  }

  /**
   * Given a mode, builds a huge ResumableMultiRegex that can be used to walk
   * the content and find matches.
   *
   * Rules are added in this order: the `begin` of every mode in
   * `mode.contains`, then `mode.terminatorEnd` (if set), then `mode.illegal`
   * (if set).
   *
   * @param {CompiledMode} mode
   * @returns {ResumableMultiRegex}
   */
  function buildModeRegex(mode) {
    const mm = new ResumableMultiRegex();

    mode.contains.forEach(term => mm.addRule(term.begin, { rule: term, type: "begin" }));

    if (mode.terminatorEnd) {
      mm.addRule(mode.terminatorEnd, { type: "end" });
    }
    if (mode.illegal) {
      mm.addRule(mode.illegal, { type: "illegal" });
    }

    return mm;
  }

  /** skip vs abort vs ignore
   *
   * @skip   - The mode is still entered and exited normally (and contains rules apply),
   *           but all content is held and added to the parent buffer rather than being
   *           output when the mode ends.  Mostly used with `sublanguage` to build up
   *           a single large buffer that can be parsed by sublanguage.
   *
   *             - The mode begins and ends normally.
   *             - Content matched is added to the parent mode buffer.
   *             - The parser cursor is moved forward normally.
   *
   * @abort  - A hack placeholder until we have ignore.  Aborts the mode (as if it
   *           never matched) but DOES NOT continue to match subsequent `contains`
   *           modes.  Abort is bad/suboptimal because it can result in modes
   *           farther down not getting applied because an earlier rule eats the
   *           content but then aborts.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is added to the mode buffer.
   *             - The parser cursor is moved forward accordingly.
   *
   * @ignore - Ignores the mode (as if it never matched) and continues to match any
   *           subsequent `contains` modes.  Ignore isn't technically possible with
   *           the current parser implementation.
   *
   *             - The mode does not begin.
   *             - Content matched by `begin` is ignored.
   *             - The parser cursor is not moved forward.
   */

  /**
   * Compiles an individual mode
   *
   * This can raise an error if the mode contains certain detectable known logic
   * issues.
   *
   * The mode is compiled in place (the returned object is the same object
   * that was passed in) and is marked with `isCompiled` so that it is only
   * compiled once.
   *
   * @param {Mode} mode
   * @param {CompiledMode | null} [parent]
   * @returns {CompiledMode | never}
   */
  function compileMode(mode, parent) {
    const cmode = /** @type CompiledMode */ (mode);
    if (mode.isCompiled) return cmode;

    [
      EXT.scopeClassName,
      // do this early so compiler extensions generally don't have to worry about
      // the distinction between match/begin
      EXT.compileMatch,
      MultiClass,
      beforeMatchExt
    ].forEach(ext => ext(mode, parent));

    language.compilerExtensions.forEach(ext => ext(mode, parent));

    // __beforeBegin is considered private API, internal use only
    mode.__beforeBegin = null;

    [
      EXT.beginKeywords,
      // do this later so compiler extensions that come earlier have access to the
      // raw array if they wanted to perhaps manipulate it, etc.
      EXT.compileIllegal,
      // default to 1 relevance if not specified
      EXT.compileRelevance
    ].forEach(ext => ext(mode, parent));

    // set before descending into children so a mode that (indirectly)
    // contains itself is not compiled again
    mode.isCompiled = true;

    let keywordPattern = null;
    if (typeof mode.keywords === "object" && mode.keywords.$pattern) {
      // we need a copy because keywords might be compiled multiple times
      // so we can't go deleting $pattern from the original on the first
      // pass
      mode.keywords = Object.assign({}, mode.keywords);
      keywordPattern = mode.keywords.$pattern;
      delete mode.keywords.$pattern;
    }
    keywordPattern = keywordPattern || /\w+/;

    if (mode.keywords) {
      mode.keywords = compileKeywords(mode.keywords, language.case_insensitive);
    }

    cmode.keywordPatternRe = langRe(keywordPattern, true);

    // begin/end handling only applies to nested modes; the top-level
    // language mode is compiled without a parent
    if (parent) {
      if (!mode.begin) mode.begin = /\B|\b/;
      cmode.beginRe = langRe(cmode.begin);
      if (!mode.end && !mode.endsWithParent) mode.end = /\B|\b/;
      if (mode.end) cmode.endRe = langRe(cmode.end);
      cmode.terminatorEnd = regex.source(cmode.end) || '';
      if (mode.endsWithParent && parent.terminatorEnd) {
        // the parent's terminator also ends this mode
        cmode.terminatorEnd += (mode.end ? '|' : '') + parent.terminatorEnd;
      }
    }
    if (mode.illegal) cmode.illegalRe = langRe(/** @type {RegExp | string} */ (mode.illegal));
    if (!mode.contains) mode.contains = [];

    // replace 'self' with the mode itself and flatten any expanded variants
    mode.contains = [].concat(...mode.contains.map(function(c) {
      return expandOrCloneMode(c === 'self' ? mode : c);
    }));
    mode.contains.forEach(function(c) { compileMode(/** @type Mode */ (c), cmode); });

    if (mode.starts) {
      // `starts` is compiled against this mode's parent, not this mode
      compileMode(mode.starts, parent);
    }

    cmode.matcher = buildModeRegex(cmode);
    return cmode;
  }

  if (!language.compilerExtensions) language.compilerExtensions = [];

  // self is not valid at the top-level
  if (language.contains && language.contains.includes('self')) {
    throw new Error("ERR: contains `self` is not supported at the top-level of a language. See documentation.");
  }

  // we need a null object, which inherit will guarantee
  language.classNameAliases = inherit(language.classNameAliases || {});

  return compileMode(/** @type Mode */ (language));
}

/**
 * Determines if a mode has a dependency on its parent or not
 *
 * If a mode does have a parent dependency then often we need to clone it if
 * it's used in multiple places so that each copy points to the correct parent,
 * whereas modes without a parent can often safely be re-used at the bottom of
 * a mode chain.
 *
 * A mode depends on its parent when it sets `endsWithParent`, or when any
 * mode along its `starts` chain does.
 *
 * @param {Mode | null} mode
 * @returns {boolean} - is there a dependency on the parent?
 * */
function dependencyOnParent(mode) {
  if (!mode) return false;

  return mode.endsWithParent || dependencyOnParent(mode.starts);
}

/**
 * Expands a mode or clones it if necessary
 *
 * This is necessary for modes with parental dependencies (see notes on
 * `dependencyOnParent`) and for nodes that have `variants` - which must then be
 * exploded into their own individual modes at compile time.
 *
 * @param {Mode} mode
 * @returns {Mode | Mode[]}
 * */
function expandOrCloneMode(mode) {
  if (mode.variants && !mode.cachedVariants) {
    // build the per-variant modes once and cache them on the mode
    mode.cachedVariants = mode.variants.map(function(variant) {
      return inherit(mode, { variants: null }, variant);
    });
  }

  // EXPAND
  // if we have variants then essentially "replace" the mode with the variants
  // this happens in compileMode, where this function is called from
  if (mode.cachedVariants) {
    return mode.cachedVariants;
  }

  // CLONE
  // if we have dependencies on parents then we need a unique
  // instance of ourselves, so we can be reused with many
  // different parents without issue
  if (dependencyOnParent(mode)) {
    return inherit(mode, { starts: mode.starts ? inherit(mode.starts) : null });
  }

  // a frozen mode cannot be compiled in place, so hand back a copy
  if (Object.isFrozen(mode)) {
    return inherit(mode);
  }

  // no special dependency issues, just return ourselves
  return mode;
}