Skip to content

Commit 2be5b2d

Browse files
authored
feat: introduce experimental JavaScript RegExp Engine (#761)
1 parent 523f5fd commit 2be5b2d

37 files changed

+4880
-985
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,4 @@ packages/shiki/src/assets/themes
1515
packages/shiki/src/assets/*.json
1616
cache
1717
.eslintcache
18+
report-engine-js-compat.json

bench/engines.bench.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import fs from 'node:fs/promises'
2+
import { bench, describe } from 'vitest'
3+
import type { BundledLanguage } from 'shiki'
4+
import { createHighlighter, createJavaScriptRegexEngine, createWasmOnigEngine } from 'shiki'
5+
import type { ReportItem } from '../scripts/report-engine-js-compat'
6+
7+
/**
 * Benchmarks `codeToTokensBase` with the JavaScript RegExp engine against the
 * WASM Oniguruma engine, over every language whose compat-report entry has
 * `highlightMatch === true`.
 */
describe('engines', async () => {
  const theme = 'vitesse-dark'

  const js = createJavaScriptRegexEngine()
  const wasm = await createWasmOnigEngine(() => import('shiki/wasm'))

  // Run `npx jiti scripts/report-engine-js-compat.ts` to generate the report first
  // NOTE(review): both relative paths below are resolved by `fs.readFile` against the
  // process working directory, not against this file - confirm the intended cwd.
  const reportJson = await fs.readFile('../scripts/report-engine-js-compat.json', 'utf-8')
  const report = JSON.parse(reportJson) as ReportItem[]
  const langs = report
    .filter(item => item.highlightMatch === true)
    .map(item => item.lang) as BundledLanguage[]
  const samples = await Promise.all(
    langs.map(lang => fs.readFile(`../tm-grammars-themes/samples/${lang}.sample`, 'utf-8')),
  )

  // Pair every language with its sample once, so the timed loops below measure
  // tokenization only and not a per-iteration `indexOf` lookup.
  const cases = langs.map((lang, i) => ({ lang, code: samples[i] }))

  const shikiJs = await createHighlighter({
    langs,
    themes: [theme],
    engine: js,
  })

  const shikiWasm = await createHighlighter({
    langs,
    themes: [theme],
    engine: wasm,
  })

  bench('js', () => {
    for (const { lang, code } of cases) {
      shikiJs.codeToTokensBase(code, { lang, theme })
    }
  }, { warmupIterations: 10, iterations: 30 })

  bench('wasm', () => {
    for (const { lang, code } of cases) {
      shikiWasm.codeToTokensBase(code, { lang, theme })
    }
  }, { warmupIterations: 10, iterations: 30 })
})

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"type": "module",
33
"version": "1.14.1",
44
"private": true,
5-
"packageManager": "pnpm@9.8.0",
5+
"packageManager": "pnpm@9.9.0",
66
"scripts": {
77
"lint": "eslint . --cache",
88
"release": "bumpp && pnpm -r publish",
@@ -46,6 +46,7 @@
4646
"mdast-util-gfm": "catalog:",
4747
"mdast-util-to-hast": "catalog:",
4848
"ofetch": "catalog:",
49+
"picocolors": "catalog:",
4950
"pnpm": "catalog:",
5051
"prettier": "catalog:",
5152
"rimraf": "catalog:",

packages/core/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
},
7171
"devDependencies": {
7272
"hast-util-to-html": "catalog:",
73+
"oniguruma-to-js": "catalog:",
7374
"vscode-oniguruma": "catalog:"
7475
}
7576
}
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import { onigurumaToRegexp } from 'oniguruma-to-js'
2+
import type { PatternScanner, RegexEngine, RegexEngineString } from '../textmate'
3+
import type { JavaScriptRegexEngineOptions } from '../types/engines'
4+
5+
/** Offset reported as `start`/`end` for a capture group that has no entry in `match.indices`. */
const MAX = 4294967295

/**
 * Shape a native match into the scanner result format.
 *
 * @param index - position of the matching pattern in the scanner's pattern list
 * @param match - result of `RegExp.prototype.exec`; must carry `indices` (the `d` flag)
 */
function toScannerMatch(index: number, match: RegExpExecArray) {
  return {
    index,
    captureIndices: match.indices!.map((range) => {
      // Groups without an index pair are reported as an empty range at MAX.
      if (range == null)
        return { start: MAX, length: 0, end: MAX }
      return { start: range[0], length: range[1] - range[0], end: range[1] }
    }),
  }
}

/**
 * Pattern scanner backed by native JavaScript `RegExp` objects.
 *
 * Each pattern is converted with `onigurumaToRegexp` using the `dg` flags. The
 * outcome of every conversion (the `RegExp`, or the error it threw) is stored
 * in `cache` under the pattern string and reused on later constructions.
 */
export class JavaScriptScanner implements PatternScanner {
  /** One entry per pattern, in order; `null` marks a pattern skipped in forgiving mode. */
  regexps: (RegExp | null)[]

  /**
   * @param patterns - pattern sources to compile
   * @param cache - conversion results keyed by pattern source
   * @param forgiving - when `true`, patterns that fail to convert become `null`
   *   instead of throwing, and errors raised while matching are skipped
   * @throws the conversion error (fresh or cached) when `forgiving` is `false`
   */
  constructor(
    public patterns: string[],
    public cache: Map<string, RegExp | Error>,
    public forgiving: boolean,
  ) {
    this.regexps = patterns.map((p) => {
      const cached = cache?.get(p)
      if (cached) {
        if (cached instanceof RegExp)
          return cached
        // A cached failure: skip or rethrow according to the mode.
        if (forgiving)
          return null
        throw cached
      }
      try {
        const regex = onigurumaToRegexp(
          p
            // YAML specific handling; TODO: move to tm-grammars
            .replaceAll('[^\\s[-?:,\\[\\]{}#&*!|>\'"%@`]]', '[^\\s\\-?:,\\[\\]{}#&*!|>\'"%@`]'),
          { flags: 'dg' },
        )
        cache?.set(p, regex)
        return regex
      }
      catch (e) {
        // Remember the failure so the same pattern is not converted again.
        cache?.set(p, e as Error)
        if (forgiving)
          return null
        throw e
      }
    })
  }

  /**
   * Find the match closest to `startPosition` among all patterns.
   *
   * A match beginning exactly at `startPosition` is returned as soon as it is
   * found. Otherwise the match with the smallest start index wins, and ties go
   * to the pattern that comes first in the list.
   *
   * @param string - text to search, either a raw string or an object exposing `content`
   * @param startPosition - index assigned to each regexp's `lastIndex` before `exec`
   * @returns the pattern index and capture ranges, or `null` when nothing matches
   */
  findNextMatchSync(string: string | RegexEngineString, startPosition: number) {
    const str = typeof string === 'string'
      ? string
      : string.content

    // Best non-immediate candidate so far. The strict `<` below keeps the
    // earliest pattern on ties.
    let bestIndex = -1
    let bestMatch: RegExpExecArray | null = null

    for (let i = 0; i < this.regexps.length; i++) {
      const regexp = this.regexps[i]
      if (!regexp)
        continue
      try {
        regexp.lastIndex = startPosition
        const match = regexp.exec(str)
        if (!match)
          continue
        // If the match is at the start position, return it immediately
        if (match.index === startPosition)
          return toScannerMatch(i, match)
        if (bestMatch === null || match.index < bestMatch.index) {
          bestIndex = i
          bestMatch = match
        }
      }
      catch (e) {
        if (this.forgiving)
          continue
        throw e
      }
    }

    return bestMatch === null ? null : toScannerMatch(bestIndex, bestMatch)
  }
}
107+
108+
/**
 * Use the modern JavaScript RegExp engine to implement the OnigScanner.
 *
 * As Oniguruma regex is more powerful than JavaScript regex, some patterns may not be supported.
 * Errors will be thrown when parsing TextMate grammars with unsupported patterns.
 * Set `forgiving` to `true` to ignore these errors and skip the unsupported patterns.
 *
 * @param options - engine options
 *   - `forgiving` defaults to `false`
 *   - `cache` defaults to a new `Map`; the same cache instance is handed to every
 *     scanner created by the returned engine
 * @returns an engine whose `createScanner` builds a `JavaScriptScanner` and whose
 *   `createString` wraps the input as `{ content }`
 *
 * @experimental
 */
export function createJavaScriptRegexEngine(options: JavaScriptRegexEngineOptions = {}): RegexEngine {
  const {
    forgiving = false,
    cache = new Map<string, RegExp | Error>(),
  } = options

  return {
    createScanner(patterns: string[]) {
      return new JavaScriptScanner(patterns, cache, forgiving)
    },
    createString(s: string) {
      return { content: s }
    },
  }
}

packages/core/src/oniguruma/index.ts renamed to packages/core/src/engines/oniguruma/index.ts

Lines changed: 43 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,15 @@
22
* Copyright (C) Microsoft Corporation. All rights reserved.
33
*-------------------------------------------------------- */
44

5-
import { ShikiError } from '../error'
6-
import type { IOnigBinding, IOnigCaptureIndex, IOnigMatch, OnigScanner as IOnigScanner, OnigString as IOnigString, Pointer } from './types'
5+
import { ShikiError } from '../../error'
6+
import type { LoadWasmOptions, WebAssemblyInstance, WebAssemblyInstantiator } from '../../types'
7+
import type { IOnigCaptureIndex, IOnigMatch, OnigScanner as IOnigScanner, OnigString as IOnigString } from '../../../vendor/vscode-textmate/src/main'
78
import createOnigasm from './onig'
89

10+
/** Callback that receives a WebAssembly import object and resolves to the module's exports. */
export type Instantiator = (importObject: Record<string, Record<string, WebAssembly.ImportValue>>) => Promise<WebAssembly.Exports>

/** Numeric value used wherever the binding passes or returns a pointer. */
export type Pointer = number
13+
914
export const enum FindOption {
1015
None = 0,
1116
/**
@@ -20,14 +25,25 @@ export const enum FindOption {
2025
* equivalent of ONIG_OPTION_NOT_BEGIN_POSITION: (start) isn't considered as start position of search (* fail \G)
2126
*/
2227
NotBeginPosition = 4,
23-
/**
24-
* used for debugging purposes.
25-
*/
26-
DebugCall = 8,
28+
}
29+
30+
/**
 * Surface of the Oniguruma binding object used by this module.
 *
 * NOTE(review): member semantics are not visible in this file section; the
 * comments below restate only what the names and signatures show - confirm
 * against the binding implementation.
 */
export interface IOnigBinding {
  /** Byte view exposed by the binding. */
  HEAPU8: Uint8Array
  /** 32-bit unsigned view exposed by the binding. */
  HEAPU32: Uint32Array

  /** Reads a string starting at `ptr`. */
  UTF8ToString: (ptr: Pointer) => string

  omalloc: (count: number) => Pointer
  ofree: (ptr: Pointer) => void
  /** Returns a pointer that `UTF8ToString` can turn into the last error message. */
  getLastOnigError: () => Pointer
  createOnigScanner: (strPtrsPtr: Pointer, strLenPtr: Pointer, count: number) => Pointer
  freeOnigScanner: (ptr: Pointer) => void
  /** Returns `0` when there is no match. */
  findNextOnigScannerMatch: (scanner: Pointer, strCacheId: number, strData: Pointer, strLength: number, position: number, options: number) => number
  // findNextOnigScannerMatchDbg: (scanner: Pointer, strCacheId: number, strData: Pointer, strLength: number, position: number, options: number) => number
}
2844

2945
let onigBinding: IOnigBinding | null = null
30-
let defaultDebugCall = false
46+
// let defaultDebugCall = false
3147

3248
function throwLastOnigError(onigBinding: IOnigBinding): void {
3349
throw new ShikiError(onigBinding.UTF8ToString(onigBinding.getLastOnigError()))
@@ -294,34 +310,33 @@ export class OnigScanner implements IOnigScanner {
294310
public findNextMatchSync(string: string | OnigString, startPosition: number, debugCall: boolean): IOnigMatch | null
295311
public findNextMatchSync(string: string | OnigString, startPosition: number): IOnigMatch | null
296312
public findNextMatchSync(string: string | OnigString, startPosition: number, arg?: number | boolean): IOnigMatch | null {
297-
let debugCall = defaultDebugCall
313+
// let debugCall = defaultDebugCall
298314
let options = FindOption.None
299315
if (typeof arg === 'number') {
300-
if (arg & FindOption.DebugCall)
301-
debugCall = true
302-
316+
// if (arg & FindOption.DebugCall)
317+
// debugCall = true
303318
options = arg
304319
}
305320
else if (typeof arg === 'boolean') {
306-
debugCall = arg
321+
// debugCall = arg
307322
}
308323
if (typeof string === 'string') {
309324
string = new OnigString(string)
310-
const result = this._findNextMatchSync(string, startPosition, debugCall, options)
325+
const result = this._findNextMatchSync(string, startPosition, false, options)
311326
string.dispose()
312327
return result
313328
}
314-
return this._findNextMatchSync(string, startPosition, debugCall, options)
329+
return this._findNextMatchSync(string, startPosition, false, options)
315330
}
316331

317332
private _findNextMatchSync(string: OnigString, startPosition: number, debugCall: boolean, options: number): IOnigMatch | null {
318333
const onigBinding = this._onigBinding
319-
let resultPtr: Pointer
320-
if (debugCall)
321-
resultPtr = onigBinding.findNextOnigScannerMatchDbg(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
334+
// let resultPtr: Pointer
335+
// if (debugCall)
336+
// resultPtr = onigBinding.findNextOnigScannerMatchDbg(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
322337

323-
else
324-
resultPtr = onigBinding.findNextOnigScannerMatch(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
338+
// else
339+
const resultPtr = onigBinding.findNextOnigScannerMatch(this._ptr, string.id, string.ptr, string.utf8Length, string.convertUtf16OffsetToUtf8(startPosition), options)
325340

326341
if (resultPtr === 0) {
327342
// no match
@@ -348,17 +363,6 @@ export class OnigScanner implements IOnigScanner {
348363
}
349364
}
350365

351-
export interface WebAssemblyInstantiator {
352-
(importObject: Record<string, Record<string, WebAssembly.ImportValue>> | undefined): Promise<WebAssemblyInstance>
353-
}
354-
355-
export type WebAssemblyInstance = WebAssembly.WebAssemblyInstantiatedSource | WebAssembly.Instance | WebAssembly.Instance['exports']
356-
357-
export type OnigurumaLoadOptions =
358-
| { instantiator: WebAssemblyInstantiator }
359-
| { default: WebAssemblyInstantiator }
360-
| { data: ArrayBufferView | ArrayBuffer | Response }
361-
362366
function isInstantiatorOptionsObject(dataOrOptions: any): dataOrOptions is { instantiator: WebAssemblyInstantiator } {
363367
return (typeof dataOrOptions.instantiator === 'function')
364368
}
@@ -385,15 +389,6 @@ function isArrayBuffer(data: any): data is ArrayBuffer | ArrayBufferView {
385389

386390
let initPromise: Promise<void>
387391

388-
type Awaitable<T> = T | Promise<T>
389-
390-
export type LoadWasmOptionsPlain =
391-
| OnigurumaLoadOptions
392-
| WebAssemblyInstantiator
393-
| ArrayBufferView | ArrayBuffer | Response
394-
395-
export type LoadWasmOptions = Awaitable<LoadWasmOptionsPlain> | (() => Awaitable<LoadWasmOptionsPlain>)
396-
397392
export function loadWasm(options: LoadWasmOptions): Promise<void> {
398393
if (initPromise)
399394
return initPromise
@@ -461,14 +456,14 @@ function _makeResponseNonStreamingLoader(data: Response): WebAssemblyInstantiato
461456
}
462457
}
463458

464-
export function createOnigString(str: string) {
465-
return new OnigString(str)
466-
}
459+
// export function createOnigString(str: string) {
460+
// return new OnigString(str)
461+
// }
467462

468-
export function createOnigScanner(patterns: string[]) {
469-
return new OnigScanner(patterns)
470-
}
463+
// export function createOnigScanner(patterns: string[]) {
464+
// return new OnigScanner(patterns)
465+
// }
471466

472-
export function setDefaultDebugCall(_defaultDebugCall: boolean): void {
473-
defaultDebugCall = _defaultDebugCall
474-
}
467+
// export function setDefaultDebugCall(_defaultDebugCall: boolean): void {
468+
// defaultDebugCall = _defaultDebugCall
469+
// }

packages/core/src/oniguruma/onig.ts renamed to packages/core/src/engines/oniguruma/onig.ts

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
1-
import type { IOnigBinding, Instantiator } from './types'
1+
import type { IOnigBinding, Instantiator } from '.'
2+
3+
/** Returns the fixed heap ceiling of 2147483648 bytes (2^31). */
function getHeapMax(): number {
  return 2147483648
}
6+
7+
/**
 * Returns a timestamp in milliseconds: `performance.now()` when the
 * `performance` global exists, otherwise `Date.now()`.
 */
function _emscripten_get_now(): number {
  return typeof performance !== 'undefined' ? performance.now() : Date.now()
}
10+
11+
/** Rounds `x` up to the nearest multiple of `multiple`; exact multiples are returned unchanged. */
const alignUp = (x: number, multiple: number): number => x + ((multiple - (x % multiple)) % multiple)
212

313
export default async function main(init: Instantiator): Promise<IOnigBinding> {
414
let wasmMemory: any
@@ -12,15 +22,10 @@ export default async function main(init: Instantiator): Promise<IOnigBinding> {
1222
binding.HEAPU32 = new Uint32Array(buf)
1323
}
1424

15-
function _emscripten_get_now() {
16-
return typeof performance !== 'undefined' ? performance.now() : Date.now()
17-
}
1825
function _emscripten_memcpy_big(dest: number, src: number, num: number) {
1926
binding.HEAPU8.copyWithin(dest, src, src + num)
2027
}
21-
function getHeapMax() {
22-
return 2147483648
23-
}
28+
2429
function emscripten_realloc_buffer(size: number) {
2530
try {
2631
wasmMemory.grow((size - buffer.byteLength + 65535) >>> 16)
@@ -36,7 +41,6 @@ export default async function main(init: Instantiator): Promise<IOnigBinding> {
3641
if (requestedSize > maxHeapSize)
3742
return false
3843

39-
const alignUp = (x: number, multiple: number) => x + ((multiple - (x % multiple)) % multiple)
4044
for (let cutDown = 1; cutDown <= 4; cutDown *= 2) {
4145
let overGrownHeapSize = oldSize * (1 + 0.2 / cutDown)
4246
overGrownHeapSize = Math.min(overGrownHeapSize, requestedSize + 100663296)

0 commit comments

Comments
 (0)