Skip to content

Commit 24d4a5e

Browse files
committed
WIP: safen the text regex via linear-time scans
Sketch of implementing the text regex as a linear-time RegExp imitator. - A few nits remain here and there. - I haven't tested all of the offsetOfX routines, and 'npm run test' currently hangs on some bug.
1 parent dd26af8 commit 24d4a5e

File tree

1 file changed

+150
-14
lines changed

1 file changed

+150
-14
lines changed

lib/marked.js

Lines changed: 150 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
* https://github.com/markedjs/marked
55
*/
66

7-
var NEW_TEXT = false;
7+
var NEW_TEXT = true;
88

99
var doLog = false;
1010
function log(msg) {
@@ -526,15 +526,146 @@ var inline = {
526526
code: /^(`+)\s*([\s\S]*?[^`]?)\s*\1(?!`)/,
527527
br: /^ {2,}\n(?!\s*$)/,
528528
del: noop,
529-
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/
529+
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/ // TODO Vulnerable
530530
};
531531

532+
// Shared helper for the offsetOfX routines.
// Runs regex.exec on str and returns the index of the match it reports,
// or -1 when there is no match. A successful match is also logged.
function offsetOfRegex(str, regex) {
  var match = regex.exec(str);
  if (!match) {
    return -1;
  }
  log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
  return match.index;
}
541+
542+
// Earliest offset in str of an inline "special character":
// one of \ < ! [ ` *
// Returns whatever offsetOfRegex reports (the match index, or -1 if none).
function offsetOfSpecialChars(str) {
  var specialChar = /[\\<!\[`*]/;
  return offsetOfRegex(str, specialChar);
}
546+
547+
// Earliest offset in str of an italics marker: an underscore that sits on
// a word boundary (/\b_/).
// Returns whatever offsetOfRegex reports (the match index, or -1 if none).
function offsetOfItalics(str) {
  var boundaryUnderscore = /\b_/;
  return offsetOfRegex(str, boundaryUnderscore);
}
551+
552+
// Earliest offset in str where a run of two or more spaces is immediately
// followed by a newline; -1 if there is no such run.
// Single-pass replacement for the regex / {2,}\n/.
function offsetOfSpacesThenNewline(str) {
  // Index at which the current run of spaces started; -1 while not in a run.
  var runStart = -1;
  for (var pos = 0; pos < str.length; pos++) {
    var ch = str.charAt(pos);
    if (ch === ' ') {
      if (runStart === -1) {
        runStart = pos;
      }
      continue;
    }
    // A newline only counts when at least two spaces directly precede it.
    if (ch === '\n' && runStart !== -1 && pos - runStart >= 2) {
      return runStart;
    }
    // Any other character (or a newline after too few spaces) ends the run.
    runStart = -1;
  }
  return -1;
}
571+
572+
// Earliest offset in str of an "http://" or "https://" protocol prefix.
// Returns whatever offsetOfRegex reports (the match index, or -1 if none).
function offsetOfHTTP(str) {
  var httpPrefix = /https?:\/\//;
  return offsetOfRegex(str, httpPrefix);
}
576+
577+
// Earliest offset in str of an "ftp://" protocol prefix.
// Returns whatever offsetOfRegex reports (the match index, or -1 if none).
function offsetOfFTP(str) {
  var ftpPrefix = /ftp:\/\//;
  return offsetOfRegex(str, ftpPrefix);
}
581+
582+
// Earliest offset in str of a "www." URL prefix.
// Returns whatever offsetOfRegex reports (the match index, or -1 if none).
function offsetOfWWW(str) {
  var wwwPrefix = /www\./;
  return offsetOfRegex(str, wwwPrefix);
}
586+
587+
// Returns earliest offset of an email (username + @): a run of one or more
// username characters that is immediately followed by '@'. Returns -1 when
// str contains no such run.
//
// Scan-based stand-in for the alternative
//   [a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@
// of the GFM text pattern, using the same username character class
// (including '^').
function offsetOfEmail(str) {
  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]$/;

  // Look for email-like things at every '@'.
  var atSymbolIx = str.indexOf('@');
  while (atSymbolIx !== -1) {
    // Found an @, work backwards through valid username chars until we run
    // out of them (or out of string).
    var i = atSymbolIx;
    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
      i--;
    }
    // If we found any, this looks like an email.
    if (i < atSymbolIx) {
      return i;
    }
    // No username before this '@' (this includes an '@' at index 0).
    // Resume the search strictly after it; searching again from the same
    // index would find the same '@' forever.
    atSymbolIx = str.indexOf('@', atSymbolIx + 1);
  }

  return -1;
}
606+
607+
// Returns the earliest text break in str, based on an array of
// textBreakFinders functions.
// textBreakFinders should be a subset of the offsetOfX functions: each takes
// a string and returns the offset of its earliest match, or -1 for no match.
//
// Imitates RegExp.exec:
//   - returns null for an empty str;
//   - otherwise returns an object whose [0] is the text before the earliest
//     break (the whole of str when no finder matches) and whose index is the
//     offset of that break, i.e. the length of [0].
// There must be at least one character of text before the break, so a break
// at offset 0 is never reported.
//
// NOTE(review): the NEW_TEXT branch of InlineLexer.prototype.output computes
// its text length as cap.index + 1, which is one more than the length of
// [0] returned here -- confirm which convention is intended.
function offsetOfTextBreak(str, textBreakFinders) {
  // Clean code means doing several O(n) operations.
  // A more complex state machine (like a linear-time regex) might test all
  // options in parallel more efficiently.
  log(`Looking for tb in \'${str}\'`);
  if (str.length === 0) {
    return null;
  }

  var strAfterFirstChar = str.slice(1);

  // Find the earliest instance (at offset >= 1) of each kind of text break.
  var textBreaks = textBreakFinders.map(function (find) {
    // Search the whole string first so that a context-sensitive finder
    // (e.g. /\b_/) sees the character in front of a candidate at offset 1.
    // Searching only the tail would report a false break in 'a_b'.
    var offset = find(str);
    if (offset !== 0) {
      return offset;
    }
    // The finder matched at offset 0, which cannot be a break; look for its
    // next match in the rest of the string.
    // Limitation: in this fallback the finder cannot see str[0], so a
    // context-sensitive finder may still misjudge a candidate at offset 1.
    var later = find(strAfterFirstChar);
    return later < 0 ? later : later + 1; // +1: strAfterFirstChar lacks str[0]
  });
  log(`textBreaks: ${textBreaks}`);

  // Pick earliest among them.
  var validTextBreaks = textBreaks.filter(function (brk) {
    return 0 <= brk;
  });

  // Math.min takes its numbers as separate arguments, not as an array,
  // hence apply(). No text breaks? Then the whole string is text.
  var earliestBreakOffset = validTextBreaks.length
    ? Math.min.apply(null, validTextBreaks)
    : str.length;

  // Mimic RegExp 'exec' for compatibility.
  var result = {};
  result[0] = str.slice(0, earliestBreakOffset);
  result.index = earliestBreakOffset;
  log(`Returning: earliestBreakOffset ${earliestBreakOffset} result ${JSON.stringify(result)}`);
  return result;
}
653+
654+
// Earliest text break under the rules of the Inline Lexer: special
// characters, italics markers, and 2+ spaces followed by a newline.
// Imitates RegExp.exec (delegates to offsetOfTextBreak).
function offsetOfTextBreakInline(str) {
  var inlineBreakFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline
  ];
  return offsetOfTextBreak(str, inlineBreakFinders);
}
659+
660+
// Returns earliest offset of a '~', or -1 if there is none.
// The GFM text pattern adds '~' to the special characters (so that '~~' can
// start a strikethrough); offsetOfSpecialChars does not cover it.
function offsetOfTilde(str) {
  return str.indexOf('~');
}

// Find earliest text break according to the rules of the Inline GFM Lexer:
// the Inline Lexer breaks plus '~', http(s)://, ftp://, www. and emails.
// Imitates RegExp.exec (delegates to offsetOfTextBreak).
//
// NOTE: inline.breaks also uses this function, but its ' *\n' variant of the
// spaces-then-newline rule is not implemented here.
function offsetOfTextBreakInlineGFM(str) {
  return offsetOfTextBreak(str, [
    offsetOfSpecialChars,
    offsetOfTilde,
    offsetOfItalics,
    offsetOfSpacesThenNewline,
    offsetOfHTTP,
    offsetOfFTP,
    offsetOfWWW,
    offsetOfEmail
  ]);
}
665+
666+
// Override vulnerable but readable regex
532667
if (NEW_TEXT) {
533-
// TODO: If we replace ' {2,}\n' with ' \n' and address trailing whitespace,
534-
// we break the definition of GFM inline.breaks further down (affects the gfm_break test).
535-
// Furthermore, we still have trouble with the email pattern substituted in: /|[...]+@/, which
536-
// is vulnerable to REDOS just like /| {2,}\n/ was
537-
inline.text = /[\s\S](?:[\\<!\[`*]|\b_| {2}\n|$)/;
668+
inline.text = { exec: offsetOfTextBreakInline };
538669
}
539670

540671
inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
@@ -599,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
599730
.getRegex(),
600731
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
601732
del: /^~~(?=\S)([\s\S]*?\S)~~/,
602-
text: edit(inline.text)
603-
.replace(']|', '~]|')
604-
.replace('|', '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|')
605-
.getRegex()
733+
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
606734
});
607735

608736
/**
@@ -611,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {
611739

612740
inline.breaks = merge({}, inline.gfm, {
613741
br: edit(inline.br).replace('{2,}', '*').getRegex(),
614-
text: edit(inline.gfm.text).replace('{2,}', '*').getRegex()
742+
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
615743
});
616744

617745
/**
@@ -803,16 +931,22 @@ InlineLexer.prototype.output = function(src) {
803931
}
804932

805933
// text
806-
log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
934+
//log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
807935
if (cap = this.rules.text.exec(src)) {
808936
if (NEW_TEXT) {
809-
log(`lexer: Match: ${cap} ${cap.index}`);
937+
log(`lexer: Match: ${JSON.stringify(cap)} ${cap.index}`);
810938
var textLen = cap.index + 1;
811939
// text is not in cap[0], so extract text before advancing src.
812940
out += this.renderer.text(escape(this.smartypants(src.substr(0, textLen))));
813941
src = src.substring(textLen);
814942
continue;
815943
} else {
944+
var offInline = offsetOfTextBreakInline(src);
945+
var offInlineGFM = offsetOfTextBreakInlineGFM(src);
946+
console.log(`cap ${JSON.stringify(cap)}`);
947+
console.log(`offInline ${JSON.stringify(offInline)}`);
948+
console.log(`offInlineGFM ${JSON.stringify(offInlineGFM)}`);
949+
console.log(`regex ${cap[0].length} offInline ${offInline[0].length} offInlineGFM ${offInlineGFM[0].length}`);
816950
src = src.substring(cap[0].length);
817951
out += this.renderer.text(escape(this.smartypants(cap[0])));
818952
continue;
@@ -1530,6 +1664,8 @@ marked.defaults = marked.getDefaults();
15301664
* Expose
15311665
*/
15321666

1667+
marked(' # # ####A');
1668+
15331669
marked.Parser = Parser;
15341670
marked.parser = Parser.parse;
15351671

0 commit comments

Comments
 (0)