Skip to content

security: finish fixing unsafe heading regex #1226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 227 additions & 16 deletions lib/marked.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@
* https://github.com/markedjs/marked
*/

var NEW_TEXT = true;

// Master switch for debug output; while false, log() is a no-op.
var doLog = false;

/**
 * Print a debug message, but only while doLog is enabled.
 *
 * @param {*} msg - value handed to console.log unchanged
 */
function log(msg) {
  if (!doLog) {
    return;
  }
  console.log(msg);
}

;(function(root) {
'use strict';

Expand All @@ -16,7 +25,8 @@ var block = {
code: /^( {4}[^\n]+\n*)+/,
fences: noop,
hr: /^ {0,3}((?:- *){3,}|(?:_ *){3,}|(?:\* *){3,})(?:\n+|$)/,
heading: /^ *(#{1,6}) *([^\n]+?) *(?:#+ *)?(?:\n+|$)/,
// cap[2] might be ' HEADING # ' and must be trimmed appropriately.
heading: /^ {0,3}(#{1,6})(?:[^\S\n](.*))?(?:\n+|$)/,
nptable: noop,
blockquote: /^( {0,3}> ?(paragraph|[^\n]*)(?:\n|$))+/,
list: /^( *)(bull) [\s\S]+?(?:hr|def|\n{2,}(?! )(?!\1bull )\n*|\s*$)/,
Expand Down Expand Up @@ -92,8 +102,7 @@ block.normal = merge({}, block);

block.gfm = merge({}, block.normal, {
fences: /^ *(`{3,}|~{3,})[ \.]*(\S+)? *\n([\s\S]*?)\n? *\1 *(?:\n+|$)/,
paragraph: /^/,
heading: /^ *(#{1,6}) +([^\n]+?) *#* *(?:\n+|$)/
paragraph: /^/
});

block.gfm.paragraph = edit(block.paragraph)
Expand All @@ -116,6 +125,7 @@ block.tables = merge({}, block.gfm, {
*/

block.pedantic = merge({}, block.normal, {
heading: /^ *(#{1,6})(.*)(?:\n+|$)/,
html: edit(
'^ *(?:comment *(?:\\n|\\s*$)'
+ '|<(tag)[\\s\\S]+?</\\1> *(?:\\n{2,}|\\s*$)' // closed tag
Expand Down Expand Up @@ -215,7 +225,7 @@ Lexer.prototype.token = function(src, top) {
this.tokens.push({
type: 'code',
text: !this.options.pedantic
? cap.replace(/\n+$/, '')
? rtrim(cap, '\n')
: cap
});
continue;
Expand All @@ -235,10 +245,19 @@ Lexer.prototype.token = function(src, top) {
// heading
if (cap = this.rules.heading.exec(src)) {
src = src.substring(cap[0].length);
// cap[2] might be ' HEADING # '
item = (cap[2] || '').trim();
if (this.options.pedantic) {
item = rtrim(item, '#');
} else {
// CM requires a space before additional #s
item = item.replace(/(\s|^)#+$/, '');
}
item = item.trim();
this.tokens.push({
type: 'heading',
depth: cap[1].length,
text: cap[2]
text: item
});
continue;
}
Expand Down Expand Up @@ -507,9 +526,148 @@ var inline = {
code: /^(`+)\s*([\s\S]*?[^`]?)\s*\1(?!`)/,
br: /^ {2,}\n(?!\s*$)/,
del: noop,
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/
text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/ // TODO Vulnerable
};

/**
 * Shared helper for the offsetOfX routines: run a regex against a string
 * and report where it matched.
 *
 * @param {string} str - text to search
 * @param {RegExp} regex - pattern to execute against str
 * @returns {number} the match index reported by regex.exec, or -1 on no match
 */
function offsetOfRegex(str, regex) {
  var match = regex.exec(str);
  if (!match) {
    return -1;
  }
  log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
  return match.index;
}

/**
 * Locate the earliest "special character" in str: a backslash, '<', '!',
 * '[', a backtick or '*'.
 *
 * @param {string} str - text to search
 * @returns {number} the value produced by offsetOfRegex for that pattern
 */
function offsetOfSpecialChars(str) {
  var specialChar = /[\\<!\[`*]/;
  return offsetOfRegex(str, specialChar);
}

/**
 * Locate the earliest underscore that could open italics, i.e. a '_' that
 * sits on a word boundary.
 *
 * @param {string} str - text to search
 * @returns {number} the value produced by offsetOfRegex for that pattern
 */
function offsetOfItalics (str) {
  var italicsOpener = /\b_/;
  return offsetOfRegex(str, italicsOpener);
}

/**
 * Locate the earliest run of two or more spaces that is immediately followed
 * by a newline (a hand-written equivalent of searching for / {2,}\n/).
 *
 * Works newline by newline: for each '\n', walk backwards over the spaces
 * directly in front of it. Space runs in front of different newlines never
 * overlap, so every character is inspected a bounded number of times.
 *
 * @param {string} str - text to search
 * @returns {number} index where the qualifying space run begins, or -1
 */
function offsetOfSpacesThenNewline(str) {
  var newlineAt = str.indexOf('\n');
  while (newlineAt !== -1) {
    // Walk left across the spaces that directly precede this newline.
    var runStart = newlineAt;
    while (runStart > 0 && str.charAt(runStart - 1) === ' ') {
      runStart--;
    }
    if (newlineAt - runStart >= 2) {
      return runStart;
    }
    newlineAt = str.indexOf('\n', newlineAt + 1);
  }
  return -1;
}

/**
 * Locate the earliest "http://" or "https://" protocol prefix in str.
 *
 * @param {string} str - text to search
 * @returns {number} the value produced by offsetOfRegex for that pattern
 */
function offsetOfHTTP(str) {
  var httpScheme = /https?:\/\//;
  return offsetOfRegex(str, httpScheme);
}

/**
 * Locate the earliest "ftp://" protocol prefix in str.
 *
 * @param {string} str - text to search
 * @returns {number} the value produced by offsetOfRegex for that pattern
 */
function offsetOfFTP(str) {
  var ftpScheme = /ftp:\/\//;
  return offsetOfRegex(str, ftpScheme);
}

/**
 * Locate the earliest "www." prefix in str.
 *
 * @param {string} str - text to search
 * @returns {number} the value produced by offsetOfRegex for that pattern
 */
function offsetOfWWW(str) {
  var wwwPrefix = /www\./;
  return offsetOfRegex(str, wwwPrefix);
}

/**
 * Locate the earliest email-like token in str: one or more username
 * characters immediately followed by '@'.
 *
 * Every '@' is examined in turn. From each one we walk backwards over the
 * username characters in front of it; if at least one exists, the start of
 * that run is the answer. The search always resumes AFTER the '@' just
 * examined, so an '@' with no username in front of it (including one at
 * index 0) can neither stall the loop nor hide a later address.
 *
 * @param {string} str - text to search
 * @returns {number} index of the first username character, or -1 if none
 */
function offsetOfEmail(str) {
  // Same character set as the regex-based GFM text rule, including '^'.
  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]$/;

  var atSymbolIx = str.indexOf('@');
  while (atSymbolIx !== -1) {
    // Work backwards through valid username chars until they run out.
    var i = atSymbolIx;
    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
      i--;
    }
    // At least one username char precedes the '@': this looks like an email.
    if (i < atSymbolIx) {
      return i;
    }
    // Step past this '@'; otherwise indexOf would keep returning it.
    atSymbolIx = str.indexOf('@', atSymbolIx + 1);
  }

  return -1;
}

/**
 * Split off the leading run of plain text in str, ending at the earliest
 * "text break" reported by any of the supplied finder functions.
 *
 * The return value imitates a RegExp.exec result: key 0 holds the text that
 * precedes the break and `index` holds the offset of the break within str
 * (str.length when no finder reports a break).
 *
 * @param {string} str - text to scan
 * @param {Array<function(string): number>} textBreakFinders - offsetOfX
 *   style functions; each returns the offset of its break or a negative value
 * @returns {?Object} exec-like result, or null when str is empty
 */
function offsetOfTextBreak(str, textBreakFinders) {
  log(`Looking for tb in '${str}'`);
  if (str.length === 0) {
    return null;
  }

  // A break at offset 0 is not allowed (there must be at least one character
  // of text), so the finders only ever see the string minus its first char.
  var tail = str.substr(1);

  var candidates = textBreakFinders.map(function (finder) {
    return finder(tail);
  });
  log(`textBreaks: ${candidates}`);

  // Finders signal "not found" with a negative offset; discard those.
  var hits = candidates.filter(function (offset) {
    return offset >= 0;
  });

  // Math.min takes its values as separate arguments, hence apply().
  // The +1 converts an offset in `tail` back into an offset in `str`;
  // with no hits at all, the entire string is text.
  var breakAt = hits.length
    ? Math.min.apply(null, hits) + 1
    : str.length;

  // Shaped like a RegExp 'exec' result for compatibility with the lexer.
  var result = { 0: str.substr(0, breakAt), index: breakAt };
  log(`Returning: earliestBreakOffset ${breakAt} result ${JSON.stringify(result)}`);
  return result;
}

/**
 * Text-break search for the plain Inline Lexer: special characters,
 * italics underscores, and 2+ spaces followed by a newline.
 * Imitates RegExp.exec by delegating to offsetOfTextBreak.
 *
 * @param {string} str - text to scan
 * @returns {?Object} whatever offsetOfTextBreak returns for these finders
 */
function offsetOfTextBreakInline(str) {
  var inlineFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline
  ];
  return offsetOfTextBreak(str, inlineFinders);
}

/**
 * Text-break search for the Inline GFM Lexer: the plain inline breaks plus
 * http(s), ftp, www and email starts.
 * Imitates RegExp.exec by delegating to offsetOfTextBreak.
 *
 * @param {string} str - text to scan
 * @returns {?Object} whatever offsetOfTextBreak returns for these finders
 */
function offsetOfTextBreakInlineGFM(str) {
  var gfmFinders = [
    offsetOfSpecialChars,
    offsetOfItalics,
    offsetOfSpacesThenNewline,
    offsetOfHTTP,
    offsetOfFTP,
    offsetOfWWW,
    offsetOfEmail
  ];
  return offsetOfTextBreak(str, gfmFinders);
}

// When NEW_TEXT is on, replace the readable-but-vulnerable text regex with an
// object exposing the same exec() entry point, backed by
// offsetOfTextBreakInline.
if (NEW_TEXT) {
  var textRuleOverride = { exec: offsetOfTextBreakInline };
  inline.text = textRuleOverride;
}

inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;

inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/;
Expand Down Expand Up @@ -572,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
.getRegex(),
_backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
del: /^~~(?=\S)([\s\S]*?\S)~~/,
text: edit(inline.text)
.replace(']|', '~]|')
.replace('|', '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|')
.getRegex()
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
});

/**
Expand All @@ -584,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {

inline.breaks = merge({}, inline.gfm, {
br: edit(inline.br).replace('{2,}', '*').getRegex(),
text: edit(inline.gfm.text).replace('{2,}', '*').getRegex()
text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
});

/**
Expand Down Expand Up @@ -776,11 +931,28 @@ InlineLexer.prototype.output = function(src) {
}

// text
//log(`lexer: Matching text: ${this.rules.text.source}\n <${src}>`);
if (cap = this.rules.text.exec(src)) {
src = src.substring(cap[0].length);
out += this.renderer.text(escape(this.smartypants(cap[0])));
continue;
if (NEW_TEXT) {
log(`lexer: Match: ${JSON.stringify(cap)} ${cap.index}`);
var textLen = cap.index + 1;
// text is not in cap[0], so extract text before advancing src.
out += this.renderer.text(escape(this.smartypants(src.substr(0, textLen))));
src = src.substring(textLen);
continue;
} else {
var offInline = offsetOfTextBreakInline(src);
var offInlineGFM = offsetOfTextBreakInlineGFM(src);
console.log(`cap ${JSON.stringify(cap)}`);
console.log(`offInline ${JSON.stringify(offInline)}`);
console.log(`offInlineGFM ${JSON.stringify(offInlineGFM)}`);
console.log(`regex ${cap[0].length} offInline ${offInline[0].length} offInlineGFM ${offInlineGFM[0].length}`);
src = src.substring(cap[0].length);
out += this.renderer.text(escape(this.smartypants(cap[0])));
continue;
}
}
log(`lexer: Mismatch`);

if (src) {
throw new Error('Infinite loop on byte: ' + src.charCodeAt(0));
Expand Down Expand Up @@ -923,7 +1095,12 @@ Renderer.prototype.listitem = function(text) {
};

/**
 * Render a paragraph.
 *
 * Leading spaces are removed from every line of the text before it is
 * wrapped in <p>...</p>. Only the space character is stripped; other
 * leading whitespace is left as-is.
 *
 * @param {string} text - already-rendered inline content of the paragraph
 * @returns {string} the paragraph markup, terminated by a newline
 */
Renderer.prototype.paragraph = function(text) {
  var lines = text.split('\n').map(function(line) {
    return line.replace(/^ +/, '');
  });

  return '<p>' + lines.join('\n') + '</p>\n';
};

Renderer.prototype.table = function(header, body) {
Expand Down Expand Up @@ -1274,7 +1451,7 @@ function resolveUrl(base, href) {
if (/^[^:]+:\/*[^/]*$/.test(base)) {
baseUrls[' ' + base] = base + '/';
} else {
baseUrls[' ' + base] = base.replace(/[^/]*$/, '');
baseUrls[' ' + base] = rtrim(base, '/', true);
}
}
base = baseUrls[' ' + base];
Expand Down Expand Up @@ -1320,6 +1497,38 @@ function splitCells(tableRow) {
return cells;
}

/**
 * Trim characters from the end of a string without using a regex.
 *
 * By default every trailing occurrence of `c` is removed. When `allButC` is
 * truthy the test is inverted: trailing characters are removed for as long
 * as they are NOT `c`, so the result ends at the last `c` (or is empty when
 * str contains no `c`).
 *
 * @param {string} str - string to trim
 * @param {string} c - the single character to compare against
 * @param {boolean} [allButC=false] - strip everything except `c` instead;
 *   an explicit false behaves exactly like omitting the argument
 * @returns {string} str with the trailing characters removed
 */
function rtrim(str, c, allButC) {
  var stripMatching = !allButC;

  // `end` is the length of the prefix that survives the trim.
  var end = str.length;
  while (end > 0 && (str.charAt(end - 1) === c) === stripMatching) {
    end--;
  }

  return str.slice(0, end);
}

/**
* Marked
*/
Expand Down Expand Up @@ -1455,6 +1664,8 @@ marked.defaults = marked.getDefaults();
* Expose
*/

marked(' # # ####A');

marked.Parser = Parser;
marked.parser = Parser.parse;

Expand Down
2 changes: 1 addition & 1 deletion test/new/nogfm_hashtag.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
gfm: false
pedantic: true
---
#header

Expand Down
Loading