markedjs · davisjam · Apr 16, 2018 · Apr 18, 2018 · Apr 18, 2018 · Apr 18, 2018
diff --git a/lib/marked.js b/lib/marked.js
@@ -4,6 +4,15 @@
  * https://github.com/markedjs/marked
  */
 
+// Feature flag: when true, `inline.text` is swapped for the hand-written
+// offsetOf* text-break scanners and the inline lexer takes its NEW_TEXT path.
+var NEW_TEXT = true;
+
+// Debug switch read by log(); flip to true to trace the scanners on the console.
+var doLog = false;
+
+/**
+ * Write a debug message to the console, but only while `doLog` is truthy.
+ * @param {*} msg - value passed straight through to console.log.
+ */
+function log(msg) {
+  if (!doLog) {
+    return;
+  }
+  console.log(msg);
+}
+
 ;(function(root) {
 'use strict';
 
@@ -16,7 +25,8 @@ var block = {
   code: /^( {4}[^\n]+\n*)+/,
   fences: noop,
   hr: /^ {0,3}((?:- *){3,}|(?:_ *){3,}|(?:\* *){3,})(?:\n+|$)/,
-  heading: /^ *(#{1,6}) *([^\n]+?) *(?:#+ *)?(?:\n+|$)/,
+  // cap[2] might be ' HEADING # ' and must be trimmed appropriately.
+  heading: /^ {0,3}(#{1,6})(?:[^\S\n](.*))?(?:\n+|$)/,
   nptable: noop,
   blockquote: /^( {0,3}> ?(paragraph|[^\n]*)(?:\n|$))+/,
   list: /^( *)(bull) [\s\S]+?(?:hr|def|\n{2,}(?! )(?!\1bull )\n*|\s*$)/,
@@ -92,8 +102,7 @@ block.normal = merge({}, block);
 
 block.gfm = merge({}, block.normal, {
   fences: /^ *(`{3,}|~{3,})[ \.]*(\S+)? *\n([\s\S]*?)\n? *\1 *(?:\n+|$)/,
-  paragraph: /^/,
-  heading: /^ *(#{1,6}) +([^\n]+?) *#* *(?:\n+|$)/
+  paragraph: /^/
 });
 
 block.gfm.paragraph = edit(block.paragraph)
@@ -116,6 +125,7 @@ block.tables = merge({}, block.gfm, {
  */
 
 block.pedantic = merge({}, block.normal, {
+  heading: /^ *(#{1,6})(.*)(?:\n+|$)/,
   html: edit(
     '^ *(?:comment *(?:\\n|\\s*$)'
     + '|<(tag)[\\s\\S]+?</\\1> *(?:\\n{2,}|\\s*$)' // closed tag
@@ -215,7 +225,7 @@ Lexer.prototype.token = function(src, top) {
       this.tokens.push({
         type: 'code',
         text: !this.options.pedantic
-          ? cap.replace(/\n+$/, '')
+          ? rtrim(cap, '\n')
           : cap
       });
       continue;
@@ -235,10 +245,19 @@ Lexer.prototype.token = function(src, top) {
     // heading
     if (cap = this.rules.heading.exec(src)) {
       src = src.substring(cap[0].length);
+      // cap[2] might be ' HEADING # '
+      item = (cap[2] || '').trim();
+      if (this.options.pedantic) {
+        item = rtrim(item, '#');
+      } else {
+        // CM requires a space before additional #s
+        item = item.replace(/(\s|^)#+$/, '');
+      }
+      item = item.trim();
       this.tokens.push({
         type: 'heading',
         depth: cap[1].length,
-        text: cap[2]
+        text: item
       });
       continue;
     }
@@ -507,9 +526,148 @@ var inline = {
   code: /^(`+)\s*([\s\S]*?[^`]?)\s*\1(?!`)/,
   br: /^ {2,}\n(?!\s*$)/,
   del: noop,
-  text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/
+  text: /^[\s\S]+?(?=[\\<!\[`*]|\b_| {2,}\n|$)/ // TODO Vulnerable
 };
 
+/**
+ * Shared helper for the offsetOf* routines: run `regex.exec` on `str` and
+ * report where the match starts.
+ * @param {string} str - text to search.
+ * @param {RegExp} regex - pattern to execute against `str`.
+ * @returns {number} `index` of the match, or -1 when `exec` finds nothing.
+ */
+function offsetOfRegex(str, regex) {
+  var match = regex.exec(str);
+  if (!match) {
+    return -1;
+  }
+  log(`offsetOfRegex: str ${str} matches regex ${regex.source}`);
+  return match.index;
+}
+
+/**
+ * Find the earliest "special character": any one of \ < ! [ ` *
+ * @param {string} str - text to search.
+ * @returns {number} offset of that character, or -1 when none is present.
+ */
+function offsetOfSpecialChars(str) {
+  var specialChar = /[\\<!\[`*]/;
+  return offsetOfRegex(str, specialChar);
+}
+
+/**
+ * Find the earliest underscore that sits on a word boundary, i.e. one at the
+ * start of the string or preceded by a non-word character.
+ * @param {string} str - text to search.
+ * @returns {number} offset of that underscore, or -1 when there is none.
+ */
+function offsetOfItalics (str) {
+  var boundaryUnderscore = /\b_/;
+  return offsetOfRegex(str, boundaryUnderscore);
+}
+
+/**
+ * Find the earliest run of two or more spaces that is immediately followed
+ * by a newline. Single-pass equivalent of searching for / {2,}\n/.
+ * @param {string} str - text to scan.
+ * @returns {number} offset of the first space of that run, or -1 if absent.
+ */
+function offsetOfSpacesThenNewline(str) {
+  var runStart = -1; // offset where the current run of spaces began; -1 = not in a run
+  for (var pos = 0; pos < str.length; pos++) {
+    var ch = str.charAt(pos);
+    if (ch === ' ') {
+      if (runStart === -1) {
+        runStart = pos;
+      }
+      continue;
+    }
+    // pos - runStart is the length of the run that ends just before ch.
+    if (ch === '\n' && runStart !== -1 && pos - runStart >= 2) {
+      return runStart;
+    }
+    runStart = -1;
+  }
+  return -1;
+}
+
+/**
+ * Find the earliest "http://" or "https://" prefix.
+ * @param {string} str - text to search.
+ * @returns {number} offset of the leading "h", or -1 when there is none.
+ */
+function offsetOfHTTP(str) {
+  var httpScheme = /https?:\/\//;
+  return offsetOfRegex(str, httpScheme);
+}
+
+/**
+ * Find the earliest "ftp://" prefix.
+ * @param {string} str - text to search.
+ * @returns {number} offset of the leading "f", or -1 when there is none.
+ */
+function offsetOfFTP(str) {
+  var ftpScheme = /ftp:\/\//;
+  return offsetOfRegex(str, ftpScheme);
+}
+
+/**
+ * Find the earliest "www." prefix.
+ * @param {string} str - text to search.
+ * @returns {number} offset of the first "w", or -1 when there is none.
+ */
+function offsetOfWWW(str) {
+  var wwwPrefix = /www\./;
+  return offsetOfRegex(str, wwwPrefix);
+}
+
+/**
+ * Find the earliest thing that looks like the start of an email address:
+ * one or more username characters directly followed by '@'.
+ *
+ * Regex-free stand-in for the `[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@` alternative
+ * of the GFM text rule.
+ *
+ * @param {string} str - text to scan.
+ * @returns {number} offset of the first username character, or -1 when no
+ *   '@' is preceded by a username character.
+ */
+function offsetOfEmail(str) {
+  // Matches exactly one username character. Includes '^', which the regex
+  // this routine replaces also accepted.
+  var emailUsernameChar = /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{\|}~-]$/;
+
+  // Look for email-like things at every '@', including one at index 0.
+  var atSymbolIx = str.indexOf('@');
+  while (atSymbolIx !== -1) {
+    // Walk backwards over username characters until they or the string run out.
+    var i = atSymbolIx;
+    while (0 < i && emailUsernameChar.test(str.charAt(i - 1))) {
+      i--;
+    }
+    // If we found any, this looks like an email.
+    if (i < atSymbolIx) {
+      return i;
+    }
+    // Bare '@': resume the search AFTER it. Searching again from the same
+    // index would find the same '@' and never terminate.
+    atSymbolIx = str.indexOf('@', atSymbolIx + 1);
+  }
+
+  return -1;
+}
+
+/**
+ * Split off the leading plain text of `str`: everything up to the earliest
+ * text break reported by any of the `textBreakFinders`.
+ * The first character always counts as text, so the text is never empty.
+ * If no finder reports a break, the whole string is text.
+ *
+ * The return value mimics a RegExp `exec` result: `[0]` holds the text and
+ * `index` holds its length (the offset of the break within `str`) - note that
+ * this is not the match start a real RegExp result would carry.
+ *
+ * @param {string} str - remaining source to scan.
+ * @param {Function[]} textBreakFinders - subset of the offsetOfX functions;
+ *   each takes a string and returns the offset of its kind of break, or -1.
+ * @returns {?Object} null for an empty `str`, otherwise `{0: text, index: n}`.
+ */
+function offsetOfTextBreak(str, textBreakFinders) {
+  log(`Looking for tb in \'${str}\'`);
+  if (str.length === 0) {
+    return null;
+  }
+
+  // There must be at least one character of text before the break, so the
+  // finders only get to see the string from its second character on.
+  var tail = str.substr(1);
+
+  // One independent pass per finder: simple, at the cost of several scans.
+  var textBreaks = textBreakFinders.map(function (finder) {
+    return finder(tail);
+  });
+  log(`textBreaks: ${textBreaks}`);
+
+  // Smallest non-negative offset wins; negative offsets mean "not found".
+  var earliest = -1;
+  textBreaks.forEach(function (offset) {
+    if (0 <= offset && (earliest === -1 || offset < earliest)) {
+      earliest = offset;
+    }
+  });
+
+  // +1 converts an offset within `tail` back into an offset within `str`.
+  var earliestBreakOffset = earliest === -1 ? str.length : earliest + 1;
+
+  var result = { 0: str.substr(0, earliestBreakOffset), index: earliestBreakOffset };
+  log(`Returning: earliestBreakOffset ${earliestBreakOffset} result ${JSON.stringify(result)}`);
+  return result;
+}
+
+/**
+ * Text-break search with the finders of the plain inline lexer: special
+ * characters, word-boundary underscores, and 2+ spaces before a newline.
+ * Shaped like RegExp `exec` so it can be used as a rule's `exec`.
+ * @param {string} str - remaining source to scan.
+ * @returns {?Object} whatever offsetOfTextBreak returns for these finders.
+ */
+function offsetOfTextBreakInline(str) {
+  var finders = [
+    offsetOfSpecialChars,
+    offsetOfItalics,
+    offsetOfSpacesThenNewline
+  ];
+  return offsetOfTextBreak(str, finders);
+}
+
+/**
+ * Text-break search with the finders of the inline GFM lexer: the plain
+ * inline finders plus http(s)://, ftp://, www. and email starts.
+ * Shaped like RegExp `exec` so it can be used as a rule's `exec`.
+ * @param {string} str - remaining source to scan.
+ * @returns {?Object} whatever offsetOfTextBreak returns for these finders.
+ */
+function offsetOfTextBreakInlineGFM(str) {
+  var finders = [
+    offsetOfSpecialChars,
+    offsetOfItalics,
+    offsetOfSpacesThenNewline,
+    offsetOfHTTP,
+    offsetOfFTP,
+    offsetOfWWW,
+    offsetOfEmail
+  ];
+  return offsetOfTextBreak(str, finders);
+}
+
+// While NEW_TEXT is on, replace the readable but vulnerable `inline.text`
+// regex with an object exposing only `exec`, backed by the offsetOf* scanners.
+if (NEW_TEXT) {
+  inline.text = {};
+  inline.text.exec = offsetOfTextBreakInline;
+}
+
 inline._escapes = /\\([!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~])/g;
 
 inline._scheme = /[a-zA-Z][a-zA-Z0-9+.-]{1,31}/;
@@ -572,10 +730,7 @@ inline.gfm = merge({}, inline.normal, {
     .getRegex(),
   _backpedal: /(?:[^?!.,:;*_~()&]+|\([^)]*\)|&(?![a-zA-Z0-9]+;$)|[?!.,:;*_~)]+(?!$))+/,
   del: /^~~(?=\S)([\s\S]*?\S)~~/,
-  text: edit(inline.text)
-    .replace(']|', '~]|')
-    .replace('|', '|https?://|ftp://|www\\.|[a-zA-Z0-9.!#$%&\'*+/=?^_`{\\|}~-]+@|')
-    .getRegex()
+  text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: .replace(']|', '~]|')
 });
 
 /**
@@ -584,7 +739,7 @@ inline.gfm = merge({}, inline.normal, {
 
 inline.breaks = merge({}, inline.gfm, {
   br: edit(inline.br).replace('{2,}', '*').getRegex(),
-  text: edit(inline.gfm.text).replace('{2,}', '*').getRegex()
+  text: { exec: offsetOfTextBreakInlineGFM } // TODO Missing: inline.gfm.text.replace('{2,}', '*')
 });
 
 /**
@@ -776,11 +931,28 @@ InlineLexer.prototype.output = function(src) {
     }
 
     // text
+    //log(`lexer: Matching text: ${this.rules.text.source}\n  <${src}>`);
     if (cap = this.rules.text.exec(src)) {
-      src = src.substring(cap[0].length);
-      out += this.renderer.text(escape(this.smartypants(cap[0])));
-      continue;
+      if (NEW_TEXT) {
+        log(`lexer: Match: ${JSON.stringify(cap)} ${cap.index}`);
+        var textLen = cap.index + 1;
+        // text is not in cap[0], so extract text before advancing src.
+        out += this.renderer.text(escape(this.smartypants(src.substr(0, textLen))));
+        src = src.substring(textLen);
+        continue;
+      } else {
+        var offInline = offsetOfTextBreakInline(src);
+        var offInlineGFM = offsetOfTextBreakInlineGFM(src);
+        console.log(`cap ${JSON.stringify(cap)}`);
+        console.log(`offInline ${JSON.stringify(offInline)}`);
+        console.log(`offInlineGFM ${JSON.stringify(offInlineGFM)}`);
+        console.log(`regex ${cap[0].length} offInline ${offInline[0].length} offInlineGFM ${offInlineGFM[0].length}`);
+        src = src.substring(cap[0].length);
+        out += this.renderer.text(escape(this.smartypants(cap[0])));
+        continue;
+      }
     }
+    log(`lexer: Mismatch`);
 
     if (src) {
       throw new Error('Infinite loop on byte: ' + src.charCodeAt(0));
@@ -923,7 +1095,12 @@ Renderer.prototype.listitem = function(text) {
 };
 
 Renderer.prototype.paragraph = function(text) {
-  return '<p>' + text + '</p>\n';
+  var lines = text.split(/\n/);
+  lines = lines.map(function(l) {
+    return l.replace(/^ +/, '');
+  });
+
+  return '<p>' + lines.join('\n') + '</p>\n';
 };
 
 Renderer.prototype.table = function(header, body) {
@@ -1274,7 +1451,7 @@ function resolveUrl(base, href) {
     if (/^[^:]+:\/*[^/]*$/.test(base)) {
       baseUrls[' ' + base] = base + '/';
     } else {
-      baseUrls[' ' + base] = base.replace(/[^/]*$/, '');
+      baseUrls[' ' + base] = rtrim(base, '/', true);
     }
   }
   base = baseUrls[' ' + base];
@@ -1320,6 +1497,38 @@ function splitCells(tableRow) {
   return cells;
 }
 
+/**
+ * Strip characters from the end of a string without using a regex.
+ *
+ * By default every trailing occurrence of `c` is removed. With `allButC`
+ * truthy the test is inverted: trailing characters are removed until the
+ * last occurrence of `c`, which is kept (the whole string goes if `c` never
+ * occurs).
+ *
+ * @param {string} str - string to trim.
+ * @param {string} c - a single character; each trailing character of `str`
+ *   is compared against it with ===.
+ * @param {boolean} [allButC=false] - strip everything that is NOT `c`.
+ *   Only its truthiness matters, so an explicit `false` behaves like omitting it.
+ * @returns {string} the leading part of `str` that survives trimming.
+ */
+function rtrim(str, c, allButC) {
+  var stripNonMatching = Boolean(allButC);
+
+  // `end` is the length of the prefix to keep.
+  var end = str.length;
+  // Keep stripping while "char equals c" disagrees with the inverted flag:
+  // normal mode strips matches, inverted mode strips non-matches.
+  while (end > 0 && (str.charAt(end - 1) === c) !== stripNonMatching) {
+    end--;
+  }
+
+  return str.slice(0, end);
+}
+
 /**
  * Marked
  */
@@ -1455,6 +1664,8 @@ marked.defaults = marked.getDefaults();
  * Expose
  */
 
+// NOTE(review): looks like a leftover debugging call - it invokes marked() on
+// a fixed sample string and discards the result. Confirm it can be removed.
+marked(' # # ####A');
+
 marked.Parser = Parser;
 marked.parser = Parser.parse;
 

diff --git a/test/new/nogfm_hashtag.md b/test/new/nogfm_hashtag.md
@@ -1,5 +1,5 @@
 ---
-gfm: false
+pedantic: true
 ---
 #header