fletcher
diff --git a/‎CMakeLists.txt
+8-6 b/‎CMakeLists.txt
+8-6
diff --git a/‎QuickStart.fodt
+545 b/‎QuickStart.fodt
+545
diff --git a/‎QuickStart.html
+277 b/‎QuickStart.html
+277
diff --git a/‎QuickStart.pdf
82.5 KB b/‎QuickStart.pdf
82.5 KB
@@ -8,10 +8,10 @@ cmake_minimum_required (VERSION 2.6)
 set (My_Project_Title "MultiMarkdown")
 set (My_Project_Description "Lightweight markup processor to produce HTML, LaTeX, and more.")
 set (My_Project_Author "Fletcher T. Penney")
-set (My_Project_Revised_Date "2017-03-05")
-set (My_Project_Version_Major 0)
-set (My_Project_Version_Minor 4)
-set (My_Project_Version_Patch 2b)
+set (My_Project_Revised_Date "2017-03-09")
+set (My_Project_Version_Major 6)
+set (My_Project_Version_Minor 0)
+set (My_Project_Version_Patch -b1)
 
 set (My_Project_Version "${My_Project_Version_Major}.${My_Project_Version_Minor}.${My_Project_Version_Patch}")
 
@@ -172,6 +172,7 @@ configure_file (
 
 # src_files are the primary files, and will be included in doxygen documentation
 set(src_files
+	Sources/libMultiMarkdown/aho-corasick.c
 	Sources/libMultiMarkdown/beamer.c
 	Sources/libMultiMarkdown/char.c
 	Sources/libMultiMarkdown/d_string.c
@@ -194,6 +195,7 @@ set(src_files
 
 # Primary header files, also for doxygen documentation
 set(header_files
+	Sources/libMultiMarkdown/aho-corasick.h
 	Sources/libMultiMarkdown/beamer.h
 	Sources/libMultiMarkdown/char.h
 	Sources/libMultiMarkdown/include/d_string.h
@@ -567,6 +569,6 @@ ADD_MMD_TEST(mmd-6-latex "-t latex" MMD6Tests tex)
 
 ADD_MMD_TEST(mmd-6-odf "-t odf" MMD6Tests fodt)
 
-ADD_MMD_TEST(pathologic "" ../build html)
-
 ADD_MMD_TEST(pathologic-compat "-c" ../build html)
+
+ADD_MMD_TEST(pathologic "" ../build html)
@@ -0,0 +1,277 @@
+<!DOCTYPE html>
+<html>
+<head>
+	<meta charset="utf-8"/>
+	<title>MultiMarkdown v6 Quick Start Guide</title>
+	<meta name="author" content="Fletcher T. Penney"/>
+	<meta name="version" content="6.0-b"/>
+</head>
+<body>
+
+<div class="TOC">
+
+<ul>
+<li><a href="#introduction">Introduction </a></li>
+<li><a href="#performance">Performance </a></li>
+<li><a href="#parsetree">Parse Tree </a></li>
+<li><a href="#features">Features </a>
+<ul>
+<li><a href="#abbreviationsoracronyms">Abbreviations (Or Acronyms) </a></li>
+<li><a href="#citations">Citations </a></li>
+<li><a href="#criticmarkup">CriticMarkup </a></li>
+<li><a href="#emphandstrong">Emph and Strong </a></li>
+<li><a href="#fencedcodeblocks">Fenced Code Blocks </a></li>
+<li><a href="#glossaryterms">Glossary Terms </a></li>
+<li><a href="#internationalization">Internationalization </a></li>
+<li><a href="#metadata">Metadata </a></li>
+<li><a href="#tableofcontents">Table of Contents </a></li>
+</ul>
+</li>
+<li><a href="#futuresteps">Future Steps </a></li>
+</ul>
+</div>
+
+<h3 id="introduction">Introduction </h3>
+
+<p>Version: 6.0-b</p>
+
+<p>This document serves as a description of MultiMarkdown (<abbr title="MultiMarkdown">MMD</abbr>) v6, as well as a sample
+document to demonstrate the various features. Specifically, differences from
+<abbr title="MultiMarkdown">MMD</abbr> v5 will be pointed out.</p>
+
+<h3 id="performance">Performance </h3>
+
+<p>A big motivating factor leading to the development of <abbr title="MultiMarkdown">MMD</abbr> v6 was
+performance. When <abbr title="MultiMarkdown">MMD</abbr> first migrated from Perl to C (based on
+<a href="https://github.com/jgm/peg-markdown">peg-markdown</a>), it was among the fastest
+Markdown parsers available. That was many years ago, and the &#8220;competition&#8221;
+has made a great deal of progress since that time.</p>
+
+<p>When developing <abbr title="MultiMarkdown">MMD</abbr> v6, one of my goals was to keep <abbr title="MultiMarkdown">MMD</abbr> at least in the
+ballpark of the fastest processors. Of course, being <em>the</em> fastest would be
+fantastic, but I was more concerned with ensuring that the code was easily
+understood, and easily updated with new features in the future.</p>
+
+<p><abbr title="MultiMarkdown">MMD</abbr> v3 &#8211; v5 used a <a href="#gn:1" id="gnref:1" title="see glossary" class="glossary">PEG</a> to handle the parsing. This made it easy to
+understand the relationship between the <abbr title="MultiMarkdown">MMD</abbr> grammar and the parsing code,
+since they were one and the same. However, the parsing code generated by
+the parsers was not particularly fast, and was prone to troublesome edge
+cases with terrible performance characteristics.</p>
+
+<p>The first step in <abbr title="MultiMarkdown">MMD</abbr> v6 parsing is to break the source text into a series
+of tokens, which may consist of plain text, whitespace, or special characters
+such as &#8216;*&#8217;, &#8216;[&#8217;, etc. This chain of tokens is then used to perform the
+actual parsing.</p>
+
+<p><abbr title="MultiMarkdown">MMD</abbr> v6 divides the parsing into two separate phases, which actually fits
+more with Markdown&#8217;s design philosophically.</p>
+
+<ol>
+<li><p>Block parsing consists of identifying the &#8220;type&#8221; of each line of the
+source text, and grouping the lines into blocks (e.g. paragraphs, lists,
+blockquotes, etc.). Some blocks are a single line (e.g. ATX headers), and
+others can be many lines long. The block parsing in <abbr title="MultiMarkdown">MMD</abbr> v6 is handled
+by a parser generated by <a href="http://www.hwaci.com/sw/lemon/">lemon</a>. This
+parser allows the block structure to be more readily understood by
+non-programmers, but the generated parser is still fast.</p></li>
+<li><p>Span parsing consists of identifying Markdown/<abbr title="MultiMarkdown">MMD</abbr> structures that occur
+inside of blocks, such as links, images, strong, emph, etc. Most of these
+structures require matching pairs of tokens to specify where the span starts
+and where it ends. Most of these spans allow arbitrary levels of nesting as
+well. This made parsing them correctly in the PEG-based code difficult and
+slow. <abbr title="MultiMarkdown">MMD</abbr> v6 uses a different approach that is accurate and has good
+performance characteristics even with edge cases. Basically, it keeps a stack
+of each &#8220;opening&#8221; token as it steps through the token chain. When a &#8220;closing&#8221;
+token is found, it is paired with the most recent appropriate opener on the
+stack. Any tokens in between the opener and closer are removed, as they are
+not able to be matched any more. To avoid unnecessary searches for
+non-existent openers, the parser keeps track of which opening tokens have been
+discovered. This allows the parser to continue moving forwards without having
+to go backwards and re-parse any previously visited tokens.</p></li>
+</ol>
+
+<p>The result of this redesigned <abbr title="MultiMarkdown">MMD</abbr> parser is that it can parse short
+documents more quickly than <a href="http://commonmark.org/">CommonMark</a>, and takes
+only 15% &#8211; 20% longer to parse long documents. I have not delved too deeply
+into this, but I presume that CommonMark has a bit more &#8220;set-up&#8221; time that
+becomes expensive when parsing a short document (e.g. a paragraph or two). But
+this cost becomes negligible when parsing longer documents (e.g. file sizes of
+1 MB). So depending on your use case, CommonMark may well be faster than
+<abbr title="MultiMarkdown">MMD</abbr>, but we&#8217;re talking about splitting hairs here&#8230;. Recent comparisons
+show <abbr title="MultiMarkdown">MMD</abbr> v6 taking approximately 4.37 seconds to parse a 108 MB file
+(approximately 24.8 MB/second), and CommonMark took 3.72 seconds for the same
+file (29.2 MB/second). For comparison, <abbr title="MultiMarkdown">MMD</abbr> v5.4 took approximately 94
+seconds for the same file (1.15 MB/second).</p>
+
+<p>For a more realistic file of approximately 28 KB (the source of the Markdown Syntax
+web page), both <abbr title="MultiMarkdown">MMD</abbr> and CommonMark parse it too quickly to accurately
+measure. In fact, it requires a file consisting of the original file copied
+32 times over (0.85 MB) before <code>/usr/bin/env time</code> reports a time over the
+minimum threshold of 0.01 seconds for either program.</p>
+
+<p>There is still potentially room for additional optimization in <abbr title="MultiMarkdown">MMD</abbr>.
+However, even if I can&#8217;t close the performance gap with CommonMark on longer
+files, the additional features of <abbr title="MultiMarkdown">MMD</abbr> compared with Markdown in addition to
+the increased legibility of the source code of <abbr title="MultiMarkdown">MMD</abbr> (in my biased opinion
+anyway) make this project worthwhile.</p>
+
+<h3 id="parsetree">Parse Tree </h3>
+
+<p><abbr title="MultiMarkdown">MMD</abbr> v6 performs its parsing in the following steps:</p>
+
+<ol>
+<li><p>Start with a null-terminated string of source text (C style string)</p></li>
+<li><p>Lex string into token chain</p></li>
+<li><p>Parse token chain into blocks</p></li>
+<li><p>Parse tokens within each block into span level structures (e.g. strong,
+emph, etc.)</p></li>
+<li><p>Export the token tree into the desired output format (e.g. HTML, LaTeX,
+etc.) and return the resulting C style string</p>
+
+<p><strong>OR</strong></p></li>
+<li><p>Use the resulting token tree for your own purposes.</p></li>
+</ol>
+
+<p>The token tree (<a href="#gn:2" id="gnref:2" title="see glossary" class="glossary">AST</a>) includes starting offsets and length of each token,
+allowing you to use <abbr title="MultiMarkdown">MMD</abbr> as part of a syntax highlighter. <abbr title="MultiMarkdown">MMD</abbr> v5 did not
+have this functionality in the public version, in part because the PEG parsers
+used did not provide reliable offset positions, requiring a great deal of
+effort when I adapted MMD for use in <a href="http://multimarkdown.com/">MultiMarkdown
+Composer</a>.</p>
+
+<p>These steps are managed using the <code>mmd_engine</code> &#8220;object&#8221;. An individual
+<code>mmd_engine</code> cannot be used by multiple threads simultaneously, so if
+libMultiMarkdown is to be used in a multithreaded program, a separate
+<code>mmd_engine</code> should be created for each thread. Alternatively, just use the
+slightly more abstracted <code>mmd_convert_string()</code> function that handles creating
+and destroying the <code>mmd_engine</code> automatically.</p>
+
+<h3 id="features">Features </h3>
+
+<h4 id="abbreviationsoracronyms">Abbreviations (Or Acronyms) </h4>
+
+<p>This file includes the use of <abbr title="MultiMarkdown">MMD</abbr> as an abbreviation for MultiMarkdown. The
+abbreviation will be expanded on the first use, and the shortened form will be
+used on subsequent occurrences.</p>
+
+<p>Abbreviations can be specified using inline or reference syntax. The inline
+variant requires that the abbreviation be wrapped in parentheses and
+immediately follow the <code>&gt;</code>.</p>
+
+<pre><code>[>MMD] is an abbreviation.  So is [>(MD) Markdown].
+
+[>MMD]: MultiMarkdown
+</code></pre>
+
+<h4 id="citations">Citations </h4>
+
+<p>Citations can be specified using an inline syntax, just like inline footnotes.</p>
+
+<h4 id="criticmarkup">CriticMarkup </h4>
+
+<p><abbr title="MultiMarkdown">MMD</abbr> v6 has improved support for <a href="http://criticmarkup.com/">CriticMarkup</a>, both in terms of parsing, and
+in terms of support for each output format. You can <ins>insert text</ins>,
+<del>delete text</del>, substitute <del>one thing</del><ins>for another</ins>, <mark>highlight text</mark>,
+and <span class="critic comment">leave comments</span> in the text.</p>
+
+<h4 id="emphandstrong">Emph and Strong </h4>
+
+<p>The basics of emphasis and strong emphasis are unchanged, but the parsing
+engine has been improved to be more accurate, particularly in various edge
+cases where proper parsing can be difficult.</p>
+
+<h4 id="fencedcodeblocks">Fenced Code Blocks </h4>
+
+<p>Fenced code blocks are fundamentally the same as <abbr title="MultiMarkdown">MMD</abbr> v5, except:</p>
+
+<ol>
+<li><p>The leading and trailing fences can be 3, 4, or 5 backticks in length. That
+should be sufficient to account for complex documents without requiring a more
+complex parser.</p></li>
+<li><p>If there is no trailing fence, then everything after the leading fence is
+considered to be part of the code block.</p></li>
+</ol>
+
+<h4 id="glossaryterms">Glossary Terms </h4>
+
+<p>If there are terms in your document you wish to define in a <a href="#gn:3" id="gnref:3" title="see glossary" class="glossary">glossary</a> at
+the end of your document, you can define them using the glossary syntax.</p>
+
+<p>Glossary terms can be specified using inline or reference syntax. The inline
+variant requires that the term be wrapped in parentheses and
+immediately follow the <code>?</code>.</p>
+
+<pre><code>[?(glossary) The glossary collects information about important
+terms used in your document] is a glossary term.
+
+[?glossary] is also a glossary term.
+
+[?glossary]: The glossary collects information about important
+terms used in your document
+</code></pre>
+
+<h4 id="internationalization">Internationalization </h4>
+
+<p><abbr title="MultiMarkdown">MMD</abbr> v6 includes support for substituting certain text phrases in other
+languages. This only affects the HTML format.</p>
+
+<h4 id="metadata">Metadata </h4>
+
+<p>Metadata in <abbr title="MultiMarkdown">MMD</abbr> v6 includes new support for LaTeX &#8211; the <code>latex config</code> key
+allows you to automatically set up multiple <code>latex include</code> files at once.
+The default setups that I use would typically consist of one LaTeX file to be
+included at the top of the file, one to be included right at the beginning of
+the document, and one to be included at the end of the document. If you want
+to specify the latex files separately, you can use <code>latex leader</code>, <code>latex
+begin</code>, and <code>latex footer</code>.</p>
+
+<h4 id="tableofcontents">Table of Contents </h4>
+
+<p>By placing <code>{{TOC}}</code> in your document, you can insert an automatically
+generated Table of Contents in your document. As of <abbr title="MultiMarkdown">MMD</abbr> v6, the native
+Table of Contents functionality is used when exporting to LaTeX or
+OpenDocument formats.</p>
+
+<h3 id="futuresteps">Future Steps </h3>
+
+<p>Some features I plan to implement at some point:</p>
+
+<ol>
+<li><p><abbr title="MultiMarkdown">MMD</abbr> v5 used to automatically identify abbreviated terms throughout the
+document and substitute them automatically. I plan to reimplement this
+functionality, but will probably improve upon it to include glossary terms,
+and possibly even support for indexing documents in LaTeX (and possibly
+OpenOffice).</p></li>
+<li><p>OPML export support is not available in v6. I plan on adding improved
+support for this at some point. I was hoping to be able to re-use the
+existing v6 parser but it might be simpler to use the approach from v5 and
+earlier, which was to have a separate parser tuned to only identify headers
+and &#8220;stuff between headers&#8221;.</p></li>
+<li><p>Improved EPUB support. Currently, EPUB support is provided by a separate
+<a href="https://github.com/fletcher/MMD-ePub">tool</a>. At some point, I would like to
+better integrate this into <abbr title="MultiMarkdown">MMD</abbr> itself.</p></li>
+</ol>
+
+<div class="glossary">
+<hr />
+<ol>
+
+<li id="gn:1">
+PEG: <p>Parsing Expression Grammar <a href="https://en.wikipedia.org/wiki/Parsing_expression_grammar">https://en.wikipedia.org/wiki/Parsing_expression_grammar</a> <a href="#gnref:1" title="return to body" class="reverseglossary">&#160;&#8617;</a></p>
+</li>
+
+<li id="gn:2">
+AST: <p>Abstract Syntax Tree <a href="https://en.wikipedia.org/wiki/Abstract_syntax_tree">https://en.wikipedia.org/wiki/Abstract_syntax_tree</a> <a href="#gnref:2" title="return to body" class="reverseglossary">&#160;&#8617;</a></p>
+</li>
+
+<li id="gn:3">
+glossary: <p>The
+glossary collects information about important terms used in your document <a href="#gnref:3" title="return to body" class="reverseglossary">&#160;&#8617;</a></p>
+</li>
+
+</ol>
+</div>
+
+</body>
+</html>
+