Skip to content

Commit d8fd4d2

Browse files
committed
Backport patch from commonmark/cmark#376
Fixes #36 Fixes #14
1 parent 5b5d611 commit d8fd4d2

File tree

3 files changed

+66
-4
lines changed

3 files changed

+66
-4
lines changed

NEWS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
2.0.0
22
- The tagfilter extension for markdown_html now actually works (#15)
3+
- Backport patch to filter illegal control characters in markdown_xml()
34

45
1.9.5
56
- Fix parallel make problem

src/cmark/xml.c

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,70 @@
77
#include "cmark-gfm.h"
88
#include "node.h"
99
#include "buffer.h"
10-
#include "houdini.h"
1110
#include "syntax_extension.h"
1211

1312
#define BUFFER_SIZE 100
1413
#define MAX_INDENT 40
1514

1615
// Functions to convert cmark_nodes to XML strings.
1716

18-
static void escape_xml(cmark_strbuf *dest, const unsigned char *source,
19-
bufsize_t length) {
20-
houdini_escape_html0(dest, source, length, 0);
17+
// C0 control characters (other than tab, LF and CR), U+FFFE and U+FFFF aren't
// allowed in XML.
18+
// Per-byte action code looked up by escape_xml():
//   0    - the byte is copied through unchanged
//   1    - the byte is replaced with XML_ESCAPES[1], i.e. UTF8_REPL
//          (every byte 0x00-0x1F except 0x09, 0x0A and 0x0D)
//   2..5 - the byte is replaced with XML_ESCAPES[code]
//          (0x22 '"', 0x26 '&', 0x3C '<', 0x3E '>')
//   9    - the byte is 0xBE or 0xBF, which may be the final byte of the UTF-8
//          encoding of U+FFFE / U+FFFF; escape_xml() checks the two preceding
//          bytes before deciding what to emit
static const char XML_ESCAPE_TABLE[256] = {
19+
/* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
20+
/* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21+
/* 0x20 */ 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
22+
/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 5, 0,
23+
/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24+
/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25+
/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26+
/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27+
/* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28+
/* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29+
/* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30+
/* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9,
31+
/* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
32+
/* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
33+
/* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
34+
/* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
35+
};
36+
37+
// U+FFFD Replacement Character encoded in UTF-8
38+
#define UTF8_REPL "\xEF\xBF\xBD"
39+
40+
static const char *XML_ESCAPES[] = {
41+
"", UTF8_REPL, """, "&", "<", ">"
42+
};
43+
44+
// Writes the `size` bytes at `src` to `ob`, escaping them for XML output.
//
// Each byte is classified through XML_ESCAPE_TABLE:
//   - runs of code-0 bytes are passed to cmark_strbuf_put() in one call;
//   - code 9 (byte 0xBE or 0xBF): if the two preceding input bytes are
//     0xEF 0xBF, the byte 0xBD is written instead, so the three-byte sequence
//     ends up as EF BF BD (U+FFFD); otherwise the byte is written unchanged;
//   - any other nonzero code writes the string XML_ESCAPES[code].
//
// NOTE(review): the two-byte lookback presumably relies on `src` being valid
// UTF-8 - confirm against the callers.
static void escape_xml(cmark_strbuf *ob, const unsigned char *src,
45+
bufsize_t size) {
46+
bufsize_t i = 0, org, esc = 0;
47+
48+
while (i < size) {
49+
org = i;
50+
// Advance over bytes that need no escaping; on exit with i < size, esc
// holds the nonzero table code of src[i].
while (i < size && (esc = XML_ESCAPE_TABLE[src[i]]) == 0)
51+
i++;
52+
53+
// Flush the unescaped run [org, i) in a single call.
if (i > org)
54+
cmark_strbuf_put(ob, src + org, i - org);
55+
56+
// The run reached the end of the input: nothing left to escape.
if (i >= size)
57+
break;
58+
59+
if (esc == 9) {
60+
// To replace U+FFFE and U+FFFF with U+FFFD, only the last byte has to
61+
// be changed.
62+
// We know that src[i] is 0xBE or 0xBF.
63+
if (i >= 2 && src[i-2] == 0xEF && src[i-1] == 0xBF) {
64+
cmark_strbuf_putc(ob, 0xBD);
65+
} else {
66+
// 0xBE / 0xBF not preceded by EF BF: write the byte unchanged.
cmark_strbuf_putc(ob, src[i]);
67+
}
68+
} else {
69+
cmark_strbuf_puts(ob, XML_ESCAPES[esc]);
70+
}
71+
72+
// Step past the byte that was just handled.
i++;
73+
}
2174
}
2275

2376
struct render_state {

tests/testthat/test-parser.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#If this breaks you need to re-apply https://github.com/commonmark/cmark/pull/376
2+
3+
# Regression test: a control character in the markdown input must come back
# as U+FFFD once the output of markdown_xml() has been parsed as XML.
test_that("illegal unicode is replaced with tofu", {
4+
# "\023" is an octal escape, i.e. the single control character U+0013.
text <- "foo\023bar"
5+
xml <- markdown_xml(text)
6+
doc <- xml2::read_xml(xml)
7+
expect_equal(xml2::xml_text(doc), "foo\uFFFDbar")
8+
})

0 commit comments

Comments
 (0)