Skip to content

Commit 01cb296

Browse files
committed
Reimplement HTML Serializer for better ns cleaning
1 parent d0013f2 commit 01cb296

File tree

4 files changed

+180
-27
lines changed

4 files changed

+180
-27
lines changed

composer.json

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"php" : "^8.2",
1717
"ext-dom" : "*",
1818
"ext-libxml": "*",
19+
"ext-xmlwriter": "*",
1920
"theseer/css2xpath": "^2.0"
2021
},
2122
"autoload": {

src/autoload.php

-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ function($class) {
2929
'templado\\engine\\mergelistexception' => '/merger/MergeListException.php',
3030
'templado\\engine\\merger' => '/merger/Merger.php',
3131
'templado\\engine\\mergerexception' => '/merger/MergerException.php',
32-
'templado\\engine\\namespacecleaningtransformation' => '/transformation/NamespaceCleaningTransformation.php',
3332
'templado\\engine\\notdefined' => '/viewmodel/NotDefined.php',
3433
'templado\\engine\\parsingexception' => '/document/ParsingException.php',
3534
'templado\\engine\\remove' => '/viewmodel/Remove.php',

src/serializer/HTMLSerializer.php

+136-24
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,16 @@
99
*/
1010
namespace Templado\Engine;
1111

12+
use DOMAttr;
1213
use DOMDocument;
14+
use DOMElement;
15+
use DOMNameSpaceNode;
16+
use DOMNode;
17+
use DOMXPath;
18+
use XMLWriter;
19+
use function assert;
20+
use const LIBXML_NOEMPTYTAG;
21+
use const LIBXML_NOXMLDECL;
1322

1423
class HTMLSerializer implements Serializer {
1524
private bool $stripRDFaFlag = false;
@@ -20,6 +29,10 @@ class HTMLSerializer implements Serializer {
2029

2130
private bool $withDoctypeFlag = true;
2231

32+
private const HTMLNS = 'http://www.w3.org/1999/xhtml';
33+
34+
private bool $isFirst;
35+
2336
/** @psalm-var list<Filter> */
2437
private array $filters = [];
2538

@@ -63,13 +76,6 @@ public function addFilter(Filter $filter): self {
6376
}
6477

6578
public function serialize(DOMDocument $document): string {
66-
if ($this->namespaceCleaningFlag) {
67-
$this->transformations[] = new NamespaceCleaningTransformation();
68-
}
69-
70-
if ($this->stripRDFaFlag) {
71-
$this->transformations[] = new StripRDFaAttributesTransformation;
72-
}
7379

7480
if (!empty($this->transformations)) {
7581
(new TransformationProcessor())->process(
@@ -78,34 +84,140 @@ public function serialize(DOMDocument $document): string {
7884
);
7985
}
8086

87+
$xmlString = $this->namespaceCleaningFlag ?
88+
$this->serializeToCleanedString($document) :
89+
$this->serializeToBasicString($document);
90+
91+
$this->filters[] = new EmptyElementsFilter();
92+
93+
foreach ($this->filters as $filter) {
94+
$xmlString = $filter->apply($xmlString);
95+
}
96+
97+
return $xmlString;
98+
}
99+
100+
/**
 * Serializes the document through an in-memory XMLWriter.
 *
 * The XML declaration is emitted only when keepXMLHeaderFlag is set and the
 * "html" doctype only when withDoctypeFlag is set. The element tree itself
 * is written by walk(), starting at the document element with no prefixes
 * known yet.
 */
private function serializeToCleanedString(DOMDocument $document): string {
    $withXMLHeader = $this->keepXMLHeaderFlag;

    $output = new XMLWriter();
    $output->openMemory();
    $output->setIndent(true);
    $output->setIndentString(' ');

    if ($withXMLHeader) {
        $output->startDocument();
    }

    if ($this->withDoctypeFlag) {
        $output->writeDtd('html');
    }

    // walk() consults this flag to decide whether the element it writes is the first one
    $this->isFirst = true;
    $this->walk($output, $document->documentElement, []);

    if ($withXMLHeader) {
        $output->endDocument();
    }

    return $output->outputMemory();
}
124+
125+
/**
 * Recursively writes $node and its subtree to $writer, declaring every
 * namespace only at the point where it is first needed.
 *
 * Non-element nodes are copied verbatim using DOMDocument::saveXML().
 * Elements in the XHTML namespace, or in no namespace, are written without
 * a prefix and bound to the XHTML namespace. Other unprefixed elements keep
 * their own default namespace; prefixed elements keep their qualified name.
 *
 * The default namespace currently in scope is tracked through the recursion,
 * so an element whose namespace differs from the inherited default always
 * gets its own xmlns declaration (e.g. XHTML nested inside a foreign default
 * namespace) while children sharing the inherited default do not repeat it.
 * A prefix is (re)declared whenever it is unknown or bound to a different
 * URI than the one recorded by an ancestor.
 *
 * @param array<string, string> $knownPrefixes   prefix => namespace URI pairs already declared on ancestors
 * @param ?string               $defaultNamespace default namespace declared by the nearest ancestor, null at the root
 */
private function walk(XMLWriter $writer, DOMNode $node, array $knownPrefixes, ?string $defaultNamespace = null): void {
    assert($node->ownerDocument instanceof DOMDocument);

    if (!$node instanceof DOMElement) {
        $writer->writeRaw(
            $node->ownerDocument->saveXML($node)
        );

        return;
    }

    $isHtml = $node->namespaceURI === self::HTMLNS || empty($node->namespaceURI);

    if ($isHtml || empty($node->prefix)) {
        $namespace = $isHtml ? self::HTMLNS : $node->namespaceURI;

        $writer->startElement($node->localName);

        // Only declare the default namespace when it changes relative to the parent scope
        if ($namespace !== $defaultNamespace) {
            $writer->writeAttribute('xmlns', $namespace);
            $defaultNamespace = $namespace;
        }
    } else {
        $writer->startElement($node->nodeName);

        if (($knownPrefixes[$node->prefix] ?? null) !== $node->namespaceURI) {
            $writer->writeAttribute('xmlns:' . $node->prefix, $node->namespaceURI);
            $knownPrefixes[$node->prefix] = $node->namespaceURI;
        }
    }

    foreach ($node->attributes as $attribute) {
        assert($attribute instanceof DOMAttr);

        if ($this->stripRDFaFlag && in_array($attribute->name, ['property', 'resource', 'prefix', 'typeof', 'vocab'], true)) {
            continue;
        }

        if (empty($attribute->prefix)) {
            $writer->writeAttribute($attribute->name, $attribute->value);

            continue;
        }

        $attributeNamespace = $attribute->namespaceURI ?? $node->lookupNamespaceURI($attribute->prefix);

        // The "xml" prefix is bound implicitly and never needs a declaration
        if ($attribute->prefix !== 'xml' &&
            $attributeNamespace !== null &&
            ($knownPrefixes[$attribute->prefix] ?? null) !== $attributeNamespace
        ) {
            $writer->writeAttribute('xmlns:' . $attribute->prefix, $attributeNamespace);
            $knownPrefixes[$attribute->prefix] = $attributeNamespace;
        }

        $writer->writeAttribute(
            $attribute->nodeName,
            $attribute->value
        );
    }

    // Carry over namespace declarations that are in scope but not used by this element or its attributes
    foreach ((new DOMXPath($node->ownerDocument))->query('./namespace::*', $node) as $nsNode) {
        assert($nsNode instanceof DOMNameSpaceNode);

        if (empty($nsNode->prefix) || $nsNode->prefix === 'xml') {
            continue;
        }

        if ($nsNode->nodeValue === self::HTMLNS) {
            continue;
        }

        // Skip only when the very same binding is already in effect; a rebound prefix must be redeclared
        if (($knownPrefixes[$nsNode->prefix] ?? null) === $nsNode->nodeValue) {
            continue;
        }

        assert($nsNode->nodeValue !== null);
        $writer->writeAttribute('xmlns:' . $nsNode->prefix, $nsNode->nodeValue);
        $knownPrefixes[$nsNode->prefix] = $nsNode->nodeValue;
    }

    foreach ($node->childNodes as $childNode) {
        $this->walk($writer, $childNode, $knownPrefixes, $defaultNamespace);
    }

    // Always emit an explicit closing tag, even for elements without content
    $writer->fullEndElement();
}
100204

101-
private function enforceHTML5DocType(DOMDocument $document): DOMDocument {
102-
$tmp = new DOMDocument();
103-
$tmp->loadXML('<?xml version="1.0" ?><!DOCTYPE html><html />');
104-
$tmp->replaceChild(
105-
$tmp->importNode($document->documentElement, true),
106-
$tmp->documentElement
107-
);
205+
/**
 * Serializes the document element with DOMDocument::saveXML(), without any
 * namespace cleanup.
 *
 * The "html" doctype is prepended when withDoctypeFlag is set and an XML
 * declaration (using the document's encoding, "utf-8" if none is set) when
 * keepXMLHeaderFlag is set. The result always ends with a newline.
 *
 * NOTE(review): stripRDFaFlag is not consulted on this path; in the code
 * visible here only walk() reads it — confirm whether RDFa attributes are
 * expected to be stripped when namespace cleaning is disabled.
 */
private function serializeToBasicString(DOMDocument $document): string {
    // Enable pretty printing for this call only, so the caller's document keeps its own setting
    $previousFormatOutput = $document->formatOutput;
    $document->formatOutput = true;

    try {
        $xmlString = $document->saveXML($document->documentElement, options: LIBXML_NOEMPTYTAG);
    } finally {
        $document->formatOutput = $previousFormatOutput;
    }

    if ($this->withDoctypeFlag) {
        $xmlString = "<!DOCTYPE html>\n" . $xmlString;
    }

    if ($this->keepXMLHeaderFlag) {
        $xmlString = sprintf(
            '<?xml version="1.0" encoding="%s" ?>',
            $document->encoding ?? 'utf-8'
        ) . "\n" . $xmlString;
    }

    return $xmlString . "\n";
}
222+
111223
}

tests/serializer/HTMLSerializerTest.php

+43-2
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,14 @@
55
use PHPUnit\Framework\Attributes\CoversClass;
66
use PHPUnit\Framework\Attributes\UsesClass;
77
use PHPUnit\Framework\TestCase;
8+
use const LIBXML_NOEMPTYTAG;
89

910
#[CoversClass(HTMLSerializer::class)]
1011
#[UsesClass(Document::class)]
1112
#[UsesClass(EmptyElementsFilter::class)]
12-
#[UsesClass(NamespaceCleaningTransformation::class)]
1313
#[UsesClass(Selection::class)]
1414
#[UsesClass(StaticNodeList::class)]
1515
#[UsesClass(TransformationProcessor::class)]
16-
#[UsesClass(XMLHeaderFilter::class)]
1716
#[UsesClass(XPathSelector::class)]
1817
#[UsesClass(StripRDFaAttributesTransformation::class)]
1918
class HTMLSerializerTest extends TestCase {
@@ -77,6 +76,48 @@ public function testAddedTransformationGetsApplies(): void {
7776
);
7877
}
7978

79+
/**
 * With namespace cleaning disabled and the XML header kept, the output
 * starts with an XML declaration carrying the fallback "utf-8" encoding.
 */
public function testXMLHeaderIsKeptWhenNotCleaning(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<html xmlns="http://www.w3.org/1999/xhtml" />');

    $this->assertSame(
        '<?xml version="1.0" encoding="utf-8" ?>' . "\n" . '<html xmlns="http://www.w3.org/1999/xhtml"></html>' . "\n",
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->disableNamespaceCleaning()->serialize($dom)
    );
}
89+
90+
/**
 * A prefixed attribute on the root element is written together with the
 * declaration of its namespace prefix.
 */
public function testNamespacedAttributesGetSerializedCorrectly(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<?xml version="1.0" ?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a" a:attr="value" />');

    $this->assertSame(
        '<?xml version="1.0"?>' . "\n" . '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a" a:attr="value"></html>' . "\n",
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->serialize($dom)
    );
}
100+
101+
/**
 * Prefixed elements reuse a prefix declared on an ancestor, declare a new
 * prefix locally, and an unprefixed foreign element carries its own
 * default namespace declaration.
 */
public function testNamespacedElementsGetSerializedCorrectly(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<?xml version="1.0" ?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a"><a:foo /><b:foo xmlns:b="urn:b" /><c xmlns="urn:c" /></html>');

    $this->assertSame(
        implode("\n", [
            '<?xml version="1.0"?>',
            '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a">',
            ' <a:foo></a:foo>',
            ' <b:foo xmlns:b="urn:b"></b:foo>',
            ' <c xmlns="urn:c"></c>',
            '</html>' . "\n"
        ]),
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->serialize($dom)
    );
}
118+
119+
120+
80121
/**
 * Loads the shared serializer input fixture as a Document.
 *
 * Fails the calling test with a readable message when the fixture file
 * cannot be read, instead of handing `false` to Document::fromString().
 */
private function createInputDocument(): Document {
    $fixture = __DIR__ . '/../_data/serializer/input.xml';
    $xml = file_get_contents($fixture);

    if ($xml === false) {
        $this->fail(sprintf('Unable to read fixture file "%s"', $fixture));
    }

    return Document::fromString($xml);
}

0 commit comments

Comments
 (0)