Skip to content

Commit 4505f4f

Browse files
authored
XML: modernize API when available & workaround issues with legacy versions (#15899)
Uses the libxml2 per-context error handlers when available. For example, we can always use the one for `XML::Reader`, since it has been available since at least libxml2 2.9 (released in 2012), which becomes the default expected libxml2 version. Sadly, the other per-context error handlers only appeared in libxml2 2.13 (released in 2024), so we can't assume they exist, but we can still use them when compiling against that version. This patch adds libxml2 version detection using `pkg-config`. We can specify the `LIBXML_VERSION` environment variable to target another release if the runtime version will be different (though it is highly recommended to use libxml2 2.13+), or for the Windows MSVC target. On older libxml2 releases, errors can be raised per context (e.g. the XML reader) _and_ through the structured error handler (now deprecated) _and_ through the older generic error handler (long deprecated). For these older releases, this patch still sets the globals (actually thread locals), be they error handlers or some XML save configuration, but saves them when the fiber might `yield`, restoring the default handlers in the meantime; it eventually restores the globals when the fiber is resumed, so the current thread will see the values previously configured for the fiber, regardless of whether another fiber did something else in between or the fiber got resumed by another thread (execution context).
1 parent c5af983 commit 4505f4f

File tree

9 files changed

+256
-92
lines changed

9 files changed

+256
-92
lines changed

spec/std/xml/html_spec.cr

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ describe XML do
6060
it "parses html5 (#1404)" do
6161
html5 = "<html><body><nav>Test</nav></body></html>"
6262
xml = XML.parse_html(html5)
63-
xml.errors.should_not be_nil
6463
xml.xpath_node("//html/body/nav").should_not be_nil
6564
end
6665

spec/std/xml/reader_spec.cr

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -567,15 +567,16 @@ module XML
567567
reader.to_unsafe.should be_a(LibXML::XMLTextReader)
568568
end
569569
end
570-
end
571570

572-
describe "#errors" do
573-
it "makes errors accessible" do
574-
reader = XML::Reader.new(%(<people></foo>))
575-
reader.read
576-
reader.expand?
571+
describe "#errors" do
572+
it "makes errors accessible" do
573+
options = XML::ParserOptions::RECOVER | XML::ParserOptions::NONET
574+
reader = XML::Reader.new(%(<people></foo>), options)
575+
reader.read
576+
reader.expand?
577577

578-
reader.errors.map(&.to_s).should eq ["Opening and ending tag mismatch: people line 1 and foo"]
578+
reader.errors.map(&.to_s).should eq ["Opening and ending tag mismatch: people line 1 and foo"]
579+
end
579580
end
580581
end
581582
end

spec/std/xml/xml_spec.cr

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,9 @@ describe XML do
159159
end
160160

161161
it "#errors" do
162-
xml = XML.parse(%(<people></foo>))
162+
options = XML::ParserOptions::RECOVER | XML::ParserOptions::NONET
163+
164+
xml = XML.parse(%(<people></foo>), options)
163165
xml.root.not_nil!.name.should eq("people")
164166
xml.errors.try(&.map(&.to_s)).should eq ["Opening and ending tag mismatch: people line 1 and foo"]
165167

src/xml.cr

Lines changed: 67 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
require "./xml/libxml2"
2+
13
# The XML module allows parsing and generating [XML](https://www.w3.org/XML/) documents.
24
#
35
# NOTE: To use `XML`, you must explicitly import it with `require "xml"`
@@ -54,77 +56,101 @@ module XML
5456
# See `ParserOptions.default` for default options.
5557
def self.parse(string : String, options : ParserOptions = ParserOptions.default) : Node
5658
raise XML::Error.new("Document is empty", 0) if string.empty?
57-
from_ptr { LibXML.xmlReadMemory(string, string.bytesize, nil, nil, options) }
59+
ctxt = LibXML.xmlNewParserCtxt
60+
from_ptr(ctxt) do
61+
LibXML.xmlCtxtReadMemory(ctxt, string, string.bytesize, nil, nil, options)
62+
end
5863
end
5964

6065
# Parses an XML document from *io* with *options* into an `XML::Node`.
6166
#
6267
# See `ParserOptions.default` for default options.
6368
def self.parse(io : IO, options : ParserOptions = ParserOptions.default) : Node
64-
from_ptr { LibXML.xmlReadIO(
65-
->(ctx, buffer, len) {
66-
LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len))
67-
},
68-
->(ctx) { 0 },
69-
Box(IO).box(io),
70-
nil,
71-
nil,
72-
options,
73-
) }
69+
ctxt = LibXML.xmlNewParserCtxt
70+
from_ptr(ctxt) do
71+
LibXML.xmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, nil, options)
72+
end
7473
end
7574

7675
# Parses an HTML document from *string* with *options* into an `XML::Node`.
7776
#
7877
# See `HTMLParserOptions.default` for default options.
7978
def self.parse_html(string : String, options : HTMLParserOptions = HTMLParserOptions.default) : Node
8079
raise XML::Error.new("Document is empty", 0) if string.empty?
81-
from_ptr { LibXML.htmlReadMemory(string, string.bytesize, nil, "utf-8", options) }
80+
ctxt = LibXML.htmlNewParserCtxt
81+
from_ptr(ctxt) do
82+
LibXML.htmlCtxtReadMemory(ctxt, string, string.bytesize, nil, "utf-8", options)
83+
end
8284
end
8385

8486
# Parses an HTML document from *io* with *options* into an `XML::Node`.
8587
#
8688
# See `HTMLParserOptions.default` for default options.
8789
def self.parse_html(io : IO, options : HTMLParserOptions = HTMLParserOptions.default) : Node
88-
from_ptr { LibXML.htmlReadIO(
89-
->(ctx, buffer, len) {
90-
LibC::Int.new(Box(IO).unbox(ctx).read Slice.new(buffer, len))
91-
},
92-
->(ctx) { 0 },
93-
Box(IO).box(io),
94-
nil,
95-
"utf-8",
96-
options,
97-
) }
90+
ctxt = LibXML.htmlNewParserCtxt
91+
from_ptr(ctxt) do
92+
LibXML.htmlCtxtReadIO(ctxt, ->read_callback, ->close_callback, Box(IO).box(io), nil, "utf-8", options)
93+
end
9894
end
9995

100-
protected def self.from_ptr(& : -> LibXML::Doc*)
101-
errors = [] of XML::Error
102-
doc = XML::Error.collect(errors) { yield }
96+
protected def self.read_callback(data : Void*, buffer : UInt8*, len : LibC::Int) : LibC::Int
97+
io = Box(IO).unbox(data)
98+
buf = Slice.new(buffer, len)
99+
ret = {% if LibXML.has_method?(:xmlCtxtSetErrorHandler) %}
100+
io.read(buf)
101+
{% else %}
102+
XML::Error.default_handlers { io.read(buf) }
103+
{% end %}
104+
LibC::Int.new(ret)
105+
end
103106

107+
protected def self.close_callback(data : Void*) : LibC::Int
108+
LibC::Int.new(0)
109+
end
110+
111+
protected def self.from_ptr(ctxt, & : -> LibXML::Doc*)
112+
errors = [] of XML::Error
113+
doc =
114+
{% if LibXML.has_method?(:xmlCtxtSetErrorHandler) %}
115+
LibXML.xmlCtxtSetErrorHandler(ctxt, ->Error.structured_callback, Box.box(errors))
116+
yield
117+
{% else %}
118+
XML::Error.unsafe_collect(errors) { yield }
119+
{% end %}
104120
raise Error.new(LibXML.xmlGetLastError) unless doc
105121

106122
Node.new(doc, errors)
107123
end
108124

109-
protected def self.with_indent_tree_output(indent : Bool, &)
110-
ptr = LibXML.__xmlIndentTreeOutput
111-
old, ptr.value = ptr.value, indent ? 1 : 0
112-
begin
113-
yield
114-
ensure
115-
ptr.value = old
125+
{% unless LibXML.has_method?(:xmlSaveSetIndentString) %}
126+
# NOTE: These helpers are for internal compatibility with libxml < 2.14.
127+
128+
protected def self.with_indent_tree_output(indent : Bool, &)
129+
save_indent_tree_output do
130+
LibXML.__xmlIndentTreeOutput.value = indent ? 1 : 0
131+
yield
132+
end
116133
end
117-
end
118134

119-
protected def self.with_tree_indent_string(string : String, &)
120-
ptr = LibXML.__xmlTreeIndentString
121-
old, ptr.value = ptr.value, string.to_unsafe
122-
begin
123-
yield
124-
ensure
125-
ptr.value = old
135+
protected def self.save_indent_tree_output(&)
136+
value = LibXML.__xmlIndentTreeOutput.value
137+
begin
138+
yield
139+
ensure
140+
LibXML.__xmlIndentTreeOutput.value = value
141+
end
126142
end
127-
end
143+
144+
protected def self.with_tree_indent_string(string : String, &)
145+
value = LibXML.__xmlTreeIndentString.value
146+
LibXML.__xmlTreeIndentString.value = string.to_unsafe
147+
begin
148+
yield
149+
ensure
150+
LibXML.__xmlTreeIndentString.value = value
151+
end
152+
end
153+
{% end %}
128154

129155
class_getter libxml2_version : String do
130156
version_string = String.new(LibXML.xmlParserVersion)

src/xml/error.cr

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,27 +16,83 @@ class XML::Error < Exception
1616
{% raise "`XML::Error.errors` was removed because it leaks memory when it's not used. XML errors are accessible directly in the respective context via `XML::Reader#errors` and `XML::Node#errors`.\nSee https://github.com/crystal-lang/crystal/issues/14934 for details. " %}
1717
end
1818

19-
def self.collect(errors, &)
20-
LibXML.xmlSetStructuredErrorFunc Box.box(errors), ->(ctx, error) {
21-
Box(Array(XML::Error)).unbox(ctx) << XML::Error.new(error)
22-
}
19+
protected def self.structured_callback(data : Void*, error : LibXML::Error*) : Nil
20+
Box(Array(Error)).unbox(data) << Error.new(error)
21+
end
22+
23+
protected def self.generic_callback(data : Void*, fmt : UInt8*) : Nil
24+
message = String.new(fmt).chomp
25+
Box(Array(Error)).unbox(data) << XML::Error.new(message, 0)
26+
end
27+
28+
# Saves the global error handlers (and user data) for the current thread,
29+
# replaces them with a custom handler to record reported XML errors in
30+
# *errors*, and eventually restores the saved error handlers (and user data)
31+
# before returning.
32+
#
33+
# Saves both structured + generic handlers because libxml < 2.13 uses *both* in
34+
# practice.
35+
#
36+
# NOTE: This is for internal compatibility with libxml < 2.13. Do not use.
37+
protected def self.unsafe_collect(errors : Array(Error), &)
38+
data = Box.box(errors)
39+
with_handlers(data, ->structured_callback(Void*, LibXML::Error*), data, ->generic_callback(Void*, UInt8*)) { yield }
40+
end
41+
42+
# Saves the current global error handlers (and user data) and restores the
43+
# default handlers for the duration of the block. Eventually restores the
44+
# saved error handlers (and user data) before returning.
45+
#
46+
# Use this when a callback can potentially do a fiber context switch, for
47+
# example IO operations.
48+
#
49+
# Saves both structured + generic handlers because libxml < 2.13 uses *both* in
50+
# practice.
51+
#
52+
# NOTE: This is for internal compatibility with libxml < 2.13. Do not use.
53+
protected def self.default_handlers(&)
54+
with_handlers(nil, nil, nil, nil) { yield }
55+
end
56+
57+
private def self.with_handlers(scontext, shandler, context, handler, &)
58+
orig_scontext = LibXML.__xmlStructuredErrorContext.value
59+
orig_shandler = LibXML.__xmlStructuredError.value
60+
61+
orig_context = LibXML.__xmlGenericErrorContext.value
62+
orig_handler = LibXML.__xmlGenericError.value
63+
64+
LibXML.xmlSetStructuredErrorFunc(scontext, shandler)
65+
LibXML.xmlSetGenericErrorFunc(context, handler)
66+
2367
begin
2468
yield
2569
ensure
26-
LibXML.xmlSetStructuredErrorFunc nil, nil
70+
# can't call xmlSetStructuredErrorFunc or xmlSetGenericErrorFunc: the
71+
# compiler complains that it's passing a closure to C (it's not)
72+
LibXML.__xmlStructuredErrorContext.value = orig_scontext
73+
LibXML.__xmlStructuredError.value = orig_shandler
74+
75+
LibXML.__xmlGenericErrorContext.value = orig_context
76+
LibXML.__xmlGenericError.value = orig_handler
2777
end
2878
end
2979

80+
@[Deprecated("Legacy libxml2 API that mutate global state. Do not use.")]
81+
def self.collect(errors, &)
82+
unsafe_collect(errors) { yield }
83+
end
84+
85+
@[Deprecated("Legacy libxml2 API that mutate global state. Do not use.")]
3086
def self.collect_generic(errors, &)
31-
LibXML.xmlSetGenericErrorFunc Box.box(errors), ->(ctx, fmt) {
87+
LibXML.xmlSetGenericErrorFunc Box.box(errors), ->(data, fmt) {
3288
# TODO: use va_start and va_end to
3389
message = String.new(fmt).chomp
3490
error = XML::Error.new(message, 0)
3591

3692
{% if flag?(:arm) || flag?(:aarch64) %}
3793
# libxml2 is likely missing ARM unwind tables (.ARM.extab and .ARM.exidx
3894
# sections) which prevent raising from a libxml2 context.
39-
Box(Array(XML::Error)).unbox(ctx) << error
95+
Box(Array(XML::Error)).unbox(data) << error
4096
{% else %}
4197
raise error
4298
{% end %}

src/xml/libxml2.cr

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,18 @@ require "./save_options"
1717
{% end %}
1818
{% end %}
1919
lib LibXML
20+
# The bindings default to libxml 2.9, which was released in 2012. We can safely
21+
# assume at least this version is available everywhere.
22+
23+
{% if (version = env("LIBXML_VERSION")) && (version.strip != "") %}
24+
VERSION = {{env("LIBXML_VERSION")}}
25+
{% elsif !flag?(:win32) || flag?(:gnu) %}
26+
VERSION = {{`sh -c "pkg-config libxml-2.0 --silence-errors --modversion 2> /dev/null || echo 2.9.0"`.strip.stringify}}
27+
{% else %}
28+
# TODO: figure out the actual libxml version on *-windows-msvc target
29+
VERSION = "2.9.0"
30+
{% end %}
31+
2032
alias Int = LibC::Int
2133

2234
$xmlParserVersion : LibC::Char*
@@ -69,6 +81,8 @@ lib LibXML
6981
properties : Int
7082
end
7183

84+
alias HTMLDoc = Doc
85+
7286
struct Attr
7387
include NodeCommon
7488
ns : NS*
@@ -97,6 +111,9 @@ lib LibXML
97111
alias XMLTextReader = Void*
98112
alias XMLTextReaderLocator = Void*
99113

114+
alias ParserCtxt = Void*
115+
alias HTMLParserCtxt = ParserCtxt
116+
100117
enum ParserSeverity
101118
VALIDITY_WARNING = 1
102119
VALIDITY_ERROR = 2
@@ -134,7 +151,7 @@ lib LibXML
134151
fun xmlTextReaderCurrentNode(reader : XMLTextReader) : Node*
135152

136153
fun xmlTextReaderSetErrorHandler(reader : XMLTextReader, f : TextReaderErrorFunc) : Void
137-
154+
fun xmlTextReaderSetStructuredErrorHandler(reader : XMLTextReader, f : StructuredErrorFunc, arg : Void*) : Void
138155
fun xmlTextReaderLocatorLineNumber(XMLTextReaderLocator) : Int
139156

140157
fun xmlReadMemory(buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
@@ -146,6 +163,14 @@ lib LibXML
146163
fun xmlReadIO(ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
147164
fun htmlReadIO(ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*
148165

166+
fun xmlNewParserCtxt : ParserCtxt
167+
fun xmlCtxtReadIO(ParserCtxt, ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
168+
fun xmlCtxtReadMemory(ParserCtxt, buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::ParserOptions) : Doc*
169+
170+
fun htmlNewParserCtxt : HTMLParserCtxt
171+
fun htmlCtxtReadMemory(HTMLParserCtxt, buffer : UInt8*, size : Int, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*
172+
fun htmlCtxtReadIO(HTMLParserCtxt, ioread : InputReadCallback, ioclose : InputCloseCallback, ioctx : Void*, url : UInt8*, encoding : UInt8*, options : XML::HTMLParserOptions) : Doc*
173+
149174
fun xmlDocGetRootElement(doc : Doc*) : Node*
150175
fun xmlXPathNodeSetCreate(node : Node*) : NodeSet*
151176
fun xmlXPathNodeSetAddUnique(cur : NodeSet*, val : Node*) : Int
@@ -321,8 +346,15 @@ lib LibXML
321346
alias StructuredErrorFunc = (Void*, Error*) ->
322347
alias GenericErrorFunc = (Void*, UInt8*) ->
323348

324-
fun xmlSetStructuredErrorFunc(ctx : Void*, f : StructuredErrorFunc)
349+
# deprecated
325350
fun xmlSetGenericErrorFunc(ctx : Void*, f : GenericErrorFunc)
351+
fun __xmlGenericError : GenericErrorFunc*
352+
fun __xmlGenericErrorContext : Void**
353+
354+
# deprecated since 2.13
355+
fun xmlSetStructuredErrorFunc(ctx : Void*, f : StructuredErrorFunc)
356+
fun __xmlStructuredError : StructuredErrorFunc*
357+
fun __xmlStructuredErrorContext : Void**
326358

327359
fun xmlGetNsList(doc : Doc*, node : Node*) : NS**
328360

@@ -331,6 +363,15 @@ lib LibXML
331363
fun xmlUnsetProp(node : Node*, name : UInt8*) : Int
332364

333365
fun xmlValidateNameValue(value : UInt8*) : Int
366+
367+
{% if compare_versions(LibXML::VERSION, "2.13.0") >= 0 %}
368+
fun xmlCtxtSetErrorHandler(ctxt : ParserCtxt, handler : StructuredErrorFunc, data : Void*)
369+
fun xmlXPathSetErrorHandler(ctxt : XPathContext*, handler : StructuredErrorFunc, data : Void*)
370+
{% end %}
371+
372+
{% if compare_versions(LibXML::VERSION, "2.14.0") >= 0 %}
373+
fun xmlSaveSetIndentString(SaveCtxPtr, UInt8*)
374+
{% end %}
334375
end
335376

336377
LibXML.xmlInitParser

0 commit comments

Comments
 (0)