Skip to content

Commit 088b3be

Browse files
committed
Merge branch 'issue-71'
2 parents c224316 + 86e4b3d commit 088b3be

File tree

9 files changed

+260
-39
lines changed

9 files changed

+260
-39
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66

77
* Improvement on the doc
88

9+
### ADDED
10+
11+
* Added option to raise on DTD definitions
12+
913
## [0.6.6] (2019-02-24)
1014

1115
* small bugfix: Fix compilation warnings on newer versions of Elixir

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Add dependency to your project's `mix.exs`:
1717

1818
```elixir
1919
def deps do
20-
[{:sweet_xml, "~> 0.6.6"}]
20+
[{:sweet_xml, "~> 0.7.0"}]
2121
end
2222
```
2323

@@ -467,6 +467,17 @@ result = file_stream
467467
|> stream_tags([:li, :special_match_key], discard: [:li, :special_match_key])
468468
```
469469

470+
## Security
471+
472+
Whenever you have to deal with some XML that was not generated by your system (untrusted document),
473+
it is highly recommended that you separate the parsing step from the mapping step, in order to be able
474+
to disable some default behaviors through options. See the documentation for `SweetXml.parse/2` for more details.
475+
The current recommendations are:
476+
```
477+
doc |> parse(dtd: :none) |> xpath(spec, subspec)
478+
enum |> stream_tags(tags, dtd: :none)
479+
```
480+
470481
## Copyright and License
471482

472483
Copyright (c) 2014, Frank Liu

lib/sweet_xml.ex

Lines changed: 116 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
defmodule SweetXpath do
2+
@moduledoc false
23

34
defmodule Priv do
45
@moduledoc false
@@ -115,20 +116,34 @@ defmodule SweetXml do
115116
"""
116117

117118
require Record
119+
@doc false
118120
Record.defrecord :xmlDecl, Record.extract(:xmlDecl, from_lib: "xmerl/include/xmerl.hrl")
121+
@doc false
119122
Record.defrecord :xmlAttribute, Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
123+
@doc false
120124
Record.defrecord :xmlNamespace, Record.extract(:xmlNamespace, from_lib: "xmerl/include/xmerl.hrl")
125+
@doc false
121126
Record.defrecord :xmlNsNode, Record.extract(:xmlNsNode, from_lib: "xmerl/include/xmerl.hrl")
127+
@doc false
122128
Record.defrecord :xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl")
129+
@doc false
123130
Record.defrecord :xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")
131+
@doc false
124132
Record.defrecord :xmlComment, Record.extract(:xmlComment, from_lib: "xmerl/include/xmerl.hrl")
133+
@doc false
125134
Record.defrecord :xmlPI, Record.extract(:xmlPI, from_lib: "xmerl/include/xmerl.hrl")
135+
@doc false
126136
Record.defrecord :xmlDocument, Record.extract(:xmlDocument, from_lib: "xmerl/include/xmerl.hrl")
137+
@doc false
127138
Record.defrecord :xmlObj, Record.extract(:xmlObj, from_lib: "xmerl/include/xmerl.hrl")
128139

140+
@type doc :: (iodata | String.t | Enum.t)
141+
@type spec :: %SweetXpath{}
142+
@opaque xmlElement :: record(:xmlElement)
143+
129144

130145
@doc ~s"""
131-
`sigil_x/2` simply returns a `SweetXpath` struct, with modifiers converted to
146+
`sigil_x/2` simply returns a `%SweetXpath{}` struct, with modifiers converted to
132147
boolean fields:
133148
134149
iex> SweetXml.sigil_x("//some/path", 'e')
@@ -211,29 +226,52 @@ defmodule SweetXml do
211226
| xpath.namespaces]}
212227
end
213228

214-
@doc """
229+
@doc """
230+
Parse a document into a form ready to be used by `xpath/3` and `xmap/2`.
231+
215232
`doc` can be
216233
217234
- a byte list (iodata)
218235
- a binary
219236
- any enumerable of binaries (for instance `File.stream!/3` result)
220237
221-
`options` are `xmerl` options described here [http://www.erlang.org/doc/man/xmerl_scan.html](http://www.erlang.org/doc/man/xmerl_scan.html),
222-
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for usage.
238+
`options` can contain both:
239+
* `xmerl`'s options as described on the [xmerl_scan](http://www.erlang.org/doc/man/xmerl_scan.html) documentation page,
240+
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for some advanced usage.
241+
For example: `parse(doc, quiet: true)`
242+
* `:dtd` to prevent DTD parsing or fetching, with the following possibilities:
243+
* `:none`, will prevent both internal and external entities; it is the recommended option for untrusted XML;
244+
* `:all`, the default, for backward compatibility, allows all DTDs;
245+
* `:internal_only`, will block all attempts at external fetching;
246+
* `[only: entities]` where `entities` is either an atom for a single entity, or a list of atoms.
247+
If any other entity is defined in the XML, `parse` will raise.
223248
224249
When `doc` is an enumerable, the `:cont_fun` option cannot be given.
225250
226251
Returns an `xmlElement` record.
227252
"""
228-
def parse(doc), do: parse(doc, [])
229-
def parse(doc, options) when is_binary(doc) do
230-
doc |> :erlang.binary_to_list |> parse(options)
253+
@spec parse(doc, opts :: list) :: xmlElement
254+
def parse(doc, opts \\ []) do
255+
ets = :ets.new(nil, [])
256+
dtd_arg = :proplists.get_value(:dtd, opts, :all)
257+
opts = :proplists.delete(:dtd, opts)
258+
opts = SweetXml.Options.handle_dtd(dtd_arg).(ets) ++ opts
259+
try do
260+
do_parse(doc, opts)
261+
after
262+
_ = :ets.delete(ets)
263+
end
231264
end
232-
def parse([c | _] = doc, options) when is_integer(c) do
265+
266+
@doc false
267+
def do_parse(doc, options) when is_binary(doc) do
268+
doc |> :erlang.binary_to_list |> do_parse(options)
269+
end
270+
def do_parse([c | _] = doc, options) when is_integer(c) do
233271
{parsed_doc, _} = :xmerl_scan.string(doc, options)
234272
parsed_doc
235273
end
236-
def parse(doc_enum, options) do
274+
def do_parse(doc_enum, options) do
237275
{parsed_doc, _} = :xmerl_scan.string('', options ++ continuation_opts(doc_enum))
238276
parsed_doc
239277
end
@@ -252,6 +290,7 @@ defmodule SweetXml do
252290
will be `{:tagname, xmlelem}`. e.g. :li, :header
253291
- `options[:discard]` is the list of tag which will be discarded:
254292
not added to its parent DOM.
293+
- More details on the available options can be found in the documentation for `parse/2`.
255294
256295
## Examples
257296
@@ -287,10 +326,9 @@ defmodule SweetXml do
287326
def stream_tags(doc, tags, options \\ []) do
288327
tags = if is_atom(tags), do: [tags], else: tags
289328

290-
{discard_tags, xmerl_options} = if options[:discard] do
291-
{options[:discard], Keyword.delete(options, :discard)}
292-
else
293-
{[], options}
329+
{discard_tags, xmerl_options} = case :proplists.lookup(:discard, options) do
330+
{:discard, tags} -> {tags, :proplists.delete(:discard, options)}
331+
:none -> {[], options}
294332
end
295333

296334
doc |> stream(fn emit ->
@@ -327,9 +365,9 @@ defmodule SweetXml do
327365
328366
- `doc` is an enumerable, data will be pulled during the result stream
329367
enumeration. e.g. `File.stream!("some_file.xml")`
330-
- `options_callback` is an anonymous function `fn emit -> xmerl_opts` use it to
368+
- `options_callback` is an anonymous function `fn emit -> (xmerl_opts | opts)` use it to
331369
define your :xmerl callbacks and put data into the stream using
332-
`emit.(elem)` in the callbacks.
370+
`emit.(elem)` in the callbacks. More details are available with `parse/2`.
333371
334372
For example, here you define a stream of all `xmlElement` :
335373
@@ -358,33 +396,43 @@ defmodule SweetXml do
358396
Stream.resource fn ->
359397
{parent, ref} = waiter = {self(), make_ref()}
360398
opts = options_callback.(fn e -> send(parent, {:event, ref, e}) end)
399+
400+
ets = :ets.new(nil, [:public])
401+
dtd_arg = :proplists.get_value(:dtd, opts, :all)
402+
opts = :proplists.delete(:dtd, opts)
403+
opts = SweetXml.Options.handle_dtd(dtd_arg).(ets) ++ opts
404+
361405
pid = spawn_link fn -> :xmerl_scan.string('', opts ++ continuation_opts(doc, waiter)) end
362-
{ref, pid, Process.monitor(pid)}
363-
end, fn {ref, pid, monref} = acc ->
406+
{ref, pid, Process.monitor(pid), ets}
407+
end, fn {ref, pid, monref, ets} = acc ->
364408
receive do
365409
{:DOWN, ^monref, _, _, _} ->
366-
{:halt, :parse_ended} ## !!! maybe do something when reason !== :normal
410+
{:halt, {:parse_ended, ets}} ## !!! maybe do something when reason !== :normal
367411
{:event, ^ref, event} ->
368412
{[event], acc}
369413
{:wait, ^ref} ->
370414
send(pid, {:continue, ref})
371415
{[], acc}
372416
end
373417
end, fn
374-
:parse_ended -> :ok
375-
{ref, pid, monref} ->
418+
{:parse_ended, ets} ->
419+
_ = :ets.delete(ets)
420+
:ok
421+
422+
{ref, pid, monref, ets} ->
376423
Process.demonitor(monref)
424+
_ = :ets.delete(ets)
377425
flush_halt(pid, ref)
378426
end
379427
end
380428

381429
@doc ~S"""
382-
`xpath` allows you to query an XML document with xpath.
430+
`xpath` allows you to query an XML document with XPath.
383431
384-
The second argument to xpath is a `SweetXpath` struct. The optional third
432+
The second argument to xpath is a `%SweetXpath{}` struct. The optional third
385433
argument is a keyword list, such that the value of each keyword is also
386-
either a `SweetXpath` or a list with head being a `SweetXpath` and tail being
387-
another keyword list exactly like before. Please see examples below for better
434+
either a `%SweetXpath{}` or a list with head being a `%SweetXpath{}` and tail being
435+
another keyword list exactly like before. Please see the examples below for better
388436
understanding.
389437
390438
## Examples
@@ -417,32 +465,49 @@ defmodule SweetXml do
417465
...> )
418466
%{ul: %{a: 'Two'}}
419467
468+
## Security
469+
470+
Whenever you are working with some xml that was not generated by your system,
471+
it is highly recommended that you restrict some features of XML
472+
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
473+
Unless you know exactly what kind of DTD you want to permit in your xml,
474+
it is recommended that you use the following code example to prevent possible attacks:
475+
```
476+
doc
477+
|> parse(dtd: :none)
478+
|> xpath(spec, subspec)
479+
```
480+
For more details, see `parse/2`.
420481
"""
421-
def xpath(parent, spec) when not is_tuple(parent) do
482+
@spec xpath(parent :: (doc | xmlElement), spec, subspec) :: any
483+
when subspec: keyword(spec | subspec)
484+
def xpath(parent, spec, subspec \\ [])
485+
486+
def xpath(parent, spec, []) when not is_tuple(parent) do
422487
parent |> parse |> xpath(spec)
423488
end
424489

425-
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
490+
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
426491
get_current_entities(parent, spec) |> Enum.map(&(_value(&1)) |> to_cast(cast,is_opt?)) |> spec.transform_fun.()
427492
end
428493

429-
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec) do
494+
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec, []) do
430495
get_current_entities(parent, spec) |> spec.transform_fun.()
431496
end
432497

433-
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec) when string_type in [:string,:soft_string] do
498+
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec, []) when string_type in [:string,:soft_string] do
434499
spec = %SweetXpath{spec | is_list: true}
435500
get_current_entities(parent, spec)
436501
|> Enum.map(&(_value(&1) |> to_cast(string_type, is_opt?)))
437502
|> Enum.join
438503
|> spec.transform_fun.()
439504
end
440505

441-
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
506+
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
442507
get_current_entities(parent, spec) |> _value |> to_cast(cast, is_opt?) |> spec.transform_fun.()
443508
end
444509

445-
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec) do
510+
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec, []) do
446511
get_current_entities(parent, spec) |> spec.transform_fun.()
447512
end
448513

@@ -457,11 +522,13 @@ defmodule SweetXml do
457522
end
458523

459524
@doc ~S"""
460-
`xmap` returns a mapping with each value being the result of `xpath`
525+
`xmap` returns a mapping with each value being the result of `xpath`.
461526
462-
Just as `xpath`, you can nest the mapping structure. Please see `xpath` for
527+
Just as `xpath`, you can nest the mapping structure. Please see `xpath/3` for
463528
more detail.
464529
530+
You can pass `true` as the third argument to get the result as a keyword list instead of a map.
531+
465532
## Examples
466533
467534
Simple:
@@ -509,8 +576,24 @@ defmodule SweetXml do
509576
...> ]
510577
...> ], true)
511578
[message: 'Message', ul: %{a: 'Two'}]
579+
580+
## Security
581+
582+
Whenever you are working with some xml that was not generated by your system,
583+
it is highly recommended that you restrict some features of XML
584+
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
585+
Unless you know exactly what kind of DTD you want to permit in your xml,
586+
it is recommended that you use the following code example to prevent possible attacks:
587+
```
588+
doc
589+
|> parse(dtd: :none)
590+
|> xmap(specs, options)
591+
```
592+
For more details, see `parse/2`.
512593
"""
513-
def xmap(parent, mapping), do: xmap(parent, mapping, %{is_keyword: false})
594+
@spec xmap(parent :: (doc | xmlElement), mapping :: specs, options :: (boolean | map)) :: (map | keyword)
595+
when specs: keyword(spec | specs)
596+
def xmap(parent, mapping, options \\ false)
514597

515598
def xmap(nil, _, %{is_optional: true}), do: nil
516599

lib/sweet_xml/options.ex

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
defmodule SweetXml.Options do
2+
@moduledoc false
3+
4+
def handle_dtd(:all) do
5+
fn _ -> [] end
6+
end
7+
def handle_dtd(:none) do
8+
fn ets ->
9+
handle_dtd(:internal_only).(ets) ++ handle_dtd(only: []).(ets)
10+
end
11+
end
12+
def handle_dtd(:internal_only) do
13+
fn _ ->
14+
[fetch_fun: fn _, _ -> {:error, "no external entity allowed"} end]
15+
end
16+
end
17+
def handle_dtd(only: entity) when is_atom(entity) do
18+
handle_dtd(only: [entity])
19+
end
20+
def handle_dtd(only: entities) when is_list(entities) do
21+
fn ets ->
22+
read = fn
23+
context, name, state ->
24+
ets = :xmerl_scan.rules_state(state)
25+
case :ets.lookup(ets, {context, name}) do
26+
[] -> :undefined
27+
[{_, value}] -> value
28+
end
29+
end
30+
31+
write = fn
32+
:entity = context, name, value, state ->
33+
_ = case name in entities do
34+
true ->
35+
ets = :xmerl_scan.rules_state(state)
36+
_ = case :ets.lookup(ets, {context, name}) do
37+
[] -> :ets.insert(ets, {{context, name}, value})
38+
_ -> :ok
39+
end
40+
false -> raise("DTD not allowed: #{name}")
41+
end
42+
state
43+
44+
context, name, value, state ->
45+
ets = :xmerl_scan.rules_state(state)
46+
_ = case :ets.lookup(ets, {context, name}) do
47+
[] -> :ets.insert(ets, {{context, name}, value})
48+
_ -> :ok
49+
end
50+
state
51+
end
52+
53+
[{:rules, read, write, ets}]
54+
end
55+
end
56+
end

0 commit comments

Comments
 (0)