Skip to content

Commit 86e4b3d

Browse files
committed
WIP gh-71 : doc update
1 parent d26fffa commit 86e4b3d

File tree

7 files changed

+121
-36
lines changed

7 files changed

+121
-36
lines changed

README.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Add dependency to your project's `mix.exs`:
1717

1818
```elixir
1919
def deps do
20-
[{:sweet_xml, "~> 0.6.6"}]
20+
[{:sweet_xml, "~> 0.7.0"}]
2121
end
2222
```
2323

@@ -464,6 +464,17 @@ result = file_stream
464464
|> stream_tags([:li, :special_match_key], discard: [:li, :special_match_key])
465465
```
466466

467+
## Security
468+
469+
Whenever you have to deal with some XML that was not generated by your system (untrusted document),
470+
it is highly recommended that you separate the parsing step from the mapping step, in order to be able
471+
to disable some unsafe default behaviors through options. See the documentation of `SweetXml.parse/2` for more details.
472+
The current recommendations are:
473+
```
474+
doc |> parse(dtd: :none) |> xpath(spec, subspec)
475+
enum |> stream_tags(tags, dtd: :none)
476+
```
477+
467478
## Copyright and License
468479

469480
Copyright (c) 2014, Frank Liu

lib/sweet_xml.ex

Lines changed: 81 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
defmodule SweetXpath do
2+
@moduledoc false
23

34
defmodule Priv do
45
@moduledoc false
@@ -115,20 +116,34 @@ defmodule SweetXml do
115116
"""
116117

117118
require Record
119+
@doc false
118120
Record.defrecord :xmlDecl, Record.extract(:xmlDecl, from_lib: "xmerl/include/xmerl.hrl")
121+
@doc false
119122
Record.defrecord :xmlAttribute, Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
123+
@doc false
120124
Record.defrecord :xmlNamespace, Record.extract(:xmlNamespace, from_lib: "xmerl/include/xmerl.hrl")
125+
@doc false
121126
Record.defrecord :xmlNsNode, Record.extract(:xmlNsNode, from_lib: "xmerl/include/xmerl.hrl")
127+
@doc false
122128
Record.defrecord :xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl")
129+
@doc false
123130
Record.defrecord :xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl")
131+
@doc false
124132
Record.defrecord :xmlComment, Record.extract(:xmlComment, from_lib: "xmerl/include/xmerl.hrl")
133+
@doc false
125134
Record.defrecord :xmlPI, Record.extract(:xmlPI, from_lib: "xmerl/include/xmerl.hrl")
135+
@doc false
126136
Record.defrecord :xmlDocument, Record.extract(:xmlDocument, from_lib: "xmerl/include/xmerl.hrl")
137+
@doc false
127138
Record.defrecord :xmlObj, Record.extract(:xmlObj, from_lib: "xmerl/include/xmerl.hrl")
128139

140+
@type doc :: (iodata | String.t | Enum.t)
141+
@type spec :: %SweetXpath{}
142+
@opaque xmlElement :: record(:xmlElement)
143+
129144

130145
@doc ~s"""
131-
`sigil_x/2` simply returns a `SweetXpath` struct, with modifiers converted to
146+
`sigil_x/2` simply returns a `%SweetXpath{}` struct, with modifiers converted to
132147
boolean fields:
133148
134149
iex> SweetXml.sigil_x("//some/path", 'e')
@@ -211,20 +226,31 @@ defmodule SweetXml do
211226
| xpath.namespaces]}
212227
end
213228

214-
@doc """
229+
@doc """
230+
Parse a document into a form ready to be used by `xpath/3` and `xmap/2`.
231+
215232
`doc` can be
216233
217234
- a byte list (iodata)
218235
- a binary
219236
- any enumerable of binaries (for instance `File.stream!/3` result)
220237
221-
`options` are `xmerl` options described here [http://www.erlang.org/doc/man/xmerl_scan.html](http://www.erlang.org/doc/man/xmerl_scan.html),
222-
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for usage.
238+
`options` can contain both:
239+
* `xmerl`'s options as described on the [xmerl_scan](http://www.erlang.org/doc/man/xmerl_scan.html) documentation page,
240+
see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for some advanced usage.
241+
For example: `parse(doc, quiet: true)`
242+
* `:dtd` to prevent DTD parsing or fetching, with the following possibilities:
243+
* `:none`, will prevent both internal and external entities; it is the recommended option for untrusted XML;
244+
* `:all`, the default, for backward compatibility, allows all DTDs;
245+
* `:internal_only`, will block all attempts at external fetching;
246+
* `[only: entities]` where `entities` is either an atom for a single entity, or a list of atoms.
247+
If any other entity is defined in the XML, `parse` will raise on them.
223248
224249
When `doc` is an enumerable, the `:cont_fun` option cannot be given.
225250
226251
Returns an `xmlElement` record.
227252
"""
253+
@spec parse(doc, opts :: list) :: xmlElement
228254
def parse(doc, opts \\ []) do
229255
ets = :ets.new(nil, [])
230256
dtd_arg = :proplists.get_value(:dtd, opts, :all)
@@ -264,6 +290,7 @@ defmodule SweetXml do
264290
will be `{:tagname, xmlelem}`. e.g. :li, :header
265291
- `options[:discard]` is the list of tags which will be discarded:
266292
not added to its parent DOM.
293+
- More details about the options are available in the documentation of `parse/2`.
267294
268295
## Examples
269296
@@ -338,9 +365,9 @@ defmodule SweetXml do
338365
339366
- `doc` is an enumerable, data will be pulled during the result stream
340367
enumeration. e.g. `File.stream!("some_file.xml")`
341-
- `options_callback` is an anonymous function `fn emit -> xmerl_opts` use it to
368+
- `options_callback` is an anonymous function `fn emit -> (xmerl_opts | opts)`; use it to
342369
define your :xmerl callbacks and put data into the stream using
343-
`emit.(elem)` in the callbacks.
370+
`emit.(elem)` in the callbacks. More details are available with `parse/2`.
344371
345372
For example, here you define a stream of all `xmlElement` :
346373
@@ -400,12 +427,12 @@ defmodule SweetXml do
400427
end
401428

402429
@doc ~S"""
403-
`xpath` allows you to query an XML document with xpath.
430+
`xpath` allows you to query an XML document with XPath.
404431
405-
The second argument to xpath is a `SweetXpath` struct. The optional third
432+
The second argument to xpath is a `%SweetXpath{}` struct. The optional third
406433
argument is a keyword list, such that the value of each keyword is also
407-
either a `SweetXpath` or a list with head being a `SweetXpath` and tail being
408-
another keyword list exactly like before. Please see examples below for better
434+
either a `%SweetXpath{}` or a list with head being a `%SweetXpath{}` and tail being
435+
another keyword list exactly like before. Please see the examples below for better
409436
understanding.
410437
411438
## Examples
@@ -438,32 +465,49 @@ defmodule SweetXml do
438465
...> )
439466
%{ul: %{a: 'Two'}}
440467
468+
## Security
469+
470+
Whenever you are working with XML that was not generated by your system,
471+
it is highly recommended that you restrict some XML features
472+
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
473+
Unless you know exactly what kind of DTD you want to permit in your XML,
474+
it is recommended that you use the following code example to prevent possible attacks:
475+
```
476+
doc
477+
|> parse(dtd: :none)
478+
|> xpath(spec, subspec)
479+
```
480+
For more details, see `parse/2`.
441481
"""
442-
def xpath(parent, spec) when not is_tuple(parent) do
482+
@spec xpath(parent :: (doc | xmlElement), spec, subspec) :: any
483+
when subspec: keyword(spec | subspec)
484+
def xpath(parent, spec, subspec \\ [])
485+
486+
def xpath(parent, spec, []) when not is_tuple(parent) do
443487
parent |> parse |> xpath(spec)
444488
end
445489

446-
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
490+
def xpath(parent, %SweetXpath{is_list: true, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
447491
get_current_entities(parent, spec) |> Enum.map(&(_value(&1)) |> to_cast(cast,is_opt?)) |> spec.transform_fun.()
448492
end
449493

450-
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec) do
494+
def xpath(parent, %SweetXpath{is_list: true, is_value: false} = spec, []) do
451495
get_current_entities(parent, spec) |> spec.transform_fun.()
452496
end
453497

454-
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec) when string_type in [:string,:soft_string] do
498+
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: string_type, is_optional: is_opt?} = spec, []) when string_type in [:string,:soft_string] do
455499
spec = %SweetXpath{spec | is_list: true}
456500
get_current_entities(parent, spec)
457501
|> Enum.map(&(_value(&1) |> to_cast(string_type, is_opt?)))
458502
|> Enum.join
459503
|> spec.transform_fun.()
460504
end
461505

462-
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec) do
506+
def xpath(parent, %SweetXpath{is_list: false, is_value: true, cast_to: cast, is_optional: is_opt?} = spec, []) do
463507
get_current_entities(parent, spec) |> _value |> to_cast(cast, is_opt?) |> spec.transform_fun.()
464508
end
465509

466-
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec) do
510+
def xpath(parent, %SweetXpath{is_list: false, is_value: false} = spec, []) do
467511
get_current_entities(parent, spec) |> spec.transform_fun.()
468512
end
469513

@@ -478,11 +522,13 @@ defmodule SweetXml do
478522
end
479523

480524
@doc ~S"""
481-
`xmap` returns a mapping with each value being the result of `xpath`
525+
`xmap` returns a mapping with each value being the result of `xpath`.
482526
483-
Just as `xpath`, you can nest the mapping structure. Please see `xpath` for
527+
Just as `xpath`, you can nest the mapping structure. Please see `xpath/3` for
484528
more detail.
485529
530+
You can pass `true` as the third argument to get the result as a keyword list instead of a map.
531+
486532
## Examples
487533
488534
Simple:
@@ -530,8 +576,24 @@ defmodule SweetXml do
530576
...> ]
531577
...> ], true)
532578
[message: 'Message', ul: %{a: 'Two'}]
579+
580+
## Security
581+
582+
Whenever you are working with XML that was not generated by your system,
583+
it is highly recommended that you restrict some XML features
584+
during parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
585+
Unless you know exactly what kind of DTD you want to permit in your XML,
586+
it is recommended that you use the following code example to prevent possible attacks:
587+
```
588+
doc
589+
|> parse(dtd: :none)
590+
|> xmap(specs, options)
591+
```
592+
For more details, see `parse/2`.
533593
"""
534-
def xmap(parent, mapping), do: xmap(parent, mapping, %{is_keyword: false})
594+
@spec xmap(parent :: (doc | xmlElement), mapping :: specs, options :: (boolean | map)) :: (map | keyword)
595+
when specs: keyword(spec | specs)
596+
def xmap(parent, mapping, options \\ false)
535597

536598
def xmap(nil, _, %{is_optional: true}), do: nil
537599

lib/sweet_xml/options.ex

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
defmodule SweetXml.Options do
2+
@moduledoc false
3+
24
def handle_dtd(:all) do
35
fn _ -> [] end
46
end

mix.exs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
defmodule SweetXml.Mixfile do
22
use Mix.Project
33

4-
@source_url "https://github.com/awetzel/sweet_xml"
4+
@source_url "https://github.com/kbrw/sweet_xml"
55

66
def project do
77
[
88
app: :sweet_xml,
9-
version: "0.6.6",
9+
version: "0.7.0-rc.1",
1010
elixir: "~> 1.0",
1111
description: "An sweet wrapper of :xmerl to help query XML docs",
1212
deps: deps(),

mix.lock

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
%{
22
"earmark": {:hex, :earmark, "1.3.1", "73812f447f7a42358d3ba79283cfa3075a7580a3a2ed457616d6517ac3738cb9", [:mix], [], "hexpm", "000aaeff08919e95e7aea13e4af7b2b9734577b3e6a7c50ee31ee88cab6ec4fb"},
33
"earmark_parser": {:hex, :earmark_parser, "1.4.12", "b245e875ec0a311a342320da0551da407d9d2b65d98f7a9597ae078615af3449", [:mix], [], "hexpm", "711e2cc4d64abb7d566d43f54b78f7dc129308a63bc103fbd88550d2174b3160"},
4-
"ex_doc": {:hex, :ex_doc, "0.23.0", "a069bc9b0bf8efe323ecde8c0d62afc13d308b1fa3d228b65bca5cf8703a529d", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}], "hexpm", "f5e2c4702468b2fd11b10d39416ddadd2fcdd173ba2a0285ebd92c39827a5a16"},
4+
"ex_doc": {:hex, :ex_doc, "0.24.1", "15673de99154f93ca7f05900e4e4155ced1ee0cd34e0caeee567900a616871a4", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "07972f17bdf7dc7b5bd76ec97b556b26178ed3f056e7ec9288eb7cea7f91cce2"},
55
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
6-
"makeup_elixir": {:hex, :makeup_elixir, "0.15.0", "98312c9f0d3730fde4049985a1105da5155bfe5c11e47bdc7406d88e01e4219b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "75ffa34ab1056b7e24844c90bfc62aaf6f3a37a15faa76b07bc5eba27e4a8b4a"},
6+
"makeup_elixir": {:hex, :makeup_elixir, "0.15.1", "b5888c880d17d1cc3e598f05cdb5b5a91b7b17ac4eaf5f297cb697663a1094dd", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "db68c173234b07ab2a07f645a5acdc117b9f99d69ebf521821d89690ae6c6ec8"},
7+
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},
78
"markdown": {:git, "git://github.com/devinus/markdown.git", "cd0df79b6f1cc374499d47f6ba6aaab5096f874f", []},
89
"nimble_parsec": {:hex, :nimble_parsec, "1.1.0", "3a6fca1550363552e54c216debb6a9e95bd8d32348938e13de5eda962c0d7f89", [:mix], [], "hexpm", "08eb32d66b706e913ff748f11694b17981c0b04a33ef470e33e11b3d3ac8f54b"},
910
}

test/files/xxe.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!DOCTYPE foo [ <!ELEMENT foo ANY >
3+
<!ENTITY xxe SYSTEM "file:///etc/passwd" >]>
4+
<response><result>&xxe;</result></response>

test/issue_71_test.exs

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,27 @@ defmodule Issue71Test do
22
use ExUnit.Case
33

44
test "raise on reading /etc/passwd with dtd: :none" do
5-
sneaky_xml = """
6-
<?xml version=\"1.0\" encoding=\"UTF-8\"?>
7-
<!DOCTYPE foo [ <!ELEMENT foo ANY >
8-
<!ENTITY xxe SYSTEM \"file:///etc/passwd\" >]>
9-
<response><result>&xxe;</result></response>
10-
"""
5+
sneaky_xml = File.read!("./test/files/xxe.xml")
116

127
assert {:fatal, {{:error_fetching_DTD, {_, _}}, _file, _line, _col}} =
138
catch_exit(SweetXml.parse(sneaky_xml, dtd: :none, quiet: true))
149
end
1510

11+
test "raise on reading /etc/passwd with dtd: :internal_only" do
12+
sneaky_xml = File.read!("./test/files/xxe.xml")
13+
14+
assert {:fatal, {{:error_fetching_DTD, {_, _}}, _file, _line, _col}} =
15+
catch_exit(SweetXml.parse(sneaky_xml, dtd: :internal_only, quiet: true))
16+
end
17+
18+
test "raise on reading /etc/passwd with dtd: [only: :banana]" do
19+
sneaky_xml = File.read!("./test/files/xxe.xml")
20+
21+
assert_raise RuntimeError, fn ->
22+
SweetXml.parse(sneaky_xml, dtd: [only: :banana])
23+
end
24+
end
25+
1626
test "raise on billion_laugh.xml with dtd: :none" do
1727
dangerous_xml = File.read!("./test/files/billion_laugh.xml")
1828
assert_raise RuntimeError, fn ->
@@ -21,12 +31,7 @@ defmodule Issue71Test do
2131
end
2232

2333
test "stream: raise on reading /etc/passwd with dtd: :none" do
24-
sneaky_xml = """
25-
<?xml version=\"1.0\" encoding=\"UTF-8\"?>
26-
<!DOCTYPE foo [ <!ELEMENT foo ANY >
27-
<!ENTITY xxe SYSTEM \"file:///etc/passwd\" >]>
28-
<response><result>&xxe;</result></response>
29-
"""
34+
sneaky_xml = File.read!("./test/files/xxe.xml")
3035

3136
_ = Process.flag(:trap_exit, true)
3237
pid = spawn_link(fn -> Stream.run(SweetXml.stream_tags(sneaky_xml, :banana, dtd: :none, quiet: true)) end)

0 commit comments

Comments
 (0)