1
1
defmodule SweetXpath do
2
+ @ moduledoc false
2
3
3
4
defmodule Priv do
4
5
@ moduledoc false
@@ -115,20 +116,34 @@ defmodule SweetXml do
115
116
"""
116
117
117
118
require Record
119
+ @ doc false
118
120
Record . defrecord :xmlDecl , Record . extract ( :xmlDecl , from_lib: "xmerl/include/xmerl.hrl" )
121
+ @ doc false
119
122
Record . defrecord :xmlAttribute , Record . extract ( :xmlAttribute , from_lib: "xmerl/include/xmerl.hrl" )
123
+ @ doc false
120
124
Record . defrecord :xmlNamespace , Record . extract ( :xmlNamespace , from_lib: "xmerl/include/xmerl.hrl" )
125
+ @ doc false
121
126
Record . defrecord :xmlNsNode , Record . extract ( :xmlNsNode , from_lib: "xmerl/include/xmerl.hrl" )
127
+ @ doc false
122
128
Record . defrecord :xmlElement , Record . extract ( :xmlElement , from_lib: "xmerl/include/xmerl.hrl" )
129
+ @ doc false
123
130
Record . defrecord :xmlText , Record . extract ( :xmlText , from_lib: "xmerl/include/xmerl.hrl" )
131
+ @ doc false
124
132
Record . defrecord :xmlComment , Record . extract ( :xmlComment , from_lib: "xmerl/include/xmerl.hrl" )
133
+ @ doc false
125
134
Record . defrecord :xmlPI , Record . extract ( :xmlPI , from_lib: "xmerl/include/xmerl.hrl" )
135
+ @ doc false
126
136
Record . defrecord :xmlDocument , Record . extract ( :xmlDocument , from_lib: "xmerl/include/xmerl.hrl" )
137
+ @ doc false
127
138
Record . defrecord :xmlObj , Record . extract ( :xmlObj , from_lib: "xmerl/include/xmerl.hrl" )
128
139
140
+ @ type doc :: ( iodata | String . t | Enum . t )
141
+ @ type spec :: % SweetXpath { }
142
+ @ opaque xmlElement :: record ( :xmlElement )
143
+
129
144
130
145
@ doc ~s"""
131
- `sigil_x/2` simply returns a `SweetXpath` struct, with modifiers converted to
146
+ `sigil_x/2` simply returns a `% SweetXpath{} ` struct, with modifiers converted to
132
147
boolean fields:
133
148
134
149
iex> SweetXml.sigil_x("//some/path", 'e')
@@ -211,29 +226,52 @@ defmodule SweetXml do
211
226
| xpath . namespaces ] }
212
227
end
213
228
214
- @ doc """
229
+ @ doc """
230
+ Parse a document into a form ready to be used by `xpath/3` and `xmap/2`.
231
+
215
232
`doc` can be
216
233
217
234
- a byte list (iodata)
218
235
- a binary
219
236
- any enumerable of binaries (for instance `File.stream!/3` result)
220
237
221
- `options` are `xmerl` options described here [http://www.erlang.org/doc/man/xmerl_scan.html](http://www.erlang.org/doc/man/xmerl_scan.html),
222
- see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for usage.
238
+ `options` can include both:
239
+ * `xmerl`'s options as described on the [xmerl_scan](http://www.erlang.org/doc/man/xmerl_scan.html) documentation page,
240
+ see [the erlang tutorial](http://www.erlang.org/doc/apps/xmerl/xmerl_examples.html) for some advanced usage.
241
+ For example: `parse(doc, quiet: true)`
242
+ * `:dtd` to prevent DTD parsing or fetching, with the following possibilities:
243
+ * `:none`, will prevent both internal and external entities; it is the recommended option for untrusted XML;
244
+ * `:all`, the default, for backward compatibility, allows all DTDs;
245
+ * `:internal_only`, will block all attempts at external fetching;
246
+ * `[only: entities]` where `entities` is either an atom for a single entity, or a list of atoms.
247
+ If any other entity is defined in the XML, `parse` will raise on them.
223
248
224
249
When `doc` is an enumerable, the `:cont_fun` option cannot be given.
225
250
226
251
Returns an `xmlElement` record.
227
252
"""
228
- def parse ( doc ) , do: parse ( doc , [ ] )
229
- def parse ( doc , options ) when is_binary ( doc ) do
230
- doc |> :erlang . binary_to_list |> parse ( options )
253
+ @ spec parse ( doc , opts :: list ) :: xmlElement
254
+ def parse ( doc , opts \\ [ ] ) do
255
+ ets = :ets . new ( nil , [ ] )
256
+ dtd_arg = :proplists . get_value ( :dtd , opts , :all )
257
+ opts = :proplists . delete ( :dtd , opts )
258
+ opts = SweetXml.Options . handle_dtd ( dtd_arg ) . ( ets ) ++ opts
259
+ try do
260
+ do_parse ( doc , opts )
261
+ after
262
+ _ = :ets . delete ( ets )
263
+ end
231
264
end
232
- def parse ( [ c | _ ] = doc , options ) when is_integer ( c ) do
265
+
266
+ @ doc false
267
+ def do_parse ( doc , options ) when is_binary ( doc ) do
268
+ doc |> :erlang . binary_to_list |> do_parse ( options )
269
+ end
270
+ def do_parse ( [ c | _ ] = doc , options ) when is_integer ( c ) do
233
271
{ parsed_doc , _ } = :xmerl_scan . string ( doc , options )
234
272
parsed_doc
235
273
end
236
- def parse ( doc_enum , options ) do
274
+ def do_parse ( doc_enum , options ) do
237
275
{ parsed_doc , _ } = :xmerl_scan . string ( '' , options ++ continuation_opts ( doc_enum ) )
238
276
parsed_doc
239
277
end
@@ -252,6 +290,7 @@ defmodule SweetXml do
252
290
will be `{:tagname, xmlelem}`. e.g. :li, :header
253
291
- `options[:discard]` is the list of tags which will be discarded:
254
292
not added to its parent DOM.
293
+ - More options details are available with `parse/2`.
255
294
256
295
## Examples
257
296
@@ -287,10 +326,9 @@ defmodule SweetXml do
287
326
def stream_tags ( doc , tags , options \\ [ ] ) do
288
327
tags = if is_atom ( tags ) , do: [ tags ] , else: tags
289
328
290
- { discard_tags , xmerl_options } = if options [ :discard ] do
291
- { options [ :discard ] , Keyword . delete ( options , :discard ) }
292
- else
293
- { [ ] , options }
329
+ { discard_tags , xmerl_options } = case :proplists . lookup ( :discard , options ) do
330
+ { :discard , tags } -> { tags , :proplists . delete ( :discard , options ) }
331
+ :none -> { [ ] , options }
294
332
end
295
333
296
334
doc |> stream ( fn emit ->
@@ -327,9 +365,9 @@ defmodule SweetXml do
327
365
328
366
- `doc` is an enumerable, data will be pulled during the result stream
329
367
enumeration. e.g. `File.stream!("some_file.xml")`
330
- - `options_callback` is an anonymous function `fn emit -> xmerl_opts` use it to
368
+ - `options_callback` is an anonymous function `fn emit -> ( xmerl_opts | opts) ` use it to
331
369
define your :xmerl callbacks and put data into the stream using
332
- `emit.(elem)` in the callbacks.
370
+ `emit.(elem)` in the callbacks. More details are available with `parse/2`.
333
371
334
372
For example, here you define a stream of all `xmlElement` :
335
373
@@ -358,33 +396,43 @@ defmodule SweetXml do
358
396
Stream . resource fn ->
359
397
{ parent , ref } = waiter = { self ( ) , make_ref ( ) }
360
398
opts = options_callback . ( fn e -> send ( parent , { :event , ref , e } ) end )
399
+
400
+ ets = :ets . new ( nil , [ :public ] )
401
+ dtd_arg = :proplists . get_value ( :dtd , opts , :all )
402
+ opts = :proplists . delete ( :dtd , opts )
403
+ opts = SweetXml.Options . handle_dtd ( dtd_arg ) . ( ets ) ++ opts
404
+
361
405
pid = spawn_link fn -> :xmerl_scan . string ( '' , opts ++ continuation_opts ( doc , waiter ) ) end
362
- { ref , pid , Process . monitor ( pid ) }
363
- end , fn { ref , pid , monref } = acc ->
406
+ { ref , pid , Process . monitor ( pid ) , ets }
407
+ end , fn { ref , pid , monref , ets } = acc ->
364
408
receive do
365
409
{ :DOWN , ^ monref , _ , _ , _ } ->
366
- { :halt , :parse_ended } ## !!! maybe do something when reason !== :normal
410
+ { :halt , { :parse_ended , ets } } ## !!! maybe do something when reason !== :normal
367
411
{ :event , ^ ref , event } ->
368
412
{ [ event ] , acc }
369
413
{ :wait , ^ ref } ->
370
414
send ( pid , { :continue , ref } )
371
415
{ [ ] , acc }
372
416
end
373
417
end , fn
374
- :parse_ended -> :ok
375
- { ref , pid , monref } ->
418
+ { :parse_ended , ets } ->
419
+ _ = :ets . delete ( ets )
420
+ :ok
421
+
422
+ { ref , pid , monref , ets } ->
376
423
Process . demonitor ( monref )
424
+ _ = :ets . delete ( ets )
377
425
flush_halt ( pid , ref )
378
426
end
379
427
end
380
428
381
429
@ doc ~S"""
382
- `xpath` allows you to query an XML document with xpath .
430
+ `xpath` allows you to query an XML document with XPath .
383
431
384
- The second argument to xpath is a `SweetXpath` struct. The optional third
432
+ The second argument to xpath is a `% SweetXpath{} ` struct. The optional third
385
433
argument is a keyword list, such that the value of each keyword is also
386
- either a `SweetXpath` or a list with head being a `SweetXpath` and tail being
387
- another keyword list exactly like before. Please see examples below for better
434
+ either a `% SweetXpath{} ` or a list with head being a `% SweetXpath{} ` and tail being
435
+ another keyword list exactly like before. Please see the examples below for better
388
436
understanding.
389
437
390
438
## Examples
@@ -417,32 +465,49 @@ defmodule SweetXml do
417
465
...> )
418
466
%{ul: %{a: 'Two'}}
419
467
468
+ ## Security
469
+
470
+ Whenever you are working with some xml that was not generated by your system,
471
+ it is highly recommended that you restrict some functionalities of XML
472
+ during the parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
473
+ Unless you know exactly what kind of DTD you want to permit in your xml,
474
+ it is recommended that you use the following code example to prevent possible attacks:
475
+ ```
476
+ doc
477
+ |> parse(dtd: :none)
478
+ |> xpath(spec, subspec)
479
+ ```
480
+ For more details, see `parse/2`.
420
481
"""
421
- def xpath ( parent , spec ) when not is_tuple ( parent ) do
482
+ @ spec xpath ( parent :: ( doc | xmlElement ) , spec , subspec ) :: any
483
+ when subspec: keyword ( spec | subspec )
484
+ def xpath ( parent , spec , subspec \\ [ ] )
485
+
486
+ def xpath ( parent , spec , [ ] ) when not is_tuple ( parent ) do
422
487
parent |> parse |> xpath ( spec )
423
488
end
424
489
425
- def xpath ( parent , % SweetXpath { is_list: true , is_value: true , cast_to: cast , is_optional: is_opt? } = spec ) do
490
+ def xpath ( parent , % SweetXpath { is_list: true , is_value: true , cast_to: cast , is_optional: is_opt? } = spec , [ ] ) do
426
491
get_current_entities ( parent , spec ) |> Enum . map ( & ( _value ( & 1 ) ) |> to_cast ( cast , is_opt? ) ) |> spec . transform_fun . ( )
427
492
end
428
493
429
- def xpath ( parent , % SweetXpath { is_list: true , is_value: false } = spec ) do
494
+ def xpath ( parent , % SweetXpath { is_list: true , is_value: false } = spec , [ ] ) do
430
495
get_current_entities ( parent , spec ) |> spec . transform_fun . ( )
431
496
end
432
497
433
- def xpath ( parent , % SweetXpath { is_list: false , is_value: true , cast_to: string_type , is_optional: is_opt? } = spec ) when string_type in [ :string , :soft_string ] do
498
+ def xpath ( parent , % SweetXpath { is_list: false , is_value: true , cast_to: string_type , is_optional: is_opt? } = spec , [ ] ) when string_type in [ :string , :soft_string ] do
434
499
spec = % SweetXpath { spec | is_list: true }
435
500
get_current_entities ( parent , spec )
436
501
|> Enum . map ( & ( _value ( & 1 ) |> to_cast ( string_type , is_opt? ) ) )
437
502
|> Enum . join
438
503
|> spec . transform_fun . ( )
439
504
end
440
505
441
- def xpath ( parent , % SweetXpath { is_list: false , is_value: true , cast_to: cast , is_optional: is_opt? } = spec ) do
506
+ def xpath ( parent , % SweetXpath { is_list: false , is_value: true , cast_to: cast , is_optional: is_opt? } = spec , [ ] ) do
442
507
get_current_entities ( parent , spec ) |> _value |> to_cast ( cast , is_opt? ) |> spec . transform_fun . ( )
443
508
end
444
509
445
- def xpath ( parent , % SweetXpath { is_list: false , is_value: false } = spec ) do
510
+ def xpath ( parent , % SweetXpath { is_list: false , is_value: false } = spec , [ ] ) do
446
511
get_current_entities ( parent , spec ) |> spec . transform_fun . ( )
447
512
end
448
513
@@ -457,11 +522,13 @@ defmodule SweetXml do
457
522
end
458
523
459
524
@ doc ~S"""
460
- `xmap` returns a mapping with each value being the result of `xpath`
525
+ `xmap` returns a mapping with each value being the result of `xpath`.
461
526
462
- Just as `xpath`, you can nest the mapping structure. Please see `xpath` for
527
+ Just as `xpath`, you can nest the mapping structure. Please see `xpath/3 ` for
463
528
more detail.
464
529
530
+ You can give the option `true` to get the result as a keyword list instead of a map.
531
+
465
532
## Examples
466
533
467
534
Simple:
@@ -509,8 +576,24 @@ defmodule SweetXml do
509
576
...> ]
510
577
...> ], true)
511
578
[message: 'Message', ul: %{a: 'Two'}]
579
+
580
+ ## Security
581
+
582
+ Whenever you are working with some xml that was not generated by your system,
583
+ it is highly recommended that you restrict some functionalities of XML
584
+ during the parsing. In particular, SweetXml lets you prevent DTD parsing and fetching.
585
+ Unless you know exactly what kind of DTD you want to permit in your xml,
586
+ it is recommended that you use the following code example to prevent possible attacks:
587
+ ```
588
+ doc
589
+ |> parse(dtd: :none)
590
+ |> xmap(specs, options)
591
+ ```
592
+ For more details, see `parse/2`.
512
593
"""
513
- def xmap ( parent , mapping ) , do: xmap ( parent , mapping , % { is_keyword: false } )
594
+ @ spec xmap ( parent :: ( doc | xmlElement ) , mapping :: specs , options :: ( boolean | map ) ) :: ( map | keyword )
595
+ when specs: keyword ( spec | specs )
596
+ def xmap ( parent , mapping , options \\ false )
514
597
515
598
def xmap ( nil , _ , % { is_optional: true } ) , do: nil
516
599
0 commit comments