8
8
9
9
private import python
10
10
private import semmle.python.dataflow.new.DataFlow
11
+ private import semmle.python.dataflow.new.TaintTracking
11
12
private import semmle.python.Concepts
12
13
private import semmle.python.ApiGraphs
13
14
private import semmle.python.frameworks.data.ModelsAsData
@@ -68,16 +69,9 @@ module Lxml {
68
69
*/
69
70
class XPathCall extends XML:: XPathExecution:: Range , DataFlow:: CallCfgNode {
70
71
XPathCall ( ) {
71
- exists ( API:: Node parseResult |
72
- parseResult =
73
- etreeRef ( ) .getMember ( [ "parse" , "fromstring" , "fromstringlist" , "HTML" , "XML" ] ) .getReturn ( )
74
- or
75
- // TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
76
- // but we don't really have a way to model that nicely.
77
- parseResult = etreeRef ( ) .getMember ( "XMLParser" ) .getReturn ( ) .getMember ( "close" ) .getReturn ( )
78
- |
79
- this = parseResult .getMember ( "xpath" ) .getACall ( )
80
- )
72
+ // TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
73
+ // but we don't really have a way to model that nicely.
74
+ this = [ Element:: instance ( ) , ElementTree:: instance ( ) ] .getMember ( "xpath" ) .getACall ( )
81
75
}
82
76
83
77
override DataFlow:: Node getXPath ( ) { result in [ this .getArg ( 0 ) , this .getArgByName ( "_path" ) ] }
@@ -209,31 +203,37 @@ module Lxml {
209
203
* A call to either of:
210
204
* - `lxml.etree.fromstring`
211
205
* - `lxml.etree.fromstringlist`
206
+ * - `lxml.etree.HTML`
212
207
* - `lxml.etree.XML`
213
208
* - `lxml.etree.XMLID`
209
+ * - `lxml.etree.XMLDTDID`
214
210
* - `lxml.etree.parse`
215
211
* - `lxml.etree.parseid`
216
212
*
217
213
* See
218
214
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
219
215
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist
216
+ * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
220
217
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML
221
218
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID
219
+ * - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID
222
220
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
223
221
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
224
222
*/
225
223
private class LxmlParsing extends DataFlow:: CallCfgNode , XML:: XmlParsing:: Range {
226
224
string functionName ;
227
225
228
226
LxmlParsing ( ) {
229
- functionName in [ "fromstring" , "fromstringlist" , "XML" , "XMLID" , "parse" , "parseid" ] and
227
+ functionName in [
228
+ "fromstring" , "fromstringlist" , "HTML" , "XML" , "XMLID" , "XMLDTDID" , "parse" , "parseid"
229
+ ] and
230
230
this = etreeRef ( ) .getMember ( functionName ) .getACall ( )
231
231
}
232
232
233
233
override DataFlow:: Node getAnInput ( ) {
234
234
result in [
235
235
this .getArg ( 0 ) ,
236
- // fromstring / XML / XMLID
236
+ // fromstring / HTML / XML / XMLID / XMLDTDID
237
237
this .getArgByName ( "text" ) ,
238
238
// fromstringlist
239
239
this .getArgByName ( "strings" ) ,
@@ -248,7 +248,8 @@ module Lxml {
248
248
this .getParserArg ( ) = XmlParser:: instanceVulnerableTo ( kind )
249
249
or
250
250
kind .isXxe ( ) and
251
- not exists ( this .getParserArg ( ) )
251
+ not exists ( this .getParserArg ( ) ) and
252
+ not functionName = "HTML"
252
253
}
253
254
254
255
override predicate mayExecuteInput ( ) { none ( ) }
@@ -318,4 +319,183 @@ module Lxml {
318
319
319
320
override DataFlow:: Node getAPathArgument ( ) { result = this .getAnInput ( ) }
320
321
}
322
+
323
+ /** Provides models for the `lxml.etree.Element` class. */
324
+ module Element {
325
+ /** Gets a reference to the `Element` class. */
326
+ API:: Node classRef ( ) { result = etreeRef ( ) .getMember ( [ "Element" , "_Element" ] ) }
327
+
328
+ /**
329
+ * A source of `lxml.etree.Element` instances, extend this class to model new instances.
330
+ *
331
+ * This can include instantiations of the class, return values from function
332
+ * calls, or a special parameter that will be set when functions are called by an external
333
+ * library.
334
+ *
335
+ * Use the predicate `Element::instance()` to get references to instances of `lxml.etree.Element` instances.
336
+ */
337
+ abstract class InstanceSource instanceof API:: Node {
338
+ /** Gets a textual representation of this element. */
339
+ string toString ( ) { result = super .toString ( ) }
340
+ }
341
+
342
+ /** Gets a reference to an `lxml.etree.Element` instance. */
343
+ API:: Node instance ( ) { result instanceof InstanceSource }
344
+
345
+ /** An `Element` instantiated directly. */
346
+ private class ElementInstance extends InstanceSource {
347
+ ElementInstance ( ) { this = classRef ( ) .getAnInstance ( ) }
348
+ }
349
+
350
+ /** The result of a parse operation that returns an `Element`. */
351
+ private class ParseResult extends InstanceSource {
352
+ ParseResult ( ) {
353
+ // TODO: The XmlParser module does not currently use API graphs
354
+ this =
355
+ [
356
+ etreeRef ( ) .getMember ( "XMLParser" ) .getAnInstance ( ) ,
357
+ etreeRef ( ) .getMember ( "get_default_parser" ) .getReturn ( )
358
+ ] .getMember ( "close" ) .getReturn ( )
359
+ or
360
+ // TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`.
361
+ // `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`.
362
+ this = etreeRef ( ) .getMember ( [ "XML" , "HTML" , "fromstring" , "fromstringlist" ] ) .getReturn ( )
363
+ }
364
+ }
365
+
366
+ /** A call to a method on an `Element` that returns another `Element`. */
367
+ private class ElementMethod extends InstanceSource {
368
+ ElementMethod ( ) {
369
+ // an Element is an iterator of Elements
370
+ this = instance ( ) .getASubscript ( )
371
+ or
372
+ // methods that return an Element
373
+ this = instance ( ) .getMember ( [ "find" , "getnext" , "getprevious" , "getparent" ] ) .getReturn ( )
374
+ or
375
+ // methods that return an iterator of Elements
376
+ this =
377
+ instance ( )
378
+ .getMember ( [
379
+ "cssselect" , "findall" , "getchildren" , "getiterator" , "iter" , "iterancestors" ,
380
+ "iterdecendants" , "iterchildren" , "itersiblings" , "iterfind" , "xpath"
381
+ ] )
382
+ .getReturn ( )
383
+ .getASubscript ( )
384
+ }
385
+ }
386
+
387
+ /** A call to a method on an `ElementTree` that returns an `Element`. */
388
+ private class ElementTreeMethod extends InstanceSource {
389
+ ElementTreeMethod ( ) {
390
+ this = ElementTree:: instance ( ) .getMember ( [ "getroot" , "find" ] ) .getReturn ( )
391
+ or
392
+ this =
393
+ ElementTree:: instance ( )
394
+ .getMember ( [ "findall" , "getiterator" , "iter" , "iterfind" , "xpath" ] )
395
+ .getReturn ( )
396
+ .getASubscript ( )
397
+ }
398
+ }
399
+
400
+ /**
401
+ * An additional taint step from an `Element` instance.
402
+ * See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase.
403
+ */
404
+ private class ElementTaintStep extends TaintTracking:: AdditionalTaintStep {
405
+ override predicate step ( DataFlow:: Node nodeFrom , DataFlow:: Node nodeTo ) {
406
+ exists ( DataFlow:: MethodCallNode call |
407
+ nodeTo = call and instance ( ) .asSource ( ) .flowsTo ( nodeFrom )
408
+ |
409
+ call .calls ( nodeFrom ,
410
+ // We consider a node to be tainted if there could be taint anywhere in the element tree;
411
+ // so sibling nodes (e.g. `getnext`) are also tainted.
412
+ // This ensures nodes like `elem[0].getnext()` are tracked.
413
+ [
414
+ "cssselect" , "find" , "findall" , "findtext" , "get" , "getchildren" , "getiterator" ,
415
+ "getnext" , "getparent" , "getprevious" , "getroottree" , "items" , "iter" ,
416
+ "iterancestors" , "iterchildren" , "iterdescendants" , "itersiblings" , "iterfind" ,
417
+ "itertext" , "keys" , "values" , "xpath"
418
+ ] )
419
+ )
420
+ or
421
+ exists ( DataFlow:: AttrRead attr | nodeTo = attr and instance ( ) .asSource ( ) .flowsTo ( nodeFrom ) |
422
+ attr .accesses ( nodeFrom , [ "attrib" , "base" , "nsmap" , "prefix" , "tag" , "tail" , "text" ] )
423
+ )
424
+ }
425
+ }
426
+ }
427
+
428
+ /** Provides models for the `lxml.etree.ElementTree` class. */
429
+ module ElementTree {
430
+ /** Gets a reference to the `ElementTree` class. */
431
+ API:: Node classRef ( ) { result = etreeRef ( ) .getMember ( [ "ElementTree" , "_ElementTree" ] ) }
432
+
433
+ /**
434
+ * A source of `lxml.etree.ElementTree` instances; extend this class to model new instances.
435
+ *
436
+ * This can include instantiations of the class, return values from function
437
+ * calls, or a special parameter that will be set when functions are called by an external
438
+ * library.
439
+ *
440
+ * Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances.
441
+ */
442
+ abstract class InstanceSource instanceof API:: Node {
443
+ /** Gets a textual representation of this element. */
444
+ string toString ( ) { result = super .toString ( ) }
445
+ }
446
+
447
+ /** Gets a reference to an `lxml.etree.ElementTree` instance. */
448
+ API:: Node instance ( ) { result instanceof InstanceSource }
449
+
450
+ /** An `ElementTree` instantiated directly. */
451
+ private class ElementTreeInstance extends InstanceSource {
452
+ ElementTreeInstance ( ) { this = classRef ( ) .getAnInstance ( ) }
453
+ }
454
+
455
+ /** The result of a parse operation that returns an `ElementTree`. */
456
+ private class ParseResult extends InstanceSource {
457
+ ParseResult ( ) { this = etreeRef ( ) .getMember ( "parse" ) .getReturn ( ) }
458
+ }
459
+
460
+ /** A call to a method on an `Element` that returns another `Element`. */
461
+ private class ElementMethod extends InstanceSource {
462
+ ElementMethod ( ) { this = Element:: instance ( ) .getMember ( "getroottree" ) .getReturn ( ) }
463
+ }
464
+
465
+ /** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */
466
+ private class ElementTaintStep extends TaintTracking:: AdditionalTaintStep {
467
+ override predicate step ( DataFlow:: Node nodeFrom , DataFlow:: Node nodeTo ) {
468
+ exists ( DataFlow:: MethodCallNode call |
469
+ nodeTo = call and instance ( ) .asSource ( ) .flowsTo ( nodeFrom )
470
+ |
471
+ call .calls ( nodeFrom ,
472
+ [
473
+ "find" , "findall" , "findtext" , "get" , "getiterator" , "getroot" , "iter" , "iterfind" ,
474
+ "xpath"
475
+ ] )
476
+ )
477
+ or
478
+ exists ( DataFlow:: AttrRead attr | nodeTo = attr and instance ( ) .asSource ( ) .flowsTo ( nodeFrom ) |
479
+ attr .accesses ( nodeFrom , "docinfo" )
480
+ )
481
+ }
482
+ }
483
+ }
484
+
485
+ /** A call to serialise xml to a string */
486
+ private class XmlEncoding extends Encoding:: Range , DataFlow:: CallCfgNode {
487
+ XmlEncoding ( ) {
488
+ this = etreeRef ( ) .getMember ( [ "tostring" , "tostringlist" , "tounicode" ] ) .getACall ( )
489
+ }
490
+
491
+ override DataFlow:: Node getAnInput ( ) {
492
+ result = [ this .getArg ( 0 ) , this .getArgByName ( "element_or_tree" ) ]
493
+ }
494
+
495
+ override DataFlow:: Node getOutput ( ) { result = this }
496
+
497
+ override string getFormat ( ) { result = "XML" }
498
+ }
499
+ // TODO: ElementTree.write can write to a file-like object; should that be a flow step?
500
+ // It also can accept a filepath which could be a path injection sink.
321
501
}
0 commit comments