Skip to content

Commit a7fb73a

Browse files
Merge pull request #18185 from joefarebrother/python-lxml
Python: Model additional flow steps for the lxml framework
2 parents 303b11e + 35961e4 commit a7fb73a

File tree

5 files changed

+356
-13
lines changed

5 files changed

+356
-13
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
- Additional taint steps through methods of `lxml.etree.Element` and `lxml.etree.ElementTree` objects from the `lxml` PyPI package have been modeled.

python/ql/lib/semmle/python/frameworks/Lxml.qll

+193-13
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
private import python
1010
private import semmle.python.dataflow.new.DataFlow
11+
private import semmle.python.dataflow.new.TaintTracking
1112
private import semmle.python.Concepts
1213
private import semmle.python.ApiGraphs
1314
private import semmle.python.frameworks.data.ModelsAsData
@@ -68,16 +69,9 @@ module Lxml {
6869
*/
6970
class XPathCall extends XML::XPathExecution::Range, DataFlow::CallCfgNode {
7071
XPathCall() {
71-
exists(API::Node parseResult |
72-
parseResult =
73-
etreeRef().getMember(["parse", "fromstring", "fromstringlist", "HTML", "XML"]).getReturn()
74-
or
75-
// TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
76-
// but we don't really have a way to model that nicely.
77-
parseResult = etreeRef().getMember("XMLParser").getReturn().getMember("close").getReturn()
78-
|
79-
this = parseResult.getMember("xpath").getACall()
80-
)
72+
// TODO: lxml.etree.parseid(<text>)[0] will contain the root element from parsing <text>
73+
// but we don't really have a way to model that nicely.
74+
this = [Element::instance(), ElementTree::instance()].getMember("xpath").getACall()
8175
}
8276

8377
override DataFlow::Node getXPath() { result in [this.getArg(0), this.getArgByName("_path")] }
@@ -209,31 +203,37 @@ module Lxml {
209203
* A call to either of:
210204
* - `lxml.etree.fromstring`
211205
* - `lxml.etree.fromstringlist`
206+
* - `lxml.etree.HTML`
212207
* - `lxml.etree.XML`
213208
* - `lxml.etree.XMLID`
209+
* - `lxml.etree.XMLDTDID`
214210
* - `lxml.etree.parse`
215211
* - `lxml.etree.parseid`
216212
*
217213
* See
218214
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstring
219215
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.fromstringlist
216+
* - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.HTML
220217
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XML
221218
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.XMLID
219+
* - https://lxml.de/apidoc/lxml.etree.html#lxml.etree.XMLDTDID
222220
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parse
223221
* - https://lxml.de/apidoc/lxml.etree.html?highlight=parseids#lxml.etree.parseid
224222
*/
225223
private class LxmlParsing extends DataFlow::CallCfgNode, XML::XmlParsing::Range {
226224
string functionName;
227225

228226
LxmlParsing() {
229-
functionName in ["fromstring", "fromstringlist", "XML", "XMLID", "parse", "parseid"] and
227+
functionName in [
228+
"fromstring", "fromstringlist", "HTML", "XML", "XMLID", "XMLDTDID", "parse", "parseid"
229+
] and
230230
this = etreeRef().getMember(functionName).getACall()
231231
}
232232

233233
override DataFlow::Node getAnInput() {
234234
result in [
235235
this.getArg(0),
236-
// fromstring / XML / XMLID
236+
// fromstring / HTML / XML / XMLID / XMLDTDID
237237
this.getArgByName("text"),
238238
// fromstringlist
239239
this.getArgByName("strings"),
@@ -248,7 +248,8 @@ module Lxml {
248248
this.getParserArg() = XmlParser::instanceVulnerableTo(kind)
249249
or
250250
kind.isXxe() and
251-
not exists(this.getParserArg())
251+
not exists(this.getParserArg()) and
252+
not functionName = "HTML"
252253
}
253254

254255
override predicate mayExecuteInput() { none() }
@@ -318,4 +319,183 @@ module Lxml {
318319

319320
override DataFlow::Node getAPathArgument() { result = this.getAnInput() }
320321
}
322+
323+
/** Provides models for the `lxml.etree.Element` class. */
324+
module Element {
325+
/** Gets a reference to the `Element` class. */
326+
API::Node classRef() { result = etreeRef().getMember(["Element", "_Element"]) }
327+
328+
/**
329+
* A source of `lxml.etree.Element` instances, extend this class to model new instances.
330+
*
331+
* This can include instantiations of the class, return values from function
332+
* calls, or a special parameter that will be set when functions are called by an external
333+
* library.
334+
*
335+
* Use the predicate `Element::instance()` to get references to instances of `lxml.etree.Element` instances.
336+
*/
337+
abstract class InstanceSource instanceof API::Node {
338+
/** Gets a textual representation of this element. */
339+
string toString() { result = super.toString() }
340+
}
341+
342+
/** Gets a reference to an `lxml.etree.Element` instance. */
343+
API::Node instance() { result instanceof InstanceSource }
344+
345+
/** An `Element` instantiated directly. */
346+
private class ElementInstance extends InstanceSource {
347+
ElementInstance() { this = classRef().getAnInstance() }
348+
}
349+
350+
/** The result of a parse operation that returns an `Element`. */
351+
private class ParseResult extends InstanceSource {
352+
ParseResult() {
353+
// TODO: The XmlParser module does not currently use API graphs
354+
this =
355+
[
356+
etreeRef().getMember("XMLParser").getAnInstance(),
357+
etreeRef().getMember("get_default_parser").getReturn()
358+
].getMember("close").getReturn()
359+
or
360+
// TODO: `XMLID`, `XMLDTDID`, `parseid` returns a tuple of which the first element is an `Element`.
361+
// `iterparse` returns an iterator of tuples, each of which has a second element that is an `Element`.
362+
this = etreeRef().getMember(["XML", "HTML", "fromstring", "fromstringlist"]).getReturn()
363+
}
364+
}
365+
366+
/** A call to a method on an `Element` that returns another `Element`. */
367+
private class ElementMethod extends InstanceSource {
368+
ElementMethod() {
369+
// an Element is an iterator of Elements
370+
this = instance().getASubscript()
371+
or
372+
// methods that return an Element
373+
this = instance().getMember(["find", "getnext", "getprevious", "getparent"]).getReturn()
374+
or
375+
// methods that return an iterator of Elements
376+
this =
377+
instance()
378+
.getMember([
379+
"cssselect", "findall", "getchildren", "getiterator", "iter", "iterancestors",
380+
"iterdecendants", "iterchildren", "itersiblings", "iterfind", "xpath"
381+
])
382+
.getReturn()
383+
.getASubscript()
384+
}
385+
}
386+
387+
/** A call to a method on an `ElementTree` that returns an `Element`. */
388+
private class ElementTreeMethod extends InstanceSource {
389+
ElementTreeMethod() {
390+
this = ElementTree::instance().getMember(["getroot", "find"]).getReturn()
391+
or
392+
this =
393+
ElementTree::instance()
394+
.getMember(["findall", "getiterator", "iter", "iterfind", "xpath"])
395+
.getReturn()
396+
.getASubscript()
397+
}
398+
}
399+
400+
/**
401+
* An additional taint step from an `Element` instance.
402+
* See https://lxml.de/apidoc/lxml.etree.html#lxml.etree.ElementBase.
403+
*/
404+
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
405+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
406+
exists(DataFlow::MethodCallNode call |
407+
nodeTo = call and instance().asSource().flowsTo(nodeFrom)
408+
|
409+
call.calls(nodeFrom,
410+
// We consider a node to be tainted if there could be taint anywhere in the element tree;
411+
// so sibling nodes (e.g. `getnext`) are also tainted.
412+
// This ensures nodes like `elem[0].getnext()` are tracked.
413+
[
414+
"cssselect", "find", "findall", "findtext", "get", "getchildren", "getiterator",
415+
"getnext", "getparent", "getprevious", "getroottree", "items", "iter",
416+
"iterancestors", "iterchildren", "iterdescendants", "itersiblings", "iterfind",
417+
"itertext", "keys", "values", "xpath"
418+
])
419+
)
420+
or
421+
exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) |
422+
attr.accesses(nodeFrom, ["attrib", "base", "nsmap", "prefix", "tag", "tail", "text"])
423+
)
424+
}
425+
}
426+
}
427+
428+
/** Provides models for the `lxml.etree.ElementTree` class. */
429+
module ElementTree {
430+
/** Gets a reference to the `ElementTree` class. */
431+
API::Node classRef() { result = etreeRef().getMember(["ElementTree", "_ElementTree"]) }
432+
433+
/**
434+
* A source of `lxml.etree.ElementTree` instances; extend this class to model new instances.
435+
*
436+
* This can include instantiations of the class, return values from function
437+
* calls, or a special parameter that will be set when functions are called by an external
438+
* library.
439+
*
440+
* Use the predicate `ElementTree::instance()` to get references to instances of `lxml.etree.ElementTree` instances.
441+
*/
442+
abstract class InstanceSource instanceof API::Node {
443+
/** Gets a textual representation of this element. */
444+
string toString() { result = super.toString() }
445+
}
446+
447+
/** Gets a reference to an `lxml.etree.ElementTree` instance. */
448+
API::Node instance() { result instanceof InstanceSource }
449+
450+
/** An `ElementTree` instantiated directly. */
451+
private class ElementTreeInstance extends InstanceSource {
452+
ElementTreeInstance() { this = classRef().getAnInstance() }
453+
}
454+
455+
/** The result of a parse operation that returns an `ElementTree`. */
456+
private class ParseResult extends InstanceSource {
457+
ParseResult() { this = etreeRef().getMember("parse").getReturn() }
458+
}
459+
460+
/** A call to a method on an `Element` that returns another `Element`. */
461+
private class ElementMethod extends InstanceSource {
462+
ElementMethod() { this = Element::instance().getMember("getroottree").getReturn() }
463+
}
464+
465+
/** An additional taint step from an `ElementTree` instance. See https://lxml.de/apidoc/lxml.etree.html#lxml.etree._ElementTree */
466+
private class ElementTaintStep extends TaintTracking::AdditionalTaintStep {
467+
override predicate step(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
468+
exists(DataFlow::MethodCallNode call |
469+
nodeTo = call and instance().asSource().flowsTo(nodeFrom)
470+
|
471+
call.calls(nodeFrom,
472+
[
473+
"find", "findall", "findtext", "get", "getiterator", "getroot", "iter", "iterfind",
474+
"xpath"
475+
])
476+
)
477+
or
478+
exists(DataFlow::AttrRead attr | nodeTo = attr and instance().asSource().flowsTo(nodeFrom) |
479+
attr.accesses(nodeFrom, "docinfo")
480+
)
481+
}
482+
}
483+
}
484+
485+
/** A call to serialise xml to a string */
486+
private class XmlEncoding extends Encoding::Range, DataFlow::CallCfgNode {
487+
XmlEncoding() {
488+
this = etreeRef().getMember(["tostring", "tostringlist", "tounicode"]).getACall()
489+
}
490+
491+
override DataFlow::Node getAnInput() {
492+
result = [this.getArg(0), this.getArgByName("element_or_tree")]
493+
}
494+
495+
override DataFlow::Node getOutput() { result = this }
496+
497+
override string getFormat() { result = "XML" }
498+
}
499+
// TODO: ElementTree.write can write to a file-like object; should that be a flow step?
500+
// It also can accept a filepath which could be a path injection sink.
321501
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
argumentToEnsureNotTaintedNotMarkedAsSpurious
2+
untaintedArgumentToEnsureTaintedNotMarkedAsMissing
3+
testFailures
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import experimental.meta.InlineTaintTest
2+
import MakeInlineTaintTest<TestTaintTrackingConfig>

0 commit comments

Comments
 (0)