Skip to content

Commit 4a153a1

Browse files
vstinner and tdwyer authored
[CVE-2023-27043] gh-102988: Reject malformed addresses in email.parseaddr() (#111116)
Detect email address parsing errors and return empty tuple to indicate the parsing error (old API). Add an optional 'strict' parameter to getaddresses() and parseaddr() functions. Patch by Thomas Dwyer. Co-Authored-By: Thomas Dwyer <[email protected]>
1 parent 4026ad5 commit 4a153a1

File tree

5 files changed

+357
-21
lines changed

5 files changed

+357
-21
lines changed

Doc/library/email.utils.rst

+15-4
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,18 @@ of the new API.
5858
begins with angle brackets, they are stripped off.
5959

6060

61-
.. function:: parseaddr(address)
61+
.. function:: parseaddr(address, *, strict=True)
6262

6363
Parse address -- which should be the value of some address-containing field such
6464
as :mailheader:`To` or :mailheader:`Cc` -- into its constituent *realname* and
6565
*email address* parts. Returns a tuple of that information, unless the parse
6666
fails, in which case a 2-tuple of ``('', '')`` is returned.
6767

68+
If *strict* is true, use a strict parser which rejects malformed inputs.
69+
70+
.. versionchanged:: 3.13
71+
Add *strict* optional parameter and reject malformed inputs by default.
72+
6873

6974
.. function:: formataddr(pair, charset='utf-8')
7075

@@ -82,12 +87,15 @@ of the new API.
8287
Added the *charset* option.
8388

8489

85-
.. function:: getaddresses(fieldvalues)
90+
.. function:: getaddresses(fieldvalues, *, strict=True)
8691

8792
This method returns a list of 2-tuples of the form returned by ``parseaddr()``.
8893
*fieldvalues* is a sequence of header field values as might be returned by
89-
:meth:`Message.get_all <email.message.Message.get_all>`. Here's a simple
90-
example that gets all the recipients of a message::
94+
:meth:`Message.get_all <email.message.Message.get_all>`.
95+
96+
If *strict* is true, use a strict parser which rejects malformed inputs.
97+
98+
Here's a simple example that gets all the recipients of a message::
9199

92100
from email.utils import getaddresses
93101

@@ -97,6 +105,9 @@ of the new API.
97105
resent_ccs = msg.get_all('resent-cc', [])
98106
all_recipients = getaddresses(tos + ccs + resent_tos + resent_ccs)
99107

108+
.. versionchanged:: 3.13
109+
Add *strict* optional parameter and reject malformed inputs by default.
110+
100111

101112
.. function:: parsedate(date)
102113

Doc/whatsnew/3.13.rst

+13
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,19 @@ doctest
199199
:attr:`doctest.TestResults.skipped` attributes.
200200
(Contributed by Victor Stinner in :gh:`108794`.)
201201

202+
email
203+
-----
204+
205+
* :func:`email.utils.getaddresses` and :func:`email.utils.parseaddr` now return
206+
``('', '')`` 2-tuples in more situations where invalid email addresses are
207+
encountered instead of potentially inaccurate values. Add optional *strict*
208+
parameter to these two functions: use ``strict=False`` to get the old
209+
behavior and accept malformed inputs.
210+
``getattr(email.utils, 'supports_strict_parsing', False)`` can be used to
211+
check if the *strict* parameter is available.
212+
(Contributed by Thomas Dwyer and Victor Stinner for :gh:`102988` to improve
213+
the CVE-2023-27043 fix.)
214+
202215
glob
203216
----
204217

Lib/email/utils.py

+142-9
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
specialsre = re.compile(r'[][\\()<>@,:;".]')
4444
escapesre = re.compile(r'[\\"]')
4545

46+
4647
def _has_surrogates(s):
4748
"""Return True if s may contain surrogate-escaped binary data."""
4849
# This check is based on the fact that unless there are surrogates, utf8
@@ -103,12 +104,127 @@ def formataddr(pair, charset='utf-8'):
103104
return address
104105

105106

107+
def _iter_escaped_chars(addr):
108+
pos = 0
109+
escape = False
110+
for pos, ch in enumerate(addr):
111+
if escape:
112+
yield (pos, '\\' + ch)
113+
escape = False
114+
elif ch == '\\':
115+
escape = True
116+
else:
117+
yield (pos, ch)
118+
if escape:
119+
yield (pos, '\\')
120+
121+
122+
def _strip_quoted_realnames(addr):
123+
"""Strip real names between quotes."""
124+
if '"' not in addr:
125+
# Fast path
126+
return addr
127+
128+
start = 0
129+
open_pos = None
130+
result = []
131+
for pos, ch in _iter_escaped_chars(addr):
132+
if ch == '"':
133+
if open_pos is None:
134+
open_pos = pos
135+
else:
136+
if start != open_pos:
137+
result.append(addr[start:open_pos])
138+
start = pos + 1
139+
open_pos = None
140+
141+
if start < len(addr):
142+
result.append(addr[start:])
143+
144+
return ''.join(result)
106145

107-
def getaddresses(fieldvalues):
108-
"""Return a list of (REALNAME, EMAIL) for each fieldvalue."""
109-
all = COMMASPACE.join(str(v) for v in fieldvalues)
110-
a = _AddressList(all)
111-
return a.addresslist
146+
147+
# Feature flag: its presence lets callers detect, for example with
# getattr(email.utils, 'supports_strict_parsing', False), that parseaddr()
# and getaddresses() accept the keyword-only *strict* parameter.
supports_strict_parsing = True
148+
149+
def getaddresses(fieldvalues, *, strict=True):
    """Return a list of (REALNAME, EMAIL) tuples parsed from fieldvalues.

    Every item of fieldvalues is converted with str() and the items are
    joined with COMMASPACE before being handed to the address-list parser.

    If strict is false, the parser's address list is returned unchanged.

    If strict is true, the values are validated before parsing and the
    parsed entries are validated afterwards; an entry that fails validation
    is replaced by ('', '').  In addition, when the number of parsed
    addresses differs from the number expected from the input, the whole
    result is considered unreliable and [('', '')] is returned instead.
    """
    values = [str(v) for v in fieldvalues]

    if not strict:
        return _AddressList(COMMASPACE.join(values)).addresslist

    values = _pre_parse_validation(values)
    parsed = _post_parse_validation(
        _AddressList(COMMASPACE.join(values)).addresslist)

    # Each field value is expected to yield one address plus one more per
    # comma.  A comma inside a quoted real name is not a delimiter, so the
    # quoted parts are stripped before the commas are counted.
    expected = sum(
        1 + _strip_quoted_realnames(value).count(',') for value in values)

    # A mismatch means the parser split (or merged) addresses in a way the
    # input does not justify; return a single empty pair rather than
    # potentially misleading addresses.
    if len(parsed) != expected:
        return [('', '')]

    return parsed
191+
192+
193+
def _check_parenthesis(addr):
    """Return True if the parentheses in addr are balanced.

    Parentheses inside double-quoted sections are ignored, and so are
    parentheses preceded by a backslash.  A closing parenthesis that has
    no matching opening parenthesis makes the check fail immediately.
    """
    depth = 0
    for _, token in _iter_escaped_chars(_strip_quoted_realnames(addr)):
        if token == ')':
            if depth == 0:
                # Closing parenthesis without a matching opening one.
                return False
            depth -= 1
        elif token == '(':
            depth += 1
    # Balanced only if every opening parenthesis has been closed.
    return depth == 0
206+
207+
208+
def _pre_parse_validation(email_header_fields):
209+
accepted_values = []
210+
for v in email_header_fields:
211+
if not _check_parenthesis(v):
212+
v = "('', '')"
213+
accepted_values.append(v)
214+
215+
return accepted_values
216+
217+
218+
def _post_parse_validation(parsed_email_header_tuples):
219+
accepted_values = []
220+
# The parser would have parsed a correctly formatted domain-literal
221+
# The existence of an [ after parsing indicates a parsing failure
222+
for v in parsed_email_header_tuples:
223+
if '[' in v[1]:
224+
v = ('', '')
225+
accepted_values.append(v)
226+
227+
return accepted_values
112228

113229

114230
def _format_timetuple_and_zone(timetuple, zone):
@@ -207,16 +323,33 @@ def parsedate_to_datetime(data):
207323
tzinfo=datetime.timezone(datetime.timedelta(seconds=tz)))
208324

209325

210-
def parseaddr(addr):
326+
def parseaddr(addr, *, strict=True):
    """
    Parse addr into its constituent realname and email address parts.

    Return a tuple of realname and email address, unless the parse fails, in
    which case return a 2-tuple of ('', '').

    If strict is True, use a strict parser which rejects malformed inputs.
    In strict mode a list argument is reduced to its first item (an empty
    list counts as a parse failure), any value that is not a string is
    rejected, and an input that yields anything other than exactly one
    address is rejected.
    """
    if not strict:
        addrs = _AddressList(addr).addresslist
        if not addrs:
            return ('', '')
        return addrs[0]

    if isinstance(addr, list):
        if not addr:
            # An empty list has no first item; report a parse failure
            # instead of letting addr[0] raise IndexError.
            return ('', '')
        addr = addr[0]

    if not isinstance(addr, str):
        return ('', '')

    addr = _pre_parse_validation([addr])[0]
    addrs = _post_parse_validation(_AddressList(addr).addresslist)

    # Exactly one address is expected; none or several means the input
    # was malformed.
    if len(addrs) != 1:
        return ('', '')

    return addrs[0]
221354

222355

0 commit comments

Comments
 (0)