Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit 03318a7

Browse files
richvdhcallahad
andauthored
Merge pull request from GHSA-x345-32rc-8h85
* tests for push rule pattern matching * tests for acl pattern matching * factor out common `re.escape` * Factor out common re.compile * Factor out common anchoring code * add word_boundary support to `glob_to_regex` * Use `glob_to_regex` in push rule evaluator NB that this drops support for character classes. I don't think anyone ever used them. * Improve efficiency of globs with multiple wildcards The idea here is that we compress multiple `*` globs into a single `.*`. We also need to consider `?`, since `*?*` is as hard to implement efficiently as `**`. * add assertion on regex pattern * Fix mypy * Simplify glob_to_regex * Inline the glob_to_regex helper function Signed-off-by: Dan Callahan <[email protected]> * Moar comments Signed-off-by: Dan Callahan <[email protected]> Co-authored-by: Dan Callahan <[email protected]>
1 parent 4df26ab commit 03318a7

File tree

6 files changed

+296
-68
lines changed

6 files changed

+296
-68
lines changed

synapse/config/tls.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import warnings
1818
from datetime import datetime
1919
from hashlib import sha256
20-
from typing import List, Optional
20+
from typing import List, Optional, Pattern
2121

2222
from unpaddedbase64 import encode_base64
2323

@@ -124,7 +124,7 @@ def read_config(self, config: dict, config_dir_path: str, **kwargs):
124124
fed_whitelist_entries = []
125125

126126
# Support globs (*) in whitelist values
127-
self.federation_certificate_verification_whitelist = [] # type: List[str]
127+
self.federation_certificate_verification_whitelist = [] # type: List[Pattern]
128128
for entry in fed_whitelist_entries:
129129
try:
130130
entry_regex = glob_to_regex(entry.encode("ascii").decode("ascii"))

synapse/push/push_rule_evaluator.py

Lines changed: 3 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from synapse.events import EventBase
2121
from synapse.types import UserID
22+
from synapse.util import glob_to_regex, re_word_boundary
2223
from synapse.util.caches.lrucache import LruCache
2324

2425
logger = logging.getLogger(__name__)
@@ -183,7 +184,7 @@ def _contains_display_name(self, display_name: str) -> bool:
183184
r = regex_cache.get((display_name, False, True), None)
184185
if not r:
185186
r1 = re.escape(display_name)
186-
r1 = _re_word_boundary(r1)
187+
r1 = re_word_boundary(r1)
187188
r = re.compile(r1, flags=re.IGNORECASE)
188189
regex_cache[(display_name, False, True)] = r
189190

@@ -212,64 +213,14 @@ def _glob_matches(glob: str, value: str, word_boundary: bool = False) -> bool:
212213
try:
213214
r = regex_cache.get((glob, True, word_boundary), None)
214215
if not r:
215-
r = _glob_to_re(glob, word_boundary)
216+
r = glob_to_regex(glob, word_boundary)
216217
regex_cache[(glob, True, word_boundary)] = r
217218
return bool(r.search(value))
218219
except re.error:
219220
logger.warning("Failed to parse glob to regex: %r", glob)
220221
return False
221222

222223

223-
def _glob_to_re(glob: str, word_boundary: bool) -> Pattern:
224-
"""Generates regex for a given glob.
225-
226-
Args:
227-
glob
228-
word_boundary: Whether to match against word boundaries or entire string.
229-
"""
230-
if IS_GLOB.search(glob):
231-
r = re.escape(glob)
232-
233-
r = r.replace(r"\*", ".*?")
234-
r = r.replace(r"\?", ".")
235-
236-
# handle [abc], [a-z] and [!a-z] style ranges.
237-
r = GLOB_REGEX.sub(
238-
lambda x: (
239-
"[%s%s]" % (x.group(1) and "^" or "", x.group(2).replace(r"\\\-", "-"))
240-
),
241-
r,
242-
)
243-
if word_boundary:
244-
r = _re_word_boundary(r)
245-
246-
return re.compile(r, flags=re.IGNORECASE)
247-
else:
248-
r = "^" + r + "$"
249-
250-
return re.compile(r, flags=re.IGNORECASE)
251-
elif word_boundary:
252-
r = re.escape(glob)
253-
r = _re_word_boundary(r)
254-
255-
return re.compile(r, flags=re.IGNORECASE)
256-
else:
257-
r = "^" + re.escape(glob) + "$"
258-
return re.compile(r, flags=re.IGNORECASE)
259-
260-
261-
def _re_word_boundary(r: str) -> str:
262-
"""
263-
Adds word boundary characters to the start and end of an
264-
expression to require that the match occur as a whole word,
265-
but do so respecting the fact that strings starting or ending
266-
with non-word characters will change word boundaries.
267-
"""
268-
# we can't use \b as it chokes on unicode. however \W seems to be okay
269-
# as shorthand for [^0-9A-Za-z_].
270-
return r"(^|\W)%s(\W|$)" % (r,)
271-
272-
273224
def _flatten_dict(
274225
d: Union[EventBase, dict],
275226
prefix: Optional[List[str]] = None,

synapse/util/__init__.py

Lines changed: 47 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import json
1616
import logging
1717
import re
18+
from typing import Pattern
1819

1920
import attr
2021
from frozendict import frozendict
@@ -26,6 +27,9 @@
2627
logger = logging.getLogger(__name__)
2728

2829

30+
_WILDCARD_RUN = re.compile(r"([\?\*]+)")
31+
32+
2933
def _reject_invalid_json(val):
3034
"""Do not allow Infinity, -Infinity, or NaN values in JSON."""
3135
raise ValueError("Invalid JSON value: '%s'" % val)
@@ -158,25 +162,54 @@ def log_failure(failure, msg, consumeErrors=True):
158162
return failure
159163

160164

161-
def glob_to_regex(glob):
165+
def glob_to_regex(glob: str, word_boundary: bool = False) -> Pattern:
162166
"""Converts a glob to a compiled regex object.
163167
164-
The regex is anchored at the beginning and end of the string.
165-
166168
Args:
167-
glob (str)
169+
glob: pattern to match
170+
word_boundary: If True, the pattern will be allowed to match at word boundaries
171+
anywhere in the string. Otherwise, the pattern is anchored at the start and
172+
end of the string.
168173
169174
Returns:
170-
re.RegexObject
175+
compiled regex pattern
171176
"""
172-
res = ""
173-
for c in glob:
174-
if c == "*":
175-
res = res + ".*"
176-
elif c == "?":
177-
res = res + "."
177+
178+
# Patterns with wildcards must be simplified to avoid performance cliffs
179+
# - The glob `?**?**?` is equivalent to the glob `???*`
180+
# - The glob `???*` is equivalent to the regex `.{3,}`
181+
chunks = []
182+
for chunk in _WILDCARD_RUN.split(glob):
183+
# No wildcards? re.escape()
184+
if not _WILDCARD_RUN.match(chunk):
185+
chunks.append(re.escape(chunk))
186+
continue
187+
188+
# Wildcards? Simplify.
189+
qmarks = chunk.count("?")
190+
if "*" in chunk:
191+
chunks.append(".{%d,}" % qmarks)
178192
else:
179-
res = res + re.escape(c)
193+
chunks.append(".{%d}" % qmarks)
194+
195+
res = "".join(chunks)
180196

181-
# \A anchors at start of string, \Z at end of string
182-
return re.compile(r"\A" + res + r"\Z", re.IGNORECASE)
197+
if word_boundary:
198+
res = re_word_boundary(res)
199+
else:
200+
# \A anchors at start of string, \Z at end of string
201+
res = r"\A" + res + r"\Z"
202+
203+
return re.compile(res, re.IGNORECASE)
204+
205+
206+
def re_word_boundary(r: str) -> str:
207+
"""
208+
Adds word boundary characters to the start and end of an
209+
expression to require that the match occur as a whole word,
210+
but do so respecting the fact that strings starting or ending
211+
with non-word characters will change word boundaries.
212+
"""
213+
# we can't use \b as it chokes on unicode. however \W seems to be okay
214+
# as shorthand for [^0-9A-Za-z_].
215+
return r"(^|\W)%s(\W|$)" % (r,)

tests/federation/test_federation_server.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,25 @@ def test_block_ip_literals(self):
7474
self.assertFalse(server_matches_acl_event("[1:2::]", e))
7575
self.assertTrue(server_matches_acl_event("1:2:3:4", e))
7676

77+
def test_wildcard_matching(self):
78+
e = _create_acl_event({"allow": ["good*.com"]})
79+
self.assertTrue(
80+
server_matches_acl_event("good.com", e),
81+
"* matches 0 characters",
82+
)
83+
self.assertTrue(
84+
server_matches_acl_event("GOOD.COM", e),
85+
"pattern is case-insensitive",
86+
)
87+
self.assertTrue(
88+
server_matches_acl_event("good.aa.com", e),
89+
"* matches several characters, including '.'",
90+
)
91+
self.assertFalse(
92+
server_matches_acl_event("ishgood.com", e),
93+
"pattern does not allow prefixes",
94+
)
95+
7796

7897
class StateQueryTests(unittest.FederatingHomeserverTestCase):
7998

tests/push/test_push_rule_evaluator.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
from typing import Any, Dict
16+
1517
from synapse.api.room_versions import RoomVersions
1618
from synapse.events import FrozenEvent
1719
from synapse.push import push_rule_evaluator
@@ -66,6 +68,170 @@ def test_display_name(self):
6668
# A display name with spaces should work fine.
6769
self.assertTrue(evaluator.matches(condition, "@user:test", "foo bar"))
6870

71+
def _assert_matches(
72+
self, condition: Dict[str, Any], content: Dict[str, Any], msg=None
73+
) -> None:
74+
evaluator = self._get_evaluator(content)
75+
self.assertTrue(evaluator.matches(condition, "@user:test", "display_name"), msg)
76+
77+
def _assert_not_matches(
78+
self, condition: Dict[str, Any], content: Dict[str, Any], msg=None
79+
) -> None:
80+
evaluator = self._get_evaluator(content)
81+
self.assertFalse(
82+
evaluator.matches(condition, "@user:test", "display_name"), msg
83+
)
84+
85+
def test_event_match_body(self):
86+
"""Check that event_match conditions on content.body work as expected"""
87+
88+
# if the key is `content.body`, the pattern matches substrings.
89+
90+
# non-wildcards should match
91+
condition = {
92+
"kind": "event_match",
93+
"key": "content.body",
94+
"pattern": "foobaz",
95+
}
96+
self._assert_matches(
97+
condition,
98+
{"body": "aaa FoobaZ zzz"},
99+
"patterns should match and be case-insensitive",
100+
)
101+
self._assert_not_matches(
102+
condition,
103+
{"body": "aa xFoobaZ yy"},
104+
"pattern should only match at word boundaries",
105+
)
106+
self._assert_not_matches(
107+
condition,
108+
{"body": "aa foobazx yy"},
109+
"pattern should only match at word boundaries",
110+
)
111+
112+
# wildcards should match
113+
condition = {
114+
"kind": "event_match",
115+
"key": "content.body",
116+
"pattern": "f?o*baz",
117+
}
118+
119+
self._assert_matches(
120+
condition,
121+
{"body": "aaa FoobarbaZ zzz"},
122+
"* should match string and pattern should be case-insensitive",
123+
)
124+
self._assert_matches(
125+
condition, {"body": "aa foobaz yy"}, "* should match 0 characters"
126+
)
127+
self._assert_not_matches(
128+
condition, {"body": "aa fobbaz yy"}, "? should not match 0 characters"
129+
)
130+
self._assert_not_matches(
131+
condition, {"body": "aa fiiobaz yy"}, "? should not match 2 characters"
132+
)
133+
self._assert_not_matches(
134+
condition,
135+
{"body": "aa xfooxbaz yy"},
136+
"pattern should only match at word boundaries",
137+
)
138+
self._assert_not_matches(
139+
condition,
140+
{"body": "aa fooxbazx yy"},
141+
"pattern should only match at word boundaries",
142+
)
143+
144+
# test backslashes
145+
condition = {
146+
"kind": "event_match",
147+
"key": "content.body",
148+
"pattern": r"f\oobaz",
149+
}
150+
self._assert_matches(
151+
condition,
152+
{"body": r"F\oobaz"},
153+
"backslash should match itself",
154+
)
155+
condition = {
156+
"kind": "event_match",
157+
"key": "content.body",
158+
"pattern": r"f\?obaz",
159+
}
160+
self._assert_matches(
161+
condition,
162+
{"body": r"F\oobaz"},
163+
r"? after \ should match any character",
164+
)
165+
166+
def test_event_match_non_body(self):
167+
"""Check that event_match conditions on other keys work as expected"""
168+
169+
# if the key is anything other than 'content.body', the pattern must match the
170+
# whole value.
171+
172+
# non-wildcards should match
173+
condition = {
174+
"kind": "event_match",
175+
"key": "content.value",
176+
"pattern": "foobaz",
177+
}
178+
self._assert_matches(
179+
condition,
180+
{"value": "FoobaZ"},
181+
"patterns should match and be case-insensitive",
182+
)
183+
self._assert_not_matches(
184+
condition,
185+
{"value": "xFoobaZ"},
186+
"pattern should only match at the start/end of the value",
187+
)
188+
self._assert_not_matches(
189+
condition,
190+
{"value": "FoobaZz"},
191+
"pattern should only match at the start/end of the value",
192+
)
193+
194+
# wildcards should match
195+
condition = {
196+
"kind": "event_match",
197+
"key": "content.value",
198+
"pattern": "f?o*baz",
199+
}
200+
self._assert_matches(
201+
condition,
202+
{"value": "FoobarbaZ"},
203+
"* should match string and pattern should be case-insensitive",
204+
)
205+
self._assert_matches(
206+
condition, {"value": "foobaz"}, "* should match 0 characters"
207+
)
208+
self._assert_not_matches(
209+
condition, {"value": "fobbaz"}, "? should not match 0 characters"
210+
)
211+
self._assert_not_matches(
212+
condition, {"value": "fiiobaz"}, "? should not match 2 characters"
213+
)
214+
self._assert_not_matches(
215+
condition,
216+
{"value": "xfooxbaz"},
217+
"pattern should only match at the start/end of the value",
218+
)
219+
self._assert_not_matches(
220+
condition,
221+
{"value": "fooxbazx"},
222+
"pattern should only match at the start/end of the value",
223+
)
224+
self._assert_not_matches(
225+
condition,
226+
{"value": "x\nfooxbaz"},
227+
"pattern should not match after a newline",
228+
)
229+
self._assert_not_matches(
230+
condition,
231+
{"value": "fooxbaz\nx"},
232+
"pattern should not match before a newline",
233+
)
234+
69235
def test_no_body(self):
70236
"""Not having a body shouldn't break the evaluator."""
71237
evaluator = self._get_evaluator({})

0 commit comments

Comments
 (0)