This repository was archived by the owner on Apr 26, 2024. It is now read-only.
-
-
Notifications
You must be signed in to change notification settings
Fork 2.1k
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. #15079
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
0ce55f5
Add failing test
reivilibre
b312e6d
Small rename of variables
reivilibre
2085a40
Add minitests to characterise how we do tokenisation today
reivilibre
bc5c511
Add escaping to our tsquery builder
reivilibre
a19355b
Test that we can find a user by prefix of MXID
reivilibre
30c722a
Newsfile
reivilibre
68d3c64
Loosen test to accept at least 2 versions of ICU
reivilibre
e5c2c78
Update tests/handlers/test_user_directory.py
reivilibre
ef37452
Let the tests run on both ICU and non-ICU
reivilibre
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,11 @@ | |
from synapse.server import HomeServer | ||
from synapse.storage import DataStore | ||
from synapse.storage.background_updates import _BackgroundUpdateHandler | ||
from synapse.storage.databases.main import user_directory | ||
from synapse.storage.databases.main.user_directory import ( | ||
_parse_words_with_icu, | ||
_parse_words_with_regex, | ||
) | ||
from synapse.storage.roommember import ProfileInfo | ||
from synapse.util import Clock | ||
|
||
|
@@ -42,7 +47,7 @@ | |
BOB = "@bob:b" | ||
BOBBY = "@bobby:a" | ||
# The localpart isn't 'Bela' on purpose so we can test looking up display names. | ||
BELA = "@somenickname:a" | ||
BELA = "@somenickname:example.org" | ||
|
||
|
||
class GetUserDirectoryTables: | ||
|
@@ -423,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int: | |
|
||
|
||
class UserDirectoryStoreTestCase(HomeserverTestCase): | ||
use_icu = False | ||
|
||
def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: | ||
self.store = hs.get_datastores().main | ||
|
||
|
@@ -434,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: | |
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None)) | ||
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB))) | ||
|
||
self._restore_use_icu = user_directory.USE_ICU | ||
user_directory.USE_ICU = self.use_icu | ||
|
||
def tearDown(self) -> None: | ||
user_directory.USE_ICU = self._restore_use_icu | ||
|
||
def test_search_user_dir(self) -> None: | ||
# normally when alice searches the directory she should just find | ||
# bob because bobby doesn't share a room with her. | ||
|
@@ -478,6 +491,26 @@ def test_search_user_dir_stop_words(self) -> None: | |
{"user_id": BELA, "display_name": "Bela", "avatar_url": None}, | ||
) | ||
|
||
@override_config({"user_directory": {"search_all_users": True}}) | ||
def test_search_user_dir_start_of_user_id(self) -> None: | ||
"""Tests that a user can look up another user by searching for the start | ||
of their user ID. | ||
""" | ||
r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10)) | ||
self.assertFalse(r["limited"]) | ||
self.assertEqual(1, len(r["results"])) | ||
self.assertDictEqual( | ||
r["results"][0], | ||
{"user_id": BELA, "display_name": "Bela", "avatar_url": None}, | ||
) | ||
|
||
Comment on lines +494 to +506
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this test (and any others) running both with ICU and without ICU? (Should it?) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now it is :) |
||
|
||
class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): | ||
use_icu = True | ||
|
||
if not icu: | ||
skip = "Requires PyICU" | ||
|
||
|
||
class UserDirectoryICUTestCase(HomeserverTestCase): | ||
if not icu: | ||
|
@@ -513,3 +546,31 @@ def test_icu_word_boundary(self) -> None: | |
r["results"][0], | ||
{"user_id": ALICE, "display_name": display_name, "avatar_url": None}, | ||
) | ||
|
||
def test_icu_word_boundary_punctuation(self) -> None: | ||
""" | ||
Tests the behaviour of punctuation with the ICU tokeniser. | ||
|
||
Seems to depend on underlying version of ICU. | ||
""" | ||
|
||
# Note: either tokenisation is fine, because Postgres actually splits | ||
# words itself afterwards. | ||
self.assertIn( | ||
_parse_words_with_icu("lazy'fox jumped:over the.dog"), | ||
( | ||
# ICU 66 on Ubuntu 20.04 | ||
["lazy'fox", "jumped", "over", "the", "dog"], | ||
# ICU 70 on Ubuntu 22.04 | ||
["lazy'fox", "jumped:over", "the.dog"], | ||
), | ||
) | ||
|
||
def test_regex_word_boundary_punctuation(self) -> None: | ||
""" | ||
Tests the behaviour of punctuation with the non-ICU tokeniser | ||
""" | ||
self.assertEqual( | ||
_parse_words_with_regex("lazy'fox jumped:over the.dog"), | ||
["lazy", "fox", "jumped", "over", "the", "dog"], | ||
) |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.