Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error. #15079

Merged
merged 9 commits into from
Feb 20, 2023
1 change: 1 addition & 0 deletions changelog.d/15079.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix a bug introduced in Synapse v1.74.0 where searching with colons when using ICU for search term tokenisation would fail with an error.
24 changes: 20 additions & 4 deletions synapse/storage/databases/main/user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,19 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something
that is supported by default.
"""
results = _parse_words(search_term)
escaped_words = []
for word in _parse_words(search_term):
# Postgres tsvector and tsquery quoting rules:
# words potentially containing punctuation should be quoted
# and then existing quotes and backslashes should be doubled
# See: https://www.postgresql.org/docs/current/datatype-textsearch.html#DATATYPE-TSQUERY

quoted_word = word.replace("'", "''").replace("\\", "\\\\")
escaped_words.append(f"'{quoted_word}'")

both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results)
both = " & ".join("(%s:* | %s)" % (word, word) for word in escaped_words)
exact = " & ".join("%s" % (word,) for word in escaped_words)
prefix = " & ".join("%s:*" % (word,) for word in escaped_words)

return both, exact, prefix

Expand All @@ -944,6 +952,14 @@ def _parse_words(search_term: str) -> List[str]:
if USE_ICU:
return _parse_words_with_icu(search_term)

return _parse_words_with_regex(search_term)


def _parse_words_with_regex(search_term: str) -> List[str]:
"""
Break down search term into words, when we don't have ICU available.
See: `_parse_words`
"""
return re.findall(r"([\w\-]+)", search_term, re.UNICODE)


Expand Down
7 changes: 7 additions & 0 deletions tests/handlers/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,13 @@ def test_excludes_appservice_sender(self) -> None:
self.helper.join(room, self.appservice.sender, tok=self.appservice.token)
self._check_only_one_user_in_directory(user, room)

def test_search_term_with_colon_in_it_does_not_raise(self) -> None:
"""
Regression test: Test that search terms with colons in them are acceptable.
"""
u1 = self.register_user("user1", "pass")
self.get_success(self.handler.search_users(u1, "haha:paamayim-nekudotayim", 10))

def test_user_not_in_users_table(self) -> None:
"""Unclear how it happens, but on matrix.org we've seen join events
for users who aren't in the users table. Test that we don't fall over
Expand Down
63 changes: 62 additions & 1 deletion tests/storage/test_user_directory.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
from synapse.server import HomeServer
from synapse.storage import DataStore
from synapse.storage.background_updates import _BackgroundUpdateHandler
from synapse.storage.databases.main import user_directory
from synapse.storage.databases.main.user_directory import (
_parse_words_with_icu,
_parse_words_with_regex,
)
from synapse.storage.roommember import ProfileInfo
from synapse.util import Clock

Expand All @@ -42,7 +47,7 @@
BOB = "@bob:b"
BOBBY = "@bobby:a"
# The localpart isn't 'Bela' on purpose so we can test looking up display names.
BELA = "@somenickname:a"
BELA = "@somenickname:example.org"


class GetUserDirectoryTables:
Expand Down Expand Up @@ -423,6 +428,8 @@ async def mocked_process_users(*args: Any, **kwargs: Any) -> int:


class UserDirectoryStoreTestCase(HomeserverTestCase):
use_icu = False

def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.store = hs.get_datastores().main

Expand All @@ -434,6 +441,12 @@ def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
self.get_success(self.store.update_profile_in_user_dir(BELA, "Bela", None))
self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE, BOB)))

self._restore_use_icu = user_directory.USE_ICU
user_directory.USE_ICU = self.use_icu

def tearDown(self) -> None:
user_directory.USE_ICU = self._restore_use_icu

def test_search_user_dir(self) -> None:
# normally when alice searches the directory she should just find
# bob because bobby doesn't share a room with her.
Expand Down Expand Up @@ -478,6 +491,26 @@ def test_search_user_dir_stop_words(self) -> None:
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)

@override_config({"user_directory": {"search_all_users": True}})
def test_search_user_dir_start_of_user_id(self) -> None:
"""Tests that a user can look up another user by searching for the start
of their user ID.
"""
r = self.get_success(self.store.search_user_dir(ALICE, "somenickname:exa", 10))
self.assertFalse(r["limited"])
self.assertEqual(1, len(r["results"]))
self.assertDictEqual(
r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None},
)

Comment on lines +494 to +506
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this test (and any others) running under with-ICU and without-ICU? (Should it?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

now it is :)


class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase):
use_icu = True

if not icu:
skip = "Requires PyICU"


class UserDirectoryICUTestCase(HomeserverTestCase):
if not icu:
Expand Down Expand Up @@ -513,3 +546,31 @@ def test_icu_word_boundary(self) -> None:
r["results"][0],
{"user_id": ALICE, "display_name": display_name, "avatar_url": None},
)

def test_icu_word_boundary_punctuation(self) -> None:
"""
Tests the behaviour of punctuation with the ICU tokeniser.

Seems to depend on underlying version of ICU.
"""

# Note: either tokenisation is fine, because Postgres actually splits
# words itself afterwards.
self.assertIn(
_parse_words_with_icu("lazy'fox jumped:over the.dog"),
(
# ICU 66 on Ubuntu 20.04
["lazy'fox", "jumped", "over", "the", "dog"],
# ICU 70 on Ubuntu 22.04
["lazy'fox", "jumped:over", "the.dog"],
),
)

def test_regex_word_boundary_punctuation(self) -> None:
"""
Tests the behaviour of punctuation with the non-ICU tokeniser
"""
self.assertEqual(
_parse_words_with_regex("lazy'fox jumped:over the.dog"),
["lazy", "fox", "jumped", "over", "the", "dog"],
)