From 2dba6bdcb57b7890f6c160d28e9c953c50424f34 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:02:54 +0000 Subject: [PATCH 1/8] Add function to transform display names for the user directory index Signed-off-by: Sean Quah --- .../storage/databases/main/user_directory.py | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 30af4b3b6cfe..88546317db78 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -491,6 +491,11 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None: values={"display_name": display_name, "avatar_url": avatar_url}, ) + # The display name that goes into the database index. + index_display_name = display_name + if index_display_name is not None: + index_display_name = _filter_text_for_index(index_display_name) + if isinstance(self.database_engine, PostgresEngine): # We weight the localpart most highly, then display name and finally # server name @@ -508,11 +513,15 @@ def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None: user_id, get_localpart_from_id(user_id), get_domain_from_id(user_id), - display_name, + index_display_name, ), ) elif isinstance(self.database_engine, Sqlite3Engine): - value = "%s %s" % (user_id, display_name) if display_name else user_id + value = ( + "%s %s" % (user_id, index_display_name) + if index_display_name + else user_id + ) self.db_pool.simple_upsert_txn( txn, table="user_directory_search", @@ -897,6 +906,16 @@ async def search_user_dir( return {"limited": limited, "results": results[0:limit]} +def _filter_text_for_index(text: str) -> str: + """Transforms text before it is inserted into the user directory index, or searched + for in the user directory index. + + Note that the user directory search table needs to be rebuilt whenever this function + changes. + """ + return text + + def _parse_query_sqlite(search_term: str) -> str: """Takes a plain unicode string from the user and converts it into a form that can be passed to database. @@ -906,6 +925,7 @@ def _parse_query_sqlite(search_term: str) -> str: We specifically add both a prefix and non prefix matching term so that exact matches get ranked higher. """ + search_term = _filter_text_for_index(search_term) # Pull out the individual words, discarding any non-word characters. results = _parse_words(search_term) @@ -918,6 +938,8 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]: We use this so that we can add prefix matching, which isn't something that is supported by default. """ + search_term = _filter_text_for_index(search_term) + escaped_words = [] for word in _parse_words(search_term): # Postgres tsvector and tsquery quoting rules: From 08562201e177460e500fafdbc3366bd7dad7cfb1 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:12:04 +0000 Subject: [PATCH 2/8] Fix user directory searches being case-sensitive for accented characters Signed-off-by: Sean Quah --- synapse/storage/databases/main/user_directory.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 88546317db78..e538bbf3d66b 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -913,6 +913,13 @@ def _filter_text_for_index(text: str) -> str: Note that the user directory search table needs to be rebuilt whenever this function changes. """ + # Lowercase the text, to make searches case-insensitive. + # This is necessary for both PostgreSQL and SQLite. PostgreSQL's + # `to_tsquery/to_tsvector` functions don't lowercase non-ASCII characters when using + # the "C" collation, while SQLite just doesn't lowercase non-ASCII characters at + # all. + text = text.lower() + return text From 172b32feb45d2f396231ef489043de2c58d0a0b9 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:20:18 +0000 Subject: [PATCH 3/8] Add tests for user directory search case insensitivity Signed-off-by: Sean Quah --- tests/storage/test_user_directory.py | 80 ++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 2d169684cf2e..e868d9e10b3e 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -504,6 +504,86 @@ def test_search_user_dir_start_of_user_id(self) -> None: {"user_id": BELA, "display_name": "Bela", "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_ascii_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case. + """ + CHARLIE = "@someuser:example.org" + self.get_success( + self.store.update_profile_in_user_dir(CHARLIE, "Charlie", None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, "cHARLIE", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": CHARLIE, "display_name": "Charlie", "avatar_url": None}, + ) + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_unicode_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case. + """ + IVAN = "@someuser:example.org" + self.get_success(self.store.update_profile_in_user_dir(IVAN, "Иван", None)) + + r = self.get_success(self.store.search_user_dir(ALICE, "иВАН", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": IVAN, "display_name": "Иван", "avatar_url": None}, + ) + + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name in a + different case, when their name contains dotted or dotless "i"s. + + Some languages have dotted and dotless versions of "i", which are considered to + be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the + ASCII "i" and "I" characters, despite having different lowercase / uppercase + forms. + """ + USER = "@someuser:example.org" + + expected_matches = [ + # (search_term, display_name) + # A search for "i" should match "İ". + ("iiiii", "İİİİİ"), + # A search for "I" should match "ı". + ("IIIII", "ııııı"), + # A search for "ı" should match "I". + ("ııııı", "IIIII"), + # A search for "İ" should match "i". + ("İİİİİ", "iiiii"), + ] + + for search_term, display_name in expected_matches: + self.get_success( + self.store.update_profile_in_user_dir(USER, display_name, None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10)) + self.assertFalse(r["limited"]) + self.assertEqual( + 1, + len(r["results"]), + f"searching for {search_term!r} did not match {display_name!r}", + ) + self.assertDictEqual( + r["results"][0], + {"user_id": USER, "display_name": display_name, "avatar_url": None}, + ) + + # We don't test for negative matches, to allow implementations that consider all + # the i variants to be the same. + + test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore + class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True From cdda0e0c64eb4df2024eb02e5787b3eda020dc6f Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 16:27:43 +0000 Subject: [PATCH 4/8] Add newsfile --- changelog.d/15143.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/15143.misc diff --git a/changelog.d/15143.misc b/changelog.d/15143.misc new file mode 100644 index 000000000000..cff4518811ec --- /dev/null +++ b/changelog.d/15143.misc @@ -0,0 +1 @@ +Fix a long-standing bug where the user directory search was not case-insensitive for accented characters. From dc49c1d44e1bae8a37543aece785e47403b51354 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:29:06 +0000 Subject: [PATCH 5/8] Normalize accents in user directory search Signed-off-by: Sean Quah --- synapse/storage/databases/main/user_directory.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index e538bbf3d66b..64b3ae2d8b92 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -14,6 +14,7 @@ import logging import re +import unicodedata from typing import ( TYPE_CHECKING, Iterable, @@ -920,6 +921,16 @@ def _filter_text_for_index(text: str) -> str: # all. text = text.lower() + # Normalize the text. NFKC normalization has two effects: + # 1. It canonicalizes the text, ie. maps all visually identical strings to the same + # string. For example, ["e", "◌́"] is mapped to ["é"]. + # 2. It maps strings that are roughly equivalent to the same string. + # For example, ["dž"] is mapped to ["d", "ž"], ["①"] to ["1"] and ["i⁹"] to + # ["i", "9"]. + text = unicodedata.normalize("NFKC", text) + + # Note that nothing is done to make searches accent-insensitive. + return text From 6997d4241ff98feef00437f3e7bd4b7ddfc945ba Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:30:28 +0000 Subject: [PATCH 6/8] Add test for accent normalization in user directory search Signed-off-by: Sean Quah --- tests/storage/test_user_directory.py | 30 ++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index e868d9e10b3e..576a06da8245 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -584,6 +584,36 @@ def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None: test_search_user_dir_dotted_dotless_i_case_insensitivity.skip = "not supported" # type: ignore + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_unicode_normalization(self) -> None: + """Tests that a user can look up another user by searching for their name with + either composed or decomposed accents. + """ + AMELIE = "@someuser:example.org" + + expected_matches = [ + # (search_term, display_name) + ("Ame\u0301lie", "Amélie"), + ("Amélie", "Ame\u0301lie"), + ] + + for search_term, display_name in expected_matches: + self.get_success( + self.store.update_profile_in_user_dir(AMELIE, display_name, None) + ) + + r = self.get_success(self.store.search_user_dir(ALICE, search_term, 10)) + self.assertFalse(r["limited"]) + self.assertEqual( + 1, + len(r["results"]), + f"searching for {search_term!r} did not match {display_name!r}", + ) + self.assertDictEqual( + r["results"][0], + {"user_id": AMELIE, "display_name": display_name, "avatar_url": None}, + ) + class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True From ec0c6e39be5cd4bbe446cdde4e860f0d79acb4df Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Thu, 23 Feb 2023 14:30:51 +0000 Subject: [PATCH 7/8] Add note about accent-insensitive user directory search --- .../storage/databases/main/user_directory.py | 8 +++++++ tests/storage/test_user_directory.py | 23 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/synapse/storage/databases/main/user_directory.py b/synapse/storage/databases/main/user_directory.py index 64b3ae2d8b92..277454a8dcb2 100644 --- a/synapse/storage/databases/main/user_directory.py +++ b/synapse/storage/databases/main/user_directory.py @@ -930,6 +930,14 @@ def _filter_text_for_index(text: str) -> str: text = unicodedata.normalize("NFKC", text) # Note that nothing is done to make searches accent-insensitive. + # That could be achieved by converting to NFKD form instead (with combining accents + # split out) and filtering out combining accents using `unicodedata.combining(c)`. + # The downside of this may be noisier search results, since search terms with + # explicit accents will match characters with no accents, or completely different + # accents. + # + # text = unicodedata.normalize("NFKD", text) + # text = "".join([c for c in text if not unicodedata.combining(c)]) return text diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 576a06da8245..2c157bbc2f90 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -614,6 +614,29 @@ def test_search_user_dir_unicode_normalization(self) -> None: {"user_id": AMELIE, "display_name": display_name, "avatar_url": None}, ) + @override_config({"user_directory": {"search_all_users": True}}) + def test_search_user_dir_accent_insensitivity(self) -> None: + """Tests that a user can look up another user by searching for their name + without any accents. + """ + AMELIE = "@someuser:example.org" + self.get_success(self.store.update_profile_in_user_dir(AMELIE, "Amélie", None)) + + r = self.get_success(self.store.search_user_dir(ALICE, "amelie", 10)) + self.assertFalse(r["limited"]) + self.assertEqual(1, len(r["results"])) + self.assertDictEqual( + r["results"][0], + {"user_id": AMELIE, "display_name": "Amélie", "avatar_url": None}, + ) + + # It may be desirable for "é"s in search terms to not match plain "e"s and we + # really don't want "é"s in search terms to match "e"s with different accents. + # But we don't test for this to allow implementations that consider all + # "e"-lookalikes to be the same. + + test_search_user_dir_accent_insensitivity.skip = "not supported yet" # type: ignore + class UserDirectoryStoreTestCaseWithIcu(UserDirectoryStoreTestCase): use_icu = True From f5f3e61694542f7174702d5df173945eff5cb2b4 Mon Sep 17 00:00:00 2001 From: Sean Quah Date: Fri, 24 Feb 2023 12:48:44 +0000 Subject: [PATCH 8/8] fixup comment --- tests/storage/test_user_directory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/storage/test_user_directory.py b/tests/storage/test_user_directory.py index 2c157bbc2f90..43b724c4ddd7 100644 --- a/tests/storage/test_user_directory.py +++ b/tests/storage/test_user_directory.py @@ -545,7 +545,7 @@ def test_search_user_dir_dotted_dotless_i_case_insensitivity(self) -> None: Some languages have dotted and dotless versions of "i", which are considered to be different letters: i <-> İ, ı <-> I. To make things difficult, they reuse the - ASCII "i" and "I" characters, despite having different lowercase / uppercase + ASCII "i" and "I" code points, despite having different lowercase / uppercase forms. """ USER = "@someuser:example.org"