Skip to content
This repository was archived by the owner on Apr 26, 2024. It is now read-only.

Commit f7768f6

Browse files
authored
Avoid storing URL cache files in storage providers (#10911)
URL cache files are short-lived and it does not make sense to offload them (e.g. to the cloud) or back them up.
1 parent 6c83c27 commit f7768f6

File tree

6 files changed

+154
-6
lines changed

6 files changed

+154
-6
lines changed

changelog.d/10911.bugfix

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Avoid storing URL cache files in storage providers. Server admins may safely delete the `url_cache/` and `url_cache_thumbnails/` directories from any configured storage providers to reclaim space.

docs/upgrade.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ process, for example:
8585
dpkg -i matrix-synapse-py3_1.3.0+stretch1_amd64.deb
8686
```
8787

88+
# Upgrading to v1.44.0
89+
90+
## The URL preview cache is no longer mirrored to storage providers
91+
The `url_cache/` and `url_cache_thumbnails/` directories in the media store are
92+
no longer mirrored to storage providers. These two directories can be safely
93+
deleted from any configured storage providers to reclaim space.
94+
8895
# Upgrading to v1.43.0
8996

9097
## The spaces summary APIs can now be handled by workers

synapse/rest/media/v1/filepath.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,23 +195,24 @@ def url_cache_thumbnail_rel(
195195

196196
url_cache_thumbnail = _wrap_in_base_path(url_cache_thumbnail_rel)
197197

198-
def url_cache_thumbnail_directory(self, media_id: str) -> str:
198+
def url_cache_thumbnail_directory_rel(self, media_id: str) -> str:
199199
# Media id is of the form <DATE><RANDOM_STRING>
200200
# E.g.: 2017-09-28-fsdRDt24DS234dsf
201201

202202
if NEW_FORMAT_ID_RE.match(media_id):
203-
return os.path.join(
204-
self.base_path, "url_cache_thumbnails", media_id[:10], media_id[11:]
205-
)
203+
return os.path.join("url_cache_thumbnails", media_id[:10], media_id[11:])
206204
else:
207205
return os.path.join(
208-
self.base_path,
209206
"url_cache_thumbnails",
210207
media_id[0:2],
211208
media_id[2:4],
212209
media_id[4:],
213210
)
214211

212+
url_cache_thumbnail_directory = _wrap_in_base_path(
213+
url_cache_thumbnail_directory_rel
214+
)
215+
215216
def url_cache_thumbnail_dirs_to_delete(self, media_id: str) -> List[str]:
216217
"The dirs to try and remove if we delete the media_id thumbnails"
217218
# Media id is of the form <DATE><RANDOM_STRING>

synapse/rest/media/v1/preview_url_resource.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,6 @@ def _start_expire_url_cache_data(self) -> Deferred:
485485

486486
async def _expire_url_cache_data(self) -> None:
487487
"""Clean up expired url cache content, media and thumbnails."""
488-
# TODO: Delete from backup media store
489488

490489
assert self._worker_run_media_background_jobs
491490

synapse/rest/media/v1/storage_provider.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ async def store_file(self, path: str, file_info: FileInfo) -> None:
9393
if file_info.server_name and not self.store_remote:
9494
return None
9595

96+
if file_info.url_cache:
97+
# The URL preview cache is short lived and not worth offloading or
98+
# backing up.
99+
return None
100+
96101
if self.store_synchronous:
97102
# store_file is supposed to return an Awaitable, but guard
98103
# against improper implementations.
@@ -110,6 +115,11 @@ async def store() -> None:
110115
run_in_background(store)
111116

112117
async def fetch(self, path: str, file_info: FileInfo) -> Optional[Responder]:
118+
if file_info.url_cache:
119+
# Files in the URL preview cache definitely aren't stored here,
120+
# so avoid any potentially slow I/O or network access.
121+
return None
122+
113123
# fetch is supposed to return an Awaitable, but guard
114124
# against improper implementations.
115125
return await maybe_awaitable(self.backend.fetch(path, file_info))

tests/rest/media/v1/test_url_preview.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from twisted.test.proto_helpers import AccumulatingProtocol
2222

2323
from synapse.config.oembed import OEmbedEndpointConfig
24+
from synapse.util.stringutils import parse_and_validate_mxc_uri
2425

2526
from tests import unittest
2627
from tests.server import FakeTransport
@@ -721,3 +722,132 @@ def test_oembed_format(self):
721722
"og:description": "Content Preview",
722723
},
723724
)
725+
726+
def _download_image(self):
727+
"""Downloads an image into the URL cache.
728+
729+
Returns:
730+
A (host, media_id) tuple representing the MXC URI of the image.
731+
"""
732+
self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
733+
734+
channel = self.make_request(
735+
"GET",
736+
"preview_url?url=http://cdn.twitter.com/matrixdotorg",
737+
shorthand=False,
738+
await_result=False,
739+
)
740+
self.pump()
741+
742+
client = self.reactor.tcpClients[0][2].buildProtocol(None)
743+
server = AccumulatingProtocol()
744+
server.makeConnection(FakeTransport(client, self.reactor))
745+
client.makeConnection(FakeTransport(server, self.reactor))
746+
client.dataReceived(
747+
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\nContent-Type: image/png\r\n\r\n"
748+
% (len(SMALL_PNG),)
749+
+ SMALL_PNG
750+
)
751+
752+
self.pump()
753+
self.assertEqual(channel.code, 200)
754+
body = channel.json_body
755+
mxc_uri = body["og:image"]
756+
host, _port, media_id = parse_and_validate_mxc_uri(mxc_uri)
757+
self.assertIsNone(_port)
758+
return host, media_id
759+
760+
def test_storage_providers_exclude_files(self):
761+
"""Test that files are not stored in or fetched from storage providers."""
762+
host, media_id = self._download_image()
763+
764+
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
765+
media_store_path = os.path.join(self.media_store_path, rel_file_path)
766+
storage_provider_path = os.path.join(self.storage_path, rel_file_path)
767+
768+
# Check storage
769+
self.assertTrue(os.path.isfile(media_store_path))
770+
self.assertFalse(
771+
os.path.isfile(storage_provider_path),
772+
"URL cache file was unexpectedly stored in a storage provider",
773+
)
774+
775+
# Check fetching
776+
channel = self.make_request(
777+
"GET",
778+
f"download/{host}/{media_id}",
779+
shorthand=False,
780+
await_result=False,
781+
)
782+
self.pump()
783+
self.assertEqual(channel.code, 200)
784+
785+
# Move cached file into the storage provider
786+
os.makedirs(os.path.dirname(storage_provider_path), exist_ok=True)
787+
os.rename(media_store_path, storage_provider_path)
788+
789+
channel = self.make_request(
790+
"GET",
791+
f"download/{host}/{media_id}",
792+
shorthand=False,
793+
await_result=False,
794+
)
795+
self.pump()
796+
self.assertEqual(
797+
channel.code,
798+
404,
799+
"URL cache file was unexpectedly retrieved from a storage provider",
800+
)
801+
802+
def test_storage_providers_exclude_thumbnails(self):
803+
"""Test that thumbnails are not stored in or fetched from storage providers."""
804+
host, media_id = self._download_image()
805+
806+
rel_thumbnail_path = (
807+
self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id)
808+
)
809+
media_store_thumbnail_path = os.path.join(
810+
self.media_store_path, rel_thumbnail_path
811+
)
812+
storage_provider_thumbnail_path = os.path.join(
813+
self.storage_path, rel_thumbnail_path
814+
)
815+
816+
# Check storage
817+
self.assertTrue(os.path.isdir(media_store_thumbnail_path))
818+
self.assertFalse(
819+
os.path.isdir(storage_provider_thumbnail_path),
820+
"URL cache thumbnails were unexpectedly stored in a storage provider",
821+
)
822+
823+
# Check fetching
824+
channel = self.make_request(
825+
"GET",
826+
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
827+
shorthand=False,
828+
await_result=False,
829+
)
830+
self.pump()
831+
self.assertEqual(channel.code, 200)
832+
833+
# Remove the original, otherwise thumbnails will regenerate
834+
rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id)
835+
media_store_path = os.path.join(self.media_store_path, rel_file_path)
836+
os.remove(media_store_path)
837+
838+
# Move cached thumbnails into the storage provider
839+
os.makedirs(os.path.dirname(storage_provider_thumbnail_path), exist_ok=True)
840+
os.rename(media_store_thumbnail_path, storage_provider_thumbnail_path)
841+
842+
channel = self.make_request(
843+
"GET",
844+
f"thumbnail/{host}/{media_id}?width=32&height=32&method=scale",
845+
shorthand=False,
846+
await_result=False,
847+
)
848+
self.pump()
849+
self.assertEqual(
850+
channel.code,
851+
404,
852+
"URL cache thumbnail was unexpectedly retrieved from a storage provider",
853+
)

0 commit comments

Comments
 (0)