Skip to content

Commit c52714b

Browse files
authored
Merge pull request #503 from python-jsonschema/fix-caching
Fix caching behavior to always use hashed URLs for the cache filenames
2 parents 85aa1cc + 6e94302 commit c52714b

File tree

8 files changed

+101
-104
lines changed

8 files changed

+101
-104
lines changed

docs/usage.rst

+4-3
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,10 @@ Downloading and Caching
118118
By default, when ``--schemafile`` is used to refer to an ``http://`` or
119119
``https://`` location, the schema is downloaded and cached based on the
120120
schema's Last-Modified time.
121+
122+
Additionally, when ``$ref``\s are looked up during schema resolution, they are
123+
similarly cached.
124+
121125
The following options control caching behaviors.
122126

123127
.. list-table:: Caching Options
@@ -128,9 +132,6 @@ The following options control caching behaviors.
128132
- Description
129133
* - ``--no-cache``
130134
- Disable caching.
131-
* - ``--cache-filename``
132-
- The name to use for caching a remote schema.
133-
Defaults to using the last slash-delimited part of the URI.
134135

135136
"format" Validation Options
136137
---------------------------

src/check_jsonschema/cachedownloader.py

+26-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import contextlib
4+
import hashlib
45
import io
56
import os
67
import platform
@@ -33,7 +34,7 @@ def _base_cache_dir() -> str | None:
3334
return cache_dir
3435

3536

36-
def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
37+
def _resolve_cache_dir(dirname: str) -> str | None:
3738
cache_dir = _base_cache_dir()
3839
if cache_dir:
3940
cache_dir = os.path.join(cache_dir, "check_jsonschema", dirname)
@@ -95,18 +96,32 @@ def _cache_hit(cachefile: str, response: requests.Response) -> bool:
9596
return local_mtime >= remote_mtime
9697

9798

99+
def url_to_cache_filename(ref_url: str) -> str:
    """
    Given a schema URL, convert it to a filename for caching in a cache dir.

    Rules are as follows:
    - the base filename is a sha256 hash of the complete URL, query string and
      fragment included, so URLs which differ only there get different names
    - if the last slash-delimited part of the URL, with any ``?query`` and
      ``#fragment`` removed, ends in an extension (.json, .yaml, etc) that
      extension is appended to the hash

    The query string and fragment are ignored when looking for the extension so
    that ``schema.json?version=1.0`` is cached as ``<hash>.json`` rather than
    ``<hash>.0``, and so that characters such as ``?`` never end up in the
    filename.

    Preserving file extensions preserves the extension-based logic used for parsing, and
    it also helps a local editor (browsing the cache) identify filetypes.
    """
    filename = hashlib.sha256(ref_url.encode()).hexdigest()

    # In a URL the fragment starts at the first "#" and the query at the first
    # "?" before it; dropping both leaves scheme, authority, and path.
    without_fragment = ref_url.partition("#")[0]
    without_query = without_fragment.partition("?")[0]
    last_part = without_query.rpartition("/")[-1]

    # rpartition yields an empty separator when there is no "." at all, and an
    # empty extension when the name merely ends in "."; skip both cases.
    _, dot, extension = last_part.rpartition(".")
    if dot and extension:
        filename = f"{filename}.{extension}"
    return filename
116+
117+
98118
class FailedDownloadError(Exception):
99119
pass
100120

101121

102122
class CacheDownloader:
103-
def __init__(
104-
self, cache_dir: str | None = None, disable_cache: bool = False
105-
) -> None:
106-
if cache_dir is None:
107-
self._cache_dir = _resolve_cache_dir()
108-
else:
109-
self._cache_dir = _resolve_cache_dir(cache_dir)
123+
def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
124+
self._cache_dir = _resolve_cache_dir(cache_dir)
110125
self._disable_cache = disable_cache
111126

112127
def _download(
@@ -160,21 +175,21 @@ def bind(
160175
validation_callback: t.Callable[[bytes], t.Any] | None = None,
161176
) -> BoundCacheDownloader:
162177
return BoundCacheDownloader(
163-
file_url, filename, self, validation_callback=validation_callback
178+
file_url, self, filename=filename, validation_callback=validation_callback
164179
)
165180

166181

167182
class BoundCacheDownloader:
168183
def __init__(
169184
self,
170185
file_url: str,
171-
filename: str | None,
172186
downloader: CacheDownloader,
173187
*,
188+
filename: str | None = None,
174189
validation_callback: t.Callable[[bytes], t.Any] | None = None,
175190
) -> None:
176191
self._file_url = file_url
177-
self._filename = filename or file_url.split("/")[-1]
192+
self._filename = filename or url_to_cache_filename(file_url)
178193
self._downloader = downloader
179194
self._validation_callback = validation_callback
180195

src/check_jsonschema/cli/main_command.py

+1-8
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,7 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
130130
help="Disable schema caching. Always download remote schemas.",
131131
)
132132
@click.option(
133-
"--cache-filename",
134-
help=(
135-
"The name to use for caching a remote schema. "
136-
"Defaults to the last slash-delimited part of the URI."
137-
),
133+
"--cache-filename", help="Deprecated. This option no longer has any effect."
138134
)
139135
@click.option(
140136
"--disable-formats",
@@ -271,8 +267,6 @@ def main(
271267
args.disable_cache = no_cache
272268
args.default_filetype = default_filetype
273269
args.fill_defaults = fill_defaults
274-
if cache_filename is not None:
275-
args.cache_filename = cache_filename
276270
if data_transform is not None:
277271
args.data_transform = TRANSFORM_LIBRARY[data_transform]
278272

@@ -300,7 +294,6 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
300294
assert args.schema_path is not None
301295
return SchemaLoader(
302296
args.schema_path,
303-
cache_filename=args.cache_filename,
304297
disable_cache=args.disable_cache,
305298
base_uri=args.base_uri,
306299
validator_class=args.validator_class,

src/check_jsonschema/schema_loader/main.py

+1-7
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,12 @@ def __init__(
6464
self,
6565
schemafile: str,
6666
*,
67-
cache_filename: str | None = None,
6867
base_uri: str | None = None,
6968
validator_class: type[jsonschema.protocols.Validator] | None = None,
7069
disable_cache: bool = True,
7170
) -> None:
7271
# record input parameters (these are not to be modified)
7372
self.schemafile = schemafile
74-
self.cache_filename = cache_filename
7573
self.disable_cache = disable_cache
7674
self.base_uri = base_uri
7775
self.validator_class = validator_class
@@ -105,11 +103,7 @@ def _get_schema_reader(
105103
return LocalSchemaReader(self.schemafile)
106104

107105
if self.url_info.scheme in ("http", "https"):
108-
return HttpSchemaReader(
109-
self.schemafile,
110-
self.cache_filename,
111-
self.disable_cache,
112-
)
106+
return HttpSchemaReader(self.schemafile, self.disable_cache)
113107
else:
114108
raise UnsupportedUrlScheme(
115109
"check-jsonschema only supports http, https, and local files. "

src/check_jsonschema/schema_loader/readers.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -73,14 +73,13 @@ class HttpSchemaReader:
7373
def __init__(
7474
self,
7575
url: str,
76-
cache_filename: str | None,
7776
disable_cache: bool,
7877
) -> None:
7978
self.url = url
8079
self.parsers = ParserSet()
81-
self.downloader = CacheDownloader(
82-
disable_cache=disable_cache,
83-
).bind(url, cache_filename, validation_callback=self._parse)
80+
self.downloader = CacheDownloader("schemas", disable_cache=disable_cache).bind(
81+
url, validation_callback=self._parse
82+
)
8483
self._parsed_schema: dict | _UnsetType = _UNSET
8584

8685
def _parse(self, schema_bytes: bytes) -> t.Any:

src/check_jsonschema/schema_loader/resolver.py

+2-20
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import hashlib
43
import typing as t
54
import urllib.parse
65

@@ -12,21 +11,6 @@
1211
from ..utils import filename2path
1312

1413

15-
def ref_url_to_cache_filename(ref_url: str) -> str:
16-
"""
17-
Given a $ref URL, convert it to the filename in the refs/ cache dir.
18-
Rules are as follows:
19-
- the base filename is an md5 hash of the URL
20-
- if the filename ends in an extension (.json, .yaml, etc) that extension
21-
is appended to the hash
22-
"""
23-
filename = hashlib.md5(ref_url.encode()).hexdigest()
24-
if "." in (last_part := ref_url.rpartition("/")[-1]):
25-
_, _, extension = last_part.rpartition(".")
26-
filename = f"{filename}.{extension}"
27-
return filename
28-
29-
3014
def make_reference_registry(
3115
parsers: ParserSet, retrieval_uri: str | None, schema: dict, disable_cache: bool
3216
) -> referencing.Registry:
@@ -66,7 +50,7 @@ def create_retrieve_callable(
6650
base_uri = retrieval_uri
6751

6852
cache = ResourceCache()
69-
downloader = CacheDownloader("refs", disable_cache)
53+
downloader = CacheDownloader("refs", disable_cache=disable_cache)
7054

7155
def get_local_file(uri: str) -> t.Any:
7256
path = filename2path(uri)
@@ -89,9 +73,7 @@ def validation_callback(content: bytes) -> None:
8973
parser_set.parse_data_with_path(content, full_uri, "json")
9074

9175
bound_downloader = downloader.bind(
92-
full_uri,
93-
ref_url_to_cache_filename(full_uri),
94-
validation_callback,
76+
full_uri, validation_callback=validation_callback
9577
)
9678
with bound_downloader.open() as fp:
9779
data = fp.read()

tests/conftest.py

+16-8
Original file line numberDiff line numberDiff line change
@@ -62,15 +62,25 @@ def patch_cache_dir(monkeypatch, cache_dir):
6262
yield m
6363

6464

65+
@pytest.fixture
def url2cachepath():
    """
    Provide a helper which maps a cache directory and a URL to a cache file path.

    The filename is computed by ``url_to_cache_filename`` and joined onto
    ``cache_dir`` with the ``/`` operator.
    """
    from check_jsonschema.cachedownloader import url_to_cache_filename

    def _cache_path(cache_dir, url):
        cached_name = url_to_cache_filename(url)
        return cache_dir / cached_name

    return _cache_path
73+
74+
6575
@pytest.fixture
6676
def downloads_cache_dir(tmp_path):
6777
return tmp_path / ".cache" / "check_jsonschema" / "downloads"
6878

6979

7080
@pytest.fixture
71-
def get_download_cache_loc(downloads_cache_dir):
72-
def _get(uri):
73-
return downloads_cache_dir / uri.split("/")[-1]
81+
def get_download_cache_loc(downloads_cache_dir, url2cachepath):
82+
def _get(url):
83+
return url2cachepath(downloads_cache_dir, url)
7484

7585
return _get
7686

@@ -94,11 +104,9 @@ def refs_cache_dir(tmp_path):
94104

95105

96106
@pytest.fixture
97-
def get_ref_cache_loc(refs_cache_dir):
98-
from check_jsonschema.schema_loader.resolver import ref_url_to_cache_filename
99-
100-
def _get(uri):
101-
return refs_cache_dir / ref_url_to_cache_filename(uri)
107+
def get_ref_cache_loc(refs_cache_dir, url2cachepath):
108+
def _get(url):
109+
return url2cachepath(refs_cache_dir, url)
102110

103111
return _get
104112

0 commit comments

Comments
 (0)