Skip to content

Commit 5084886

Browse files
committed
fix load_train_test() caching all data versions to same directory
improve load_train_test() progress reporting; expand load_train_test() test coverage; bump flake8 max-complexity = 16 -> 18
1 parent d491439 commit 5084886

File tree

4 files changed

+118
-58
lines changed

4 files changed

+118
-58
lines changed

.pre-commit-config.yaml

+5-5
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ default_install_hook_types: [pre-commit, commit-msg]
77

88
repos:
99
- repo: https://github.com/PyCQA/isort
10-
rev: 5.10.1
10+
rev: 5.11.4
1111
hooks:
1212
- id: isort
1313

1414
- repo: https://github.com/psf/black
15-
rev: 22.10.0
15+
rev: 22.12.0
1616
hooks:
1717
- id: black
1818

@@ -23,7 +23,7 @@ repos:
2323
additional_dependencies: [flake8-bugbear]
2424

2525
- repo: https://github.com/asottile/pyupgrade
26-
rev: v3.2.2
26+
rev: v3.3.1
2727
hooks:
2828
- id: pyupgrade
2929
args: [--py39-plus]
@@ -63,7 +63,7 @@ repos:
6363
- id: autoflake
6464

6565
- repo: https://github.com/pre-commit/mirrors-prettier
66-
rev: v3.0.0-alpha.0
66+
rev: v3.0.0-alpha.4
6767
hooks:
6868
- id: prettier
6969
args: [--write] # edit files in-place
@@ -74,7 +74,7 @@ repos:
7474
exclude: ^figures/.*$
7575

7676
- repo: https://github.com/pre-commit/mirrors-eslint
77-
rev: v8.24.0
77+
rev: v8.30.0
7878
hooks:
7979
- id: eslint
8080
types: [file]

matbench_discovery/data.py

+34-27
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
from __future__ import annotations
22

33
import os
4+
import urllib.error
45
from collections.abc import Generator, Sequence
56
from glob import glob
7+
from pathlib import Path
68
from typing import Any, Callable
79

810
import pandas as pd
@@ -46,11 +48,12 @@ def as_dict_handler(obj: Any) -> dict[str, Any] | None:
4648

4749

4850
def load_train_test(
49-
parts: str | Sequence[str] = ("summary",),
50-
version: int = 1,
51-
cache_dir: str | None = default_cache_dir,
51+
data_names: str | Sequence[str] = ("summary",),
52+
version: str = "1.0.0",
53+
cache_dir: str | Path | None = default_cache_dir,
5254
hydrate: bool = False,
53-
) -> pd.DataFrame | dict[str, pd.DataFrame]:
55+
**kwargs: Any,
56+
) -> pd.DataFrame:
5457
"""Download parts of or the full MP training data and WBM test data as pandas
5558
DataFrames. The full training and test sets are each about ~500 MB as compressed
5659
JSON which will be cached locally to cache_dir for faster re-loading unless
@@ -62,46 +65,50 @@ def load_train_test(
6265
https://matbench-discovery.janosh.dev/how-to-use for brief data descriptions.
6366
6467
Args:
65-
parts (str | list[str], optional): Which parts of the MP/WBM dataset to load.
66-
Can be any subset of the above data names. Defaults to ["summary"].
67-
version (int, optional): Which version of the dataset to load. Defaults to 1
68-
(currently the only available option).
68+
data_names (str | list[str], optional): Which parts of the MP/WBM dataset to load.
69+
Can be any subset of the above data names or 'all'. Defaults to ["summary"].
70+
version (str, optional): Which version of the dataset to load. Defaults to
71+
'1.0.0'. Can be any git tag, branch or commit hash.
6972
cache_dir (str, optional): Where to cache data files on local drive. Defaults to
7073
'~/.cache/matbench-discovery'. Set to None to disable caching.
7174
hydrate (bool, optional): Whether to hydrate pymatgen objects. If False,
7275
Structures and ComputedStructureEntries are returned as dictionaries which
7376
can be hydrated on-demand with df.col.map(Structure.from_dict). Defaults to
7477
False as it noticeably increases load time.
78+
**kwargs: Additional keyword arguments passed to pandas.read_json or read_csv,
79+
depending on which file is loaded.
7580
7681
Raises:
77-
ValueError: On bad version number or bad part names.
82+
ValueError: On bad version number or bad data names.
7883
7984
Returns:
80-
pd.DataFrame | dict[str, pd.DataFrame]: Single dataframe of dictionary of
81-
multiple data parts were requested.
85+
pd.DataFrame: Single dataframe, or dictionary of dataframes if
86+
multiple data names were requested.
8287
"""
83-
if parts == "all":
84-
parts = list(DATA_FILENAMES)
85-
elif isinstance(parts, str):
86-
parts = [parts]
87-
88-
if version != 1:
89-
raise ValueError(f"Only version 1 currently available, got {version=}")
90-
if missing := set(parts) - set(DATA_FILENAMES):
88+
if data_names == "all":
89+
data_names = list(DATA_FILENAMES)
90+
elif isinstance(data_names, str):
91+
data_names = [data_names]
92+
93+
if missing := set(data_names) - set(DATA_FILENAMES):
9194
raise ValueError(f"{missing} must be subset of {set(DATA_FILENAMES)}")
9295

9396
dfs = {}
94-
for key in parts:
97+
for key in data_names:
9598
file = DATA_FILENAMES[key]
9699
reader = pd.read_csv if file.endswith(".csv") else pd.read_json
97100

98-
cache_path = f"{cache_dir}/{file}"
101+
cache_path = f"{cache_dir}/{version}/{file}"
99102
if os.path.isfile(cache_path):
100-
df = reader(cache_path)
103+
print(f"Loading '{key}' from cached file at '{cache_path}'")
104+
df = reader(cache_path, **kwargs)
101105
else:
102-
url = f"{RAW_REPO_URL}/{version}.0.0/data/{file}"
103-
print(f"Downloading {key} from {url}")
104-
df = reader(url)
106+
url = f"{RAW_REPO_URL}/{version}/data/{file}"
107+
print(f"Downloading '{key}' from {url}")
108+
try:
109+
df = reader(url)
110+
except urllib.error.HTTPError as exc:
111+
raise ValueError(f"Bad {url=}") from exc
105112
if cache_dir and not os.path.isfile(cache_path):
106113
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
107114
if ".csv" in file:
@@ -128,8 +135,8 @@ def load_train_test(
128135

129136
dfs[key] = df
130137

131-
if len(parts) == 1:
132-
return dfs[parts[0]]
138+
if len(data_names) == 1:
139+
return dfs[data_names[0]]
133140
return dfs
134141

135142

readme.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ Matbench Discovery
88
[![Tests](https://github.com/janosh/matbench-discovery/actions/workflows/test.yml/badge.svg)](https://github.com/janosh/matbench-discovery/actions/workflows/test.yml)
99
[![GitHub Pages](https://github.com/janosh/matbench-discovery/actions/workflows/gh-pages.yml/badge.svg)](https://github.com/janosh/matbench-discovery/actions/workflows/gh-pages.yml)
1010
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/janosh/matbench-discovery/main.svg?badge_token=Qza33izjRxSbegTqeSyDvA)](https://results.pre-commit.ci/latest/github/janosh/matbench-discovery/main?badge_token=Qza33izjRxSbegTqeSyDvA)
11-
[![Requires Python 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg?logo=python)](https://python.org/downloads)
12-
[![PyPI](https://img.shields.io/pypi/v/matbench-discovery?logo=pypi)](https://pypi.org/project/matbench-discovery?logo=pypi)
11+
[![Requires Python 3.9+](https://img.shields.io/badge/Python-3.9+-blue.svg?logo=python&logoColor=white)](https://python.org/downloads)
12+
[![PyPI](https://img.shields.io/pypi/v/matbench-discovery?logo=pypi&logoColor=white)](https://pypi.org/project/matbench-discovery?logo=pypi&logoColor=white)
1313

1414
</h4>
1515

tests/test_data.py

+77-24
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
from __future__ import annotations
22

33
import os
4+
import urllib.request
5+
from pathlib import Path
46
from tempfile import TemporaryDirectory
57
from typing import Any
68
from unittest.mock import patch
79

810
import pandas as pd
911
import pytest
1012
from pymatgen.core import Lattice, Structure
13+
from pytest import CaptureFixture
1114

1215
from matbench_discovery import ROOT
1316
from matbench_discovery.data import (
@@ -28,9 +31,14 @@
2831
coords=((0, 0, 0), (0.5, 0.5, 0.5)),
2932
)
3033

34+
try:
35+
website_down = urllib.request.urlopen(RAW_REPO_URL).status != 200
36+
except Exception:
37+
website_down = True
38+
3139

3240
@pytest.mark.parametrize(
33-
"parts, cache_dir, hydrate",
41+
"data_names, cache_dir, hydrate",
3442
[
3543
(["wbm-summary"], None, True),
3644
(["wbm-initial-structures"], TemporaryDirectory().name, True),
@@ -41,64 +49,109 @@
4149
],
4250
)
4351
def test_load_train_test(
44-
parts: list[str],
52+
data_names: list[str],
4553
cache_dir: str | None,
4654
hydrate: bool,
4755
dummy_df_with_structures: pd.DataFrame,
48-
capsys: pytest.CaptureFixture,
56+
capsys: CaptureFixture[str],
4957
) -> None:
5058
# intercept HTTP requests to GitHub raw user content and return dummy df instead
5159
with patch("matbench_discovery.data.pd.read_csv") as read_csv, patch(
5260
"matbench_discovery.data.pd.read_json"
5361
) as read_json:
5462
read_csv.return_value = read_json.return_value = dummy_df_with_structures
55-
out = load_train_test(parts, cache_dir=cache_dir, hydrate=hydrate)
63+
out = load_train_test(data_names, cache_dir=cache_dir, hydrate=hydrate)
5664

5765
stdout, stderr = capsys.readouterr()
5866

59-
assert (
60-
"\n".join(
61-
f"Downloading {part} from {RAW_REPO_URL}/1.0.0/data/{DATA_FILENAMES[part]}"
62-
for part in parts
63-
)
64-
in stdout
67+
expected_out = "\n".join(
68+
f"Downloading '{name}' from {RAW_REPO_URL}/1.0.0/data/{DATA_FILENAMES[name]}"
69+
for name in data_names
6570
)
71+
assert expected_out in stdout
6672
assert "" == stderr
6773

68-
assert read_json.call_count + read_csv.call_count == len(parts)
74+
assert read_json.call_count + read_csv.call_count == len(data_names)
6975

70-
if len(parts) > 1:
76+
if len(data_names) > 1:
7177
assert isinstance(out, dict)
72-
assert list(out) == parts
78+
assert list(out) == data_names
7379
for df in out.values():
7480
assert isinstance(df, pd.DataFrame)
7581
else:
7682
assert isinstance(out, pd.DataFrame)
7783

7884

79-
def test_load_train_test_raises() -> None:
80-
with pytest.raises(
81-
ValueError,
82-
match=f"must be subset of {set(DATA_FILENAMES)}",
83-
):
84-
load_train_test(["invalid-part"])
85+
def test_load_train_test_raises(tmp_path: Path) -> None:
86+
# bad data name
87+
with pytest.raises(ValueError, match=f"must be subset of {set(DATA_FILENAMES)}"):
88+
load_train_test(["bad-data-name"])
89+
90+
# bad_version
91+
version = "not-a-real-branch"
92+
with pytest.raises(ValueError) as exc_info:
93+
load_train_test("wbm-summary", version=version, cache_dir=tmp_path)
8594

86-
with pytest.raises(
87-
ValueError, match="Only version 1 currently available, got version=2"
88-
):
89-
load_train_test(version=2)
95+
assert (
96+
str(exc_info.value)
97+
== "Bad url='https://raw.githubusercontent.com/janosh/matbench-discovery"
98+
f"/{version}/data/wbm/2022-10-19-wbm-summary.csv'"
99+
)
90100

91101

92102
def test_load_train_test_doc_str() -> None:
93103
doc_str = load_train_test.__doc__
94104
assert isinstance(doc_str, str) # mypy type narrowing
95105

96-
assert all(key in doc_str for key in DATA_FILENAMES)
106+
for name in DATA_FILENAMES:
107+
assert name in doc_str, f"Missing data {name=} in load_train_test() docstring"
97108

98109
# TODO refactor to load site URL from site/package.json for SSoT
99110
assert "https://matbench-discovery.janosh.dev" in doc_str
100111

101112

113+
@pytest.mark.skipif(website_down, reason=f"{RAW_REPO_URL} unreachable")
114+
@pytest.mark.parametrize("version", ["main"]) # , "d00d475"
115+
def test_load_train_test_no_mock(
116+
version: str, capsys: CaptureFixture[str], tmp_path: Path
117+
) -> None:
118+
# this function runs the download from GitHub raw user content for real
119+
# hence takes some time and requires being online
120+
df_wbm = load_train_test("wbm-summary", version=version, cache_dir=tmp_path)
121+
assert df_wbm.shape == (256963, 17)
122+
assert set(df_wbm) > {
123+
"bandgap_pbe",
124+
"e_form_per_atom_mp2020_corrected",
125+
"e_form_per_atom_uncorrected",
126+
"e_form_per_atom_wbm",
127+
"e_hull_wbm",
128+
"formula",
129+
"n_sites",
130+
"uncorrected_energy",
131+
"uncorrected_energy_from_cse",
132+
"volume",
133+
"wyckoff_spglib",
134+
}, "Loaded df missing columns"
135+
136+
stdout, stderr = capsys.readouterr()
137+
assert stderr == ""
138+
assert (
139+
stdout
140+
== "Downloading 'wbm-summary' from https://raw.githubusercontent.com/janosh"
141+
f"/matbench-discovery/{version}/data/wbm/2022-10-19-wbm-summary.csv\n"
142+
)
143+
144+
df_wbm = load_train_test("wbm-summary", version=version, cache_dir=tmp_path)
145+
146+
stdout, stderr = capsys.readouterr()
147+
assert stderr == ""
148+
assert (
149+
stdout
150+
== f"Loading 'wbm-summary' from cached file at '{tmp_path}/main/wbm/2022-10-19-"
151+
"wbm-summary.csv'\n"
152+
)
153+
154+
102155
def test_chunks() -> None:
103156
assert list(chunks([], 1)) == []
104157
assert list(chunks([1], 1)) == [[1]]

0 commit comments

Comments
 (0)