Skip to content

Commit 811f581

Browse files
committed
rename load_wbm() -> load_train_test() and add ability to download MP training files too
increase flake8 max-line-length = 95
1 parent f5057ac commit 811f581

File tree

3 files changed

+68
-32
lines changed

3 files changed

+68
-32
lines changed

matbench_discovery/data.py

+37-23
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,18 @@
99
from pymatgen.entries.computed_entries import ComputedStructureEntry
1010
from tqdm import tqdm
1111

12-
data_files = {
13-
"summary": "2022-10-19-wbm-summary.csv",
14-
"initial-structures": "2022-10-19-wbm-init-structs.json.bz2",
15-
"computed-structure-entries": "2022-10-19-wbm-cses.json.bz2",
12+
DATA_FILENAMES = {
13+
"wbm-summary": "wbm/2022-10-19-wbm-summary.csv",
14+
"wbm-initial-structures": "wbm/2022-10-19-wbm-init-structs.json.bz2",
15+
"wbm-computed-structure-entries": "wbm/2022-10-19-wbm-cses.json.bz2",
16+
"mp-energies": "mp/2022-08-13-mp-energies.json.gz",
17+
"mp-computed-structure-entries": "mp/2022-09-16-mp-computed-structure-entries.json.gz",
18+
"mp-patched-phase-diagram": "mp/2022-09-18-ppd-mp.pkl.gz",
19+
"mp-elemental-ref-energies": "mp/2022-09-19-mp-elemental-ref-energies.json",
1620
}
1721

18-
base_url = "https://raw.githubusercontent.com/janosh/matbench-discovery/main/data/wbm"
19-
default_cache_loc = os.path.expanduser("~/.cache/matbench-discovery")
22+
RAW_REPO_URL = "https://raw.githubusercontent.com/janosh/matbench-discovery"
23+
default_cache_dir = os.path.expanduser("~/.cache/matbench-discovery")
2024

2125

2226
def chunks(xs: Sequence[Any], n: int) -> Generator[Sequence[Any], None, None]:
@@ -35,21 +39,26 @@ def as_dict_handler(obj: Any) -> dict[str, Any] | None:
3539
# removes e.g. non-serializable AseAtoms from M3GNet relaxation trajectories
3640

3741

38-
def load_wbm(
39-
parts: Sequence[str] = ("summary",),
42+
def load_train_test(
43+
parts: str | Sequence[str] = ("summary",),
4044
version: int = 1,
41-
cache_dir: str | None = default_cache_loc,
45+
cache_dir: str | None = default_cache_dir,
4246
hydrate: bool = False,
4347
) -> pd.DataFrame | dict[str, pd.DataFrame]:
44-
"""_summary_
48+
"""Download the MP training data and WBM test data in parts or in full as pandas
49+
DataFrames. The full training and test sets are each about ~500 MB as compressed
50+
JSON and will be cached locally for faster re-loading unless cache_dir is set to None.
51+
52+
Hint: Import DATA_FILENAMES from the same module as this function and
53+
print(list(DATA_FILENAMES)) to see permissible data names.
4554
4655
Args:
47-
parts (str, optional): Which parts of the WBM dataset to load. Can be any subset
48-
of {'summary', 'initial-structures', 'computed-structure-entries'}. Defaults
49-
to ["summary"], a dataframe with columns for material properties like VASP
50-
energy, formation energy, energy above the convex hull (3 columns with old,
51-
new and no Materials Project energy corrections applied for each), volume,
52-
band gap, number of sites per unit cell, and more.
56+
parts (str | list[str], optional): Which parts of the MP/WBM dataset to load.
57+
Can be any subset of list(DATA_FILENAMES). Defaults to ["summary"], a dataframe
58+
with columns for material properties like VASP energy, formation energy,
59+
energy above the convex hull (3 columns with old, new and no Materials
60+
Project energy corrections applied for each), volume, band gap, number of
61+
sites per unit cell, and more.
5362
version (int, optional): Which version of the dataset to load. Defaults to 1
5463
(currently the only available option).
5564
cache_dir (str, optional): Where to cache data files on local drive. Defaults to
@@ -60,31 +69,36 @@ def load_wbm(
6069
False as it noticeably increases load time.
6170
6271
Raises:
63-
ValueError: On bad version or bad keys for which data parts to load.
72+
ValueError: On bad version number or bad part names.
6473
6574
Returns:
6675
pd.DataFrame | dict[str, pd.DataFrame]: Single dataframe or dictionary of
6776
dataframes if multiple data parts were requested.
6877
"""
78+
if parts == "all":
79+
parts = list(DATA_FILENAMES)
80+
elif isinstance(parts, str):
81+
parts = [parts]
82+
6983
if version != 1:
7084
raise ValueError(f"Only version 1 currently available, got {version=}")
71-
if missing := set(parts) - set(data_files):
72-
raise ValueError(f"{missing} must be subset of {set(data_files)}")
85+
if missing := set(parts) - set(DATA_FILENAMES):
86+
raise ValueError(f"{missing} must be subset of {set(DATA_FILENAMES)}")
7387

7488
dfs = {}
7589
for key in parts:
76-
file = data_files[key]
90+
file = DATA_FILENAMES[key]
7791
reader = pd.read_csv if file.endswith(".csv") else pd.read_json
7892

7993
cache_path = f"{cache_dir}/{file}"
8094
if os.path.isfile(cache_path):
8195
df = reader(cache_path)
8296
else:
83-
url = f"{base_url}/{file}"
84-
print(f"Downloading {url=}")
97+
url = f"{RAW_REPO_URL}/{version}.0.0/data/{file}"
98+
print(f"Downloading {key} from {url}")
8599
df = reader(url)
86100
if cache_dir and not os.path.isfile(cache_path):
87-
os.makedirs(cache_dir, exist_ok=True)
101+
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
88102
if ".csv" in file:
89103
df.to_csv(cache_path)
90104
elif ".json" in file:

tests/conftest.py

+2
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
import pandas as pd
44
import pytest
55
from pymatgen.core import Lattice, Structure
6+
from pymatgen.entries.computed_entries import ComputedStructureEntry
67

78

89
@pytest.fixture
910
def dummy_df_with_structures(dummy_struct: Structure) -> pd.DataFrame:
1011
# create a dummy df with a structure column
1112
df = pd.DataFrame(dict(material_id=range(10), structure=[dummy_struct] * 10))
1213
df["volume"] = [x.volume for x in df.structure]
14+
df["computed_structure_entry"] = [ComputedStructureEntry(dummy_struct, 0)] * 10
1315
return df
1416

1517

tests/test_data.py

+29-9
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@
88
import pytest
99
from pymatgen.core import Lattice, Structure
1010

11-
from matbench_discovery.data import as_dict_handler, chunks, data_files, load_wbm
11+
from matbench_discovery.data import (
12+
DATA_FILENAMES,
13+
RAW_REPO_URL,
14+
as_dict_handler,
15+
chunks,
16+
load_train_test,
17+
)
1218

1319
structure = Structure(
1420
lattice=Lattice.cubic(5),
@@ -20,24 +26,38 @@
2026
@pytest.mark.parametrize(
2127
"parts, cache_dir, hydrate",
2228
[
23-
(["summary"], None, True),
24-
(["initial-structures"], TemporaryDirectory().name, True),
25-
(["computed-structure-entries"], None, False),
26-
(["summary", "initial-structures"], TemporaryDirectory().name, True),
29+
(["wbm-summary"], None, True),
30+
(["wbm-initial-structures"], TemporaryDirectory().name, True),
31+
(["wbm-computed-structure-entries"], None, False),
32+
(["wbm-summary", "wbm-initial-structures"], TemporaryDirectory().name, True),
33+
(["mp-elemental-ref-energies"], None, True),
34+
(["mp-energies"], None, True),
2735
],
2836
)
2937
def test_load_wbm(
3038
parts: list[str],
3139
cache_dir: str | None,
3240
hydrate: bool,
3341
dummy_df_with_structures: pd.DataFrame,
42+
capsys: pytest.CaptureFixture,
3443
) -> None:
3544
# intercept HTTP requests to GitHub raw user content and return dummy df instead
3645
with patch("matbench_discovery.data.pd.read_csv") as read_csv, patch(
3746
"matbench_discovery.data.pd.read_json"
3847
) as read_json:
3948
read_csv.return_value = read_json.return_value = dummy_df_with_structures
40-
out = load_wbm(parts, cache_dir=cache_dir, hydrate=hydrate)
49+
out = load_train_test(parts, cache_dir=cache_dir, hydrate=hydrate)
50+
51+
stdout, stderr = capsys.readouterr()
52+
53+
assert (
54+
"\n".join(
55+
f"Downloading {part} from {RAW_REPO_URL}/1.0.0/data/{DATA_FILENAMES[part]}"
56+
for part in parts
57+
)
58+
in stdout
59+
)
60+
assert "" == stderr
4161

4262
assert read_json.call_count + read_csv.call_count == len(parts)
4363

@@ -53,14 +73,14 @@ def test_load_wbm(
5373
def test_load_wbm_raises() -> None:
5474
with pytest.raises(
5575
ValueError,
56-
match=f"must be subset of {set(data_files)}",
76+
match=f"must be subset of {set(DATA_FILENAMES)}",
5777
):
58-
load_wbm(["invalid-part"])
78+
load_train_test(["invalid-part"])
5979

6080
with pytest.raises(
6181
ValueError, match="Only version 1 currently available, got version=2"
6282
):
63-
load_wbm(version=2)
83+
load_train_test(version=2)
6484

6585

6686
def test_chunks() -> None:

0 commit comments

Comments
 (0)