Commit 0773112

add new module matbench_discovery/data.py with func load_wbm()
add tests/test_data.py with test_load_wbm()
1 parent 4972c01 commit 0773112

11 files changed: +240 -62 lines

matbench_discovery/__init__.py (-15)

@@ -2,26 +2,11 @@
 
 import os
 import sys
-from collections.abc import Generator, Sequence
 from datetime import datetime
-from typing import Any
 
 ROOT = os.path.dirname(os.path.dirname(__file__))
 DEBUG = "slurm-submit" not in sys.argv and "SLURM_JOB_ID" not in os.environ
 CHECKPOINT_DIR = f"{ROOT}/wandb/checkpoints"
 
 timestamp = f"{datetime.now():%Y-%m-%d@%H-%M-%S}"
 today = timestamp.split("@")[0]
-
-
-def chunks(xs: Sequence[Any], n: int) -> Generator[Sequence[Any], None, None]:
-    return (xs[i : i + n] for i in range(0, len(xs), n))
-
-
-def as_dict_handler(obj: Any) -> dict[str, Any] | None:
-    """Use as default_handler kwarg to json.dump() or pandas.to_json()."""
-    try:
-        return obj.as_dict()  # all MSONable objects implement as_dict()
-    except AttributeError:
-        return None  # replace unhandled objects with None in serialized data
-        # removes e.g. non-serializable AseAtoms from M3GNet relaxation trajectories
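Since chunks() and as_dict_handler() now live in matbench_discovery/data.py rather than the package root, downstream imports change accordingly (the model script diffs below apply exactly this change); a one-line sketch of the new import path:

# before this commit: from matbench_discovery import as_dict_handler, chunks
from matbench_discovery.data import as_dict_handler, chunks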

matbench_discovery/data.py (+114)

@@ -0,0 +1,114 @@
+from __future__ import annotations
+
+import os
+from collections.abc import Generator, Sequence
+from typing import Any
+
+import pandas as pd
+from pymatgen.core import Structure
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+from tqdm import tqdm
+
+data_files = {
+    "summary": "2022-10-19-wbm-summary.csv",
+    "initial-structures": "2022-10-19-wbm-init-structs.json.bz2",
+    "computed-structure-entries": "2022-10-19-wbm-cses.json.bz2",
+}
+
+base_url = "https://raw.githubusercontent.com/janosh/matbench-discovery/main/data/wbm"
+default_cache_loc = os.path.expanduser("~/.cache/matbench-discovery")
+
+
+def chunks(xs: Sequence[Any], n: int) -> Generator[Sequence[Any], None, None]:
+    return (xs[i : i + n] for i in range(0, len(xs), n))
+
+
+def as_dict_handler(obj: Any) -> dict[str, Any] | None:
+    """Pass this to json.dump(default=) or pandas.to_json(default_handler=) to
+    convert Python classes with an as_dict() method to dictionaries on serialization.
+    Objects without an as_dict() method are replaced with None in the serialized data.
+    """
+    try:
+        return obj.as_dict()  # all MSONable objects implement as_dict()
+    except AttributeError:
+        return None  # replace unhandled objects with None in serialized data
+        # removes e.g. non-serializable AseAtoms from M3GNet relaxation trajectories
+
+
+def load_wbm(
+    parts: Sequence[str] = ("summary",),
+    version: int = 1,
+    cache_dir: str | None = default_cache_loc,
+    hydrate: bool = False,
+) -> pd.DataFrame | dict[str, pd.DataFrame]:
+    """Download and cache parts of the WBM dataset, returned as pandas dataframe(s).
+
+    Args:
+        parts (Sequence[str], optional): Which parts of the WBM dataset to load. Can
+            be any subset of {'summary', 'initial-structures', 'computed-structure-entries'}.
+            Defaults to ["summary"], a dataframe with columns for material properties
+            like VASP energy, formation energy, energy above the convex hull (3 columns
+            with old, new and no Materials Project energy corrections applied for each),
+            volume, band gap, number of sites per unit cell, and more.
+        version (int, optional): Which version of the dataset to load. Defaults to 1
+            (currently the only available option).
+        cache_dir (str, optional): Where to cache data files on local drive. Defaults
+            to '~/.cache/matbench-discovery'. Set to None to disable caching.
+        hydrate (bool, optional): Whether to hydrate pymatgen objects. If False,
+            Structures and ComputedStructureEntries are returned as dictionaries which
+            can be hydrated on-demand with df.col.map(Structure.from_dict). Defaults to
+            False as it noticeably increases load time.
+
+    Raises:
+        ValueError: On bad version or bad keys for which data parts to load.
+
+    Returns:
+        pd.DataFrame | dict[str, pd.DataFrame]: Single dataframe if a single part was
+            requested, dictionary of dataframes if multiple data parts were requested.
+    """
+    if version != 1:
+        raise ValueError(f"Only version 1 currently available, got {version=}")
+    if missing := set(parts) - set(data_files):
+        raise ValueError(f"{missing} must be subset of {set(data_files)}")
+
+    dfs = {}
+    for key in parts:
+        file = data_files[key]
+        reader = pd.read_csv if file.endswith(".csv") else pd.read_json
+
+        cache_path = f"{cache_dir}/{file}"
+        if os.path.isfile(cache_path):
+            df = reader(cache_path)
+        else:
+            url = f"{base_url}/{file}"
+            print(f"Downloading {url=}")
+            df = reader(url)
+            if cache_dir and not os.path.isfile(cache_path):
+                os.makedirs(cache_dir, exist_ok=True)
+                if ".csv" in file:
+                    df.to_csv(cache_path)
+                elif ".json" in file:
+                    df.reset_index().to_json(
+                        cache_path, default_handler=as_dict_handler
+                    )
+                else:
+                    raise ValueError(f"Unexpected file type {file}")
+
+        df = df.set_index("material_id")
+        if hydrate:
+            for col in df:
+                if not isinstance(df[col].iloc[0], dict):
+                    continue
+                try:
+                    df[col] = [
+                        ComputedStructureEntry.from_dict(d)
+                        for d in tqdm(df[col], desc=col)
+                    ]
+                except Exception:
+                    df[col] = [Structure.from_dict(d) for d in tqdm(df[col], desc=col)]
+
+        dfs[key] = df
+
+    if len(parts) == 1:
+        return dfs[parts[0]]
+    return dfs
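For context, a minimal usage sketch of the new loader (variable names and the struct_col placeholder are illustrative, not part of the commit; the calls follow the docstring and code above):

from pymatgen.core import Structure

from matbench_discovery.data import load_wbm

# single part: returns one dataframe indexed by material_id, cached under
# ~/.cache/matbench-discovery on first download
df_summary = load_wbm(["summary"])

# multiple parts: returns a dict of dataframes keyed by part name
dfs = load_wbm(["summary", "initial-structures"])
df_init = dfs["initial-structures"]

# with hydrate=False (the default), pymatgen objects stay as plain dicts and can
# be hydrated on demand as the docstring suggests
struct_col = "initial_structure"  # hypothetical column name, not taken from this commit
structures = df_init[struct_col].map(Structure.from_dict)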

models/bowsr/test_bowsr.py (+2 -1)

@@ -13,7 +13,8 @@
 from maml.apps.bowsr.optimizer import BayesianOptimizer
 from tqdm import tqdm
 
-from matbench_discovery import DEBUG, ROOT, as_dict_handler, timestamp, today
+from matbench_discovery import DEBUG, ROOT, timestamp, today
+from matbench_discovery.data import as_dict_handler
 from matbench_discovery.slurm import slurm_submit
 
 __author__ = "Janosh Riebesell"

models/cgcnn/test_cgcnn.py (+2 -1)

@@ -109,7 +109,8 @@
     data_loader=data_loader,
 )
 
-df_preds.to_csv(f"{out_dir}/{job_name}-preds.csv", index=False)
+slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
+df_preds.to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv", index=False)
 pred_col = f"{target_col}_pred_ens"
 assert pred_col in df, f"{pred_col=} not in {list(df)}"
 table = wandb.Table(dataframe=df_preds[[target_col, pred_col]].reset_index())

models/m3gnet/join_m3gnet_results.py (+2 -1)

@@ -8,7 +8,8 @@
 from pymatgen.analysis.phase_diagram import PDEntry
 from tqdm import tqdm
 
-from matbench_discovery import ROOT, as_dict_handler, today
+from matbench_discovery import ROOT, today
+from matbench_discovery.data import as_dict_handler
 from matbench_discovery.energy import get_e_form_per_atom
 
 __author__ = "Janosh Riebesell"

models/m3gnet/test_m3gnet.py (+2 -1)

@@ -12,7 +12,8 @@
 from m3gnet.models import Relaxer
 from tqdm import tqdm
 
-from matbench_discovery import DEBUG, ROOT, as_dict_handler, timestamp, today
+from matbench_discovery import DEBUG, ROOT, timestamp, today
+from matbench_discovery.data import as_dict_handler
 from matbench_discovery.slurm import slurm_submit
 
 """

models/wrenformer/test_wrenformer.py (+2 -1)

@@ -96,7 +96,8 @@
     runs, data_loader=data_loader, df=df, model_cls=Wrenformer, target_col=target_col
 )
 
-df.to_csv(f"{out_dir}/{job_name}-preds.csv")
+slurm_job_id = os.environ.get("SLURM_JOB_ID", "debug")
+df.to_csv(f"{out_dir}/{job_name}-preds-{slurm_job_id}.csv")
 
 
 # %%

tests/conftest.py (+22)

@@ -0,0 +1,22 @@
+from __future__ import annotations
+
+import pandas as pd
+import pytest
+from pymatgen.core import Lattice, Structure
+
+
+@pytest.fixture
+def dummy_df_with_structures(dummy_struct: Structure) -> pd.DataFrame:
+    # create a dummy df with a structure column
+    df = pd.DataFrame(dict(material_id=range(10), structure=[dummy_struct] * 10))
+    df["volume"] = [x.volume for x in df.structure]
+    return df
+
+
+@pytest.fixture
+def dummy_struct() -> Structure:
+    return Structure(
+        lattice=Lattice.cubic(5),
+        species=("Fe", "O"),
+        coords=((0, 0, 0), (0.5, 0.5, 0.5)),
+    )

tests/test_data.py (+86)

@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from tempfile import TemporaryDirectory
+from typing import Any
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+from pymatgen.core import Lattice, Structure
+
+from matbench_discovery.data import as_dict_handler, chunks, data_files, load_wbm
+
+structure = Structure(
+    lattice=Lattice.cubic(5),
+    species=("Fe", "O"),
+    coords=((0, 0, 0), (0.5, 0.5, 0.5)),
+)
+
+
+@pytest.mark.parametrize(
+    "parts, cache_dir, hydrate",
+    [
+        (["summary"], None, True),
+        (["initial-structures"], TemporaryDirectory().name, True),
+        (["computed-structure-entries"], None, False),
+        (["summary", "initial-structures"], TemporaryDirectory().name, True),
+    ],
+)
+def test_load_wbm(
+    parts: list[str],
+    cache_dir: str | None,
+    hydrate: bool,
+    dummy_df_with_structures: pd.DataFrame,
+) -> None:
+    # intercept HTTP requests to GitHub raw user content and return dummy df instead
+    with patch("matbench_discovery.data.pd.read_csv") as read_csv, patch(
+        "matbench_discovery.data.pd.read_json"
+    ) as read_json:
+        read_csv.return_value = read_json.return_value = dummy_df_with_structures
+        out = load_wbm(parts, cache_dir=cache_dir, hydrate=hydrate)
+
+    assert read_json.call_count + read_csv.call_count == len(parts)
+
+    if len(parts) > 1:
+        assert isinstance(out, dict)
+        assert list(out) == parts
+        for df in out.values():
+            assert isinstance(df, pd.DataFrame)
+    else:
+        assert isinstance(out, pd.DataFrame)
+
+
+def test_load_wbm_raises() -> None:
+    with pytest.raises(
+        ValueError,
+        match=f"must be subset of {set(data_files)}",
+    ):
+        load_wbm(["invalid-part"])
+
+    with pytest.raises(
+        ValueError, match="Only version 1 currently available, got version=2"
+    ):
+        load_wbm(version=2)
+
+
+def test_chunks() -> None:
+    assert list(chunks([], 1)) == []
+    assert list(chunks([1], 1)) == [[1]]
+    assert list(chunks([1, 2], 1)) == [[1], [2]]
+    assert list(chunks([1, 2, 3], 1)) == [[1], [2], [3]]
+    assert list(chunks([1, 2, 3], 2)) == [[1, 2], [3]]
+    assert list(chunks(range(1, 4), 2)) == [range(1, 3), range(3, 4)]
+    assert list(chunks(range(1, 5), 2)) == [range(1, 3), range(3, 5)]
+    assert list(chunks(range(1, 5), 3)) == [range(1, 4), range(4, 5)]
+
+
+def test_as_dict_handler() -> None:
+    class C:
+        def as_dict(self) -> dict[str, Any]:
+            return {"foo": "bar"}
+
+    assert as_dict_handler(C()) == {"foo": "bar"}
+    assert as_dict_handler(1) is None
+    assert as_dict_handler("foo") is None
+    assert as_dict_handler([1, 2, 3]) is None
+    assert as_dict_handler({"foo": "bar"}) is None

tests/test_init.py (+1 -25)

@@ -1,35 +1,11 @@
 from __future__ import annotations
 
 import os
-from typing import Any
 
-from matbench_discovery import ROOT, as_dict_handler, chunks, timestamp, today
+from matbench_discovery import ROOT, timestamp, today
 
 
 def test_has_globals() -> None:
     assert os.path.isdir(ROOT)
     assert today == timestamp.split("@")[0]
     assert len(timestamp) == 19
-
-
-def test_chunks() -> None:
-    assert list(chunks([], 1)) == []
-    assert list(chunks([1], 1)) == [[1]]
-    assert list(chunks([1, 2], 1)) == [[1], [2]]
-    assert list(chunks([1, 2, 3], 1)) == [[1], [2], [3]]
-    assert list(chunks([1, 2, 3], 2)) == [[1, 2], [3]]
-    assert list(chunks(range(1, 4), 2)) == [range(1, 3), range(3, 4)]
-    assert list(chunks(range(1, 5), 2)) == [range(1, 3), range(3, 5)]
-    assert list(chunks(range(1, 5), 3)) == [range(1, 4), range(4, 5)]
-
-
-def test_as_dict_handler() -> None:
-    class C:
-        def as_dict(self) -> dict[str, Any]:
-            return {"foo": "bar"}
-
-    assert as_dict_handler(C()) == {"foo": "bar"}
-    assert as_dict_handler(1) is None
-    assert as_dict_handler("foo") is None
-    assert as_dict_handler([1, 2, 3]) is None
-    assert as_dict_handler({"foo": "bar"}) is None

tests/test_structure.py (+7 -17)

@@ -1,32 +1,22 @@
 from __future__ import annotations
 
 import numpy as np
-import pytest
-from pymatgen.core import Lattice, Structure
+from pymatgen.core import Structure
 
 from matbench_discovery.structure import perturb_structure
 
 
-@pytest.fixture
-def struct() -> Structure:
-    return Structure(
-        lattice=Lattice.cubic(5),
-        species=("Fe", "O"),
-        coords=((0, 0, 0), (0.5, 0.5, 0.5)),
-    )
-
-
-def test_perturb_structure(struct: Structure) -> None:
+def test_perturb_structure(dummy_struct: Structure) -> None:
     np.random.seed(0)
-    perturbed = perturb_structure(struct)
-    assert len(perturbed) == len(struct)
+    perturbed = perturb_structure(dummy_struct)
+    assert len(perturbed) == len(dummy_struct)
 
-    for site, new in zip(struct, perturbed):
+    for site, new in zip(dummy_struct, perturbed):
         assert site.specie == new.specie
        assert tuple(site.coords) != tuple(new.coords)
 
     # test that the perturbation is reproducible
     np.random.seed(0)
-    assert perturbed == perturb_structure(struct)
+    assert perturbed == perturb_structure(dummy_struct)
     # but different on subsequent calls
-    assert perturb_structure(struct) != perturb_structure(struct)
+    assert perturb_structure(dummy_struct) != perturb_structure(dummy_struct)
