|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import os |
| 4 | +from collections.abc import Generator, Sequence |
| 5 | +from typing import Any |
| 6 | + |
| 7 | +import pandas as pd |
| 8 | +from pymatgen.core import Structure |
| 9 | +from pymatgen.entries.computed_entries import ComputedStructureEntry |
| 10 | +from tqdm import tqdm |
| 11 | + |
# Date/name prefix shared by every file of this WBM data release.
_wbm_release = "2022-10-19-wbm"

# Maps each loadable dataset part to the file name it is stored under.
data_files = {
    "summary": f"{_wbm_release}-summary.csv",
    "initial-structures": f"{_wbm_release}-init-structs.json.bz2",
    "computed-structure-entries": f"{_wbm_release}-cses.json.bz2",
}

# Remote directory the data files are downloaded from.
base_url = "https://raw.githubusercontent.com/janosh/matbench-discovery/main/data/wbm"
# Default local directory in which downloaded data files are cached.
default_cache_loc = os.path.expanduser("~/.cache/matbench-discovery")
| 20 | + |
| 21 | + |
def chunks(xs: Sequence[Any], n: int) -> Generator[Sequence[Any], None, None]:
    """Split a sequence into consecutive slices of at most n items.

    Args:
        xs (Sequence[Any]): Sliceable sequence to split.
        n (int): Maximum length of each slice. The last slice is shorter if
            len(xs) is not a multiple of n.

    Returns:
        Generator[Sequence[Any], None, None]: Lazily yields xs[0:n], xs[n:2n], ...
    """
    # range() is evaluated right away, so an invalid step (n=0) raises at call time
    offsets = range(0, len(xs), n)
    return (xs[start : start + n] for start in offsets)
| 24 | + |
| 25 | + |
def as_dict_handler(obj: Any) -> dict[str, Any] | None:
    """Serialization fallback that converts objects via their as_dict() method.

    Intended to be passed as json.dump(default=) or pandas.to_json(default_handler=).
    Objects exposing as_dict() (e.g. all MSONable objects) are replaced by the
    dictionary that method returns. Anything else is replaced with None in the
    serialized data, which drops e.g. non-serializable AseAtoms from M3GNet
    relaxation trajectories.

    Args:
        obj (Any): Object the serializer could not handle natively.

    Returns:
        dict[str, Any] | None: Result of obj.as_dict(), or None if that call raised
            AttributeError.
    """
    try:
        as_dict = obj.as_dict()
    except AttributeError:
        # unhandled objects become null in the serialized output
        return None
    else:
        return as_dict
| 36 | + |
| 37 | + |
def load_wbm(
    parts: Sequence[str] = ("summary",),
    version: int = 1,
    cache_dir: str | None = default_cache_loc,
    hydrate: bool = False,
) -> pd.DataFrame | dict[str, pd.DataFrame]:
    """Load one or more parts of the WBM dataset as pandas dataframes.

    Each requested part is read from the local cache if present, otherwise
    downloaded from base_url and (unless caching is disabled) written to the cache.
    Every returned dataframe is indexed by its 'material_id' column.

    Args:
        parts (Sequence[str], optional): Which parts of the WBM dataset to load. Can
            be any subset of {'summary', 'initial-structures',
            'computed-structure-entries'}. Defaults to ("summary",), a dataframe with
            columns for material properties like VASP energy, formation energy,
            energy above the convex hull (3 columns with old, new and no Materials
            Project energy corrections applied for each), volume, band gap, number
            of sites per unit cell, and more.
        version (int, optional): Which version of the dataset to load. Defaults to 1
            (currently the only available option).
        cache_dir (str, optional): Where to cache data files on local drive. Defaults to
            '~/.cache/matbench-discovery'. Set to None to disable caching.
        hydrate (bool, optional): Whether to hydrate pymatgen objects. If False,
            Structures and ComputedStructureEntries are returned as dictionaries which
            can be hydrated on-demand with df.col.map(Structure.from_dict). Defaults to
            False as it noticeably increases load time.

    Raises:
        ValueError: On bad version or bad keys for which data parts to load.

    Returns:
        pd.DataFrame | dict[str, pd.DataFrame]: A single dataframe if exactly one
            part was requested, else a dictionary mapping part name to dataframe.
    """
    if version != 1:
        raise ValueError(f"Only version 1 currently available, got {version=}")
    if missing := set(parts) - set(data_files):
        raise ValueError(f"{missing} must be subset of {set(data_files)}")

    dfs = {}
    for key in parts:
        file = data_files[key]
        reader = pd.read_csv if file.endswith(".csv") else pd.read_json

        # No cache path at all when caching is disabled, so we never probe a bogus
        # "None/<file>" location.
        cache_path = f"{cache_dir}/{file}" if cache_dir else None

        if cache_path and os.path.isfile(cache_path):
            df = reader(cache_path)
        else:
            url = f"{base_url}/{file}"
            print(f"Downloading {url=}")
            df = reader(url)
            if cache_path:
                os.makedirs(cache_dir, exist_ok=True)
                # Write the frame exactly as downloaded (no extra index column) so a
                # later cache hit yields the same columns as a fresh download.
                if ".csv" in file:
                    df.to_csv(cache_path, index=False)
                elif ".json" in file:
                    df.to_json(cache_path, default_handler=as_dict_handler)
                else:
                    raise ValueError(f"Unexpected file type {file}")

        df = df.set_index("material_id")
        if hydrate and len(df) > 0:  # nothing to inspect/hydrate in an empty frame
            for col in df:
                # only dict-valued columns hold serialized pymatgen objects
                if not isinstance(df[col].iloc[0], dict):
                    continue
                try:
                    df[col] = [
                        ComputedStructureEntry.from_dict(d)
                        for d in tqdm(df[col], desc=col)
                    ]
                except Exception:
                    # column is not made of ComputedStructureEntry dicts, fall back
                    # to interpreting its values as plain Structure dicts
                    df[col] = [Structure.from_dict(d) for d in tqdm(df[col], desc=col)]

        dfs[key] = df

    if len(parts) == 1:
        return dfs[parts[0]]
    return dfs
0 commit comments