
Commit eec1cb4

add scripts/compute_projections.py
add site/src/figs/element-prevalence-vs-error.svelte to /models/tmi page
add site/src/figs/hist-largest-each-errors-fp-diff-models.svelte generated by scripts/difficult_structures.py
use border-left instead of bg color to highlight active ToC item
1 parent 4b6e83a commit eec1cb4

18 files changed: +359 -51 lines

data/mp/build_phase_diagram.py (+1 -1)

@@ -102,7 +102,7 @@
     json.dump(elemental_ref_entries, file, default=lambda x: x.as_dict())


-df_mp = pd.read_csv(DATA_FILES.mp_energies).set_index("material_id")
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")


 # %%
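A note on the na_filter=False additions in this commit: pandas by default treats strings like "NaN" and "None" as missing values, and "NaN" is also a valid chemical formula (sodium nitride), so disabling NA filtering presumably keeps such formulas intact when reading the MP energies CSV. A minimal standalone sketch of the behavior difference (the CSV row is illustrative, not from the dataset):

import pandas as pd
from io import StringIO

csv = "material_id,formula_pretty\nmp-1,NaN\n"  # "NaN" here is a chemical formula, not a missing value
print(pd.read_csv(StringIO(csv)).formula_pretty.tolist())  # [nan]   -> formula silently lost
print(pd.read_csv(StringIO(csv), na_filter=False).formula_pretty.tolist())  # ['NaN'] -> preserved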

data/mp/get_mp_energies.py (+1 -1)

@@ -57,7 +57,7 @@
 df["wyckoff_spglib"] = [get_aflow_label_from_spglib(x) for x in tqdm(df.structure)]

 df.to_csv(DATA_FILES.mp_energies)
-# df = pd.read_csv(DATA_FILES.mp_energies)
+# df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)


 # %% reproduce fig. 1b from https://arxiv.org/abs/2001.10591 (as data consistency check)

data/wbm/analysis.py (+1 -1)

@@ -69,7 +69,7 @@


 # %% load MP training set
-df = pd.read_csv(DATA_FILES.mp_energies)
+df = pd.read_csv(DATA_FILES.mp_energies, na_filter=False)
 mp_elem_counts = count_elements(df.formula_pretty).astype(int)

 # mp_elem_counts.to_json(f"{about_data_page}/mp-element-counts.json")

data/wbm/fetch_process_wbm_dataset.py (+1 -2)

@@ -445,8 +445,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 )
 x_axis_title = "WBM uncorrected formation energy (eV/atom)"
 fig.update_layout(xaxis_title=x_axis_title, margin=dict(l=10, r=10, t=40, b=10))
-# disabling zooming y-axis
-fig.update_yaxes(fixedrange=True)
+fig.update_yaxes(fixedrange=True)  # disable zooming y-axis
 fig.show(
     config=dict(
         modeBarButtonsToRemove=["lasso2d", "select2d", "autoScale2d", "toImage"],

matbench_discovery/slurm.py (+1 -1)

@@ -51,7 +51,7 @@ def slurm_submit(
         partition (str, optional): Slurm partition.
         account (str, optional): Account to charge for this job.
         slurm_flags (str | list[str], optional): Extra slurm CLI flags. Defaults to ().
-            Examples: ('--nodes 1', '--gpus-per-node 1') or ('--mem', '16000').
+            Examples: ('--nodes 1', '--gpus-per-node 1') or ('--mem', '16G').
         array (str, optional): Slurm array specifier. Defaults to None. Example:
             '9' (for SLURM_ARRAY_TASK_ID from 0-9 inclusive), '1-10' or '1-10%2', etc.
         pre_cmd (str, optional): Things like `module load` commands and environment
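Both this docstring and the call sites below switch --mem from a bare number to a size suffix: sbatch interprets --mem without a suffix as megabytes, so "16G"/"12G" request roughly the same memory as the old values but read unambiguously. A minimal usage sketch assembled from the slurm_submit calls elsewhere in this commit (job_name, out_dir and time values are placeholders):

slurm_vars = slurm_submit(
    job_name="example-job",        # placeholder
    out_dir="slurm_logs",          # placeholder
    partition="icelake-himem",
    account="LEE-SL3-CPU",
    time="1:0:0",
    slurm_flags=("--mem", "16G"),  # bare numbers mean MB to sbatch; a G suffix is explicit
)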

models/bowsr/test_bowsr.py (+1 -1)

@@ -54,7 +54,7 @@
     array=f"1-{slurm_array_task_count}%{slurm_max_parallel}",
     # --mem 12000 avoids slurmstepd: error: Detected 1 oom-kill event(s)
     # Some of your processes may have been killed by the cgroup out-of-memory handler.
-    slurm_flags=("--mem", str(12_000)),
+    slurm_flags=("--mem", "12G"),
     # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
     # https://stackoverflow.com/a/40982782
     pre_cmd="TF_CPP_MIN_LOG_LEVEL=2",

models/m3gnet/test_m3gnet.py (+1 -1)

@@ -41,7 +41,7 @@
     account="LEE-SL3-CPU",
     time="3:0:0",
     array=f"1-{slurm_array_task_count}",
-    slurm_flags=("--mem", str(12_000)),
+    slurm_flags=("--mem", "12G"),
     # TF_CPP_MIN_LOG_LEVEL=2 means INFO and WARNING logs are not printed
     # https://stackoverflow.com/a/40982782
     pre_cmd="TF_CPP_MIN_LOG_LEVEL=2",

models/voronoi/train_test_voronoi_rf.py (+1 -1)

@@ -48,7 +48,7 @@
 df_train = glob_to_df(train_path).set_index("material_id")
 print(f"{df_train.shape=}")

-df_mp = pd.read_csv(DATA_FILES.mp_energies).set_index("material_id")
+df_mp = pd.read_csv(DATA_FILES.mp_energies, na_filter=False).set_index("material_id")
 train_e_form_col = "formation_energy_per_atom"

 test_path = f"{module_dir}/2022-11-18-features-wbm-{task_type}.csv.bz2"

scripts/compute_projections.py (new file, +94)

@@ -0,0 +1,94 @@
+"""Compute t-SNE and UMAP projections of the WBM and MP datasets."""
+
+
+# %%
+import os
+from typing import Any, Literal
+
+import numpy as np
+import pandas as pd
+from pymatgen.core import Composition
+from tqdm import tqdm
+
+from matbench_discovery import ROOT
+from matbench_discovery.data import DATA_FILES
+from matbench_discovery.slurm import slurm_submit
+
+__author__ = "Janosh Riebesell"
+__date__ = "2023-03-28"
+
+
+data_name = "mp"  # which data to project
+projection_type: Literal["tsne", "umap"] = "tsne"  # which projection method to use
+out_dim = 2  # number of dimensions to project to
+one_hot_dim = 112  # number of elements to use for one-hot encoding
+
+out_dir = f"{ROOT}/data/{data_name}/{projection_type}"
+os.makedirs(out_dir, exist_ok=True)
+
+slurm_vars = slurm_submit(
+    job_name=f"{data_name}-{projection_type}-{out_dim}d",
+    out_dir=out_dir,
+    partition="icelake-himem",
+    account="LEE-SL3-CPU",
+    time="6:0:0",
+)
+
+data_path = {"wbm": DATA_FILES.wbm_summary, "mp": DATA_FILES.mp_energies}[data_name]
+print(f"{data_path=}")
+print(f"{out_dim=}")
+print(f"{projection_type=}")
+df_in = pd.read_csv(data_path, na_filter=False).set_index("material_id")
+
+
+def metric(
+    x: np.ndarray,
+    y: np.ndarray,
+    err_weight: float = 3,
+    split_dim: int = one_hot_dim,
+) -> float:
+    """Custom metric for t-SNE/UMAP that weights the error dimension higher by a factor
+    of err_weight than the composition dimensions.
+    """
+    x_comp, x_err = np.split(x, [split_dim])
+    y_comp, y_err = np.split(y, [split_dim])
+    return np.linalg.norm(x_comp - y_comp) + err_weight * np.linalg.norm(x_err - y_err)
+
+
+if projection_type == "tsne":
+    from sklearn.manifold import TSNE
+
+    projector = TSNE(
+        n_components=out_dim, random_state=0, n_iter=250, n_iter_without_progress=50
+    )
+    out_cols = [f"t-SNE {idx}" for idx in range(out_dim)]
+elif projection_type == "umap":
+    from umap import UMAP
+
+    # TODO this execution path is untested (was never run yet)
+    projector = UMAP(n_components=out_dim, random_state=0, metric=metric)
+    out_cols = [f"t-SNE {idx+1}" for idx in range(out_dim)]
+
+identity = np.eye(one_hot_dim)
+
+
+def sum_one_hot_elem(formula: str) -> np.ndarray[Any, np.int64]:
+    """Return sum of one-hot encoded elements in weighted by amount in composition."""
+    return sum(identity[el.Z - 1] * amt for el, amt in Composition(formula).items())
+
+
+in_col = {"wbm": "formula", "mp": "formula_pretty"}[data_name]
+df_in[f"one_hot_{one_hot_dim}"] = [
+    sum_one_hot_elem(formula) for formula in tqdm(df_in[in_col])
+]
+
+
+one_hot_encoding = np.array(df_in[f"one_hot_{one_hot_dim}"].to_list())
+projections = projector.fit_transform(one_hot_encoding)
+
+df_in[out_cols] = projections
+
+out_path = f"{out_dir}/one-hot-{one_hot_dim}-composition-{out_dim}d.csv"
+df_in[out_cols].to_csv(out_path)
+
+print(f"Wrote projections to {out_path!r}")

scripts/compute_struct_fingerprints.py (+1 -3)

@@ -36,12 +36,11 @@
 slurm_array_task_id = int(os.getenv("SLURM_ARRAY_TASK_ID", 0))
 slurm_array_task_count = 100

-job_name = f"make-{data_name}-struct-fingerprints"
 out_dir = f"{ROOT}/data/{data_name}/structure-fingerprints"
 os.makedirs(out_dir, exist_ok=True)

 slurm_vars = slurm_submit(
-    job_name=job_name,
+    job_name=f"{data_name}-struct-fingerprints",
     out_dir=out_dir,
     partition="icelake-himem",
     account="LEE-SL3-CPU",

@@ -118,7 +117,6 @@

 df_out = pd.concat(pd.read_json(out_file) for out_file in tqdm(out_files))

-
 df_out.index.name = "material_id"

 df_out.reset_index().to_json(f"{out_dir}/site-stats.json.gz")
