rename data/wbm/(analysis->eda).py

janosh · janosh · commit 0f2410de1d61 · 2023-06-19T20:29:25.000-07:00
rename scripts/(analyze_failure_cases->analyze_model_failure_cases).py
diff --git a/.gitignore b/.gitignore
@@ -11,6 +11,7 @@ __pycache__
 *.csv.bz2
 *.pkl.gz
 data/**/raw
+data/**/tsne
 data/2022-*
 data/m3gnet-*
 
diff --git a/data/wbm/eda.py b/data/wbm/eda.py
@@ -3,21 +3,26 @@
 
 import numpy as np
 import pandas as pd
+import plotly.express as px
 from pymatgen.core import Composition
 from pymatviz import count_elements, ptable_heatmap_plotly
 from pymatviz.utils import save_fig
 
 from matbench_discovery import FIGS, ROOT, today
+from matbench_discovery import plots as plots
 from matbench_discovery.data import DATA_FILES, df_wbm
 from matbench_discovery.energy import mp_elem_reference_entries
-from matbench_discovery.plots import pio
+from matbench_discovery.preds import df_each_err, each_true_col
+
+__author__ = "Janosh Riebesell"
+__date__ = "2023-03-30"
 
 """
-Compare MP and WBM elemental prevalence. Starting with WBM, MP below.
+WBM exploratory data analysis.
+Start with comparing MP and WBM elemental prevalence.
 """
 
 module_dir = os.path.dirname(__file__)
-print(f"{pio.templates.default=}")
 about_data_page = f"{ROOT}/site/src/routes/about-the-data"
 
 
@@ -170,3 +175,61 @@
 fig.show()
 
 save_fig(fig, f"{FIGS}/mp-elemental-ref-energies.svelte")
+
+
+# %% plot 2d and 3d t-SNE projections of one-hot encoded element vectors summed by
+# weight in each WBM composition. TL;DR: no obvious structure in the data.
+# The hope was to find clusters with higher or lower errors, after seeing many
+# models struggle on the halogens in the per-element error periodic table heatmaps
+# at https://matbench-discovery.janosh.dev/models
+df_2d_tsne = pd.read_csv(f"{module_dir}/tsne/one-hot-112-composition-2d.csv.gz")
+df_2d_tsne = df_2d_tsne.set_index("material_id")
+
+# load only the 3d projection file that is actually used below (tagged with model)
+model = "Wrenformer"
+df_3d_tsne = pd.read_csv(
+    f"{module_dir}/tsne/one-hot-112-composition+{model}-each-err-3d-metric=eucl.csv.gz"
+)
+df_3d_tsne = df_3d_tsne.set_index("material_id")
+
+df_wbm[list(df_2d_tsne)] = df_2d_tsne
+df_wbm[list(df_3d_tsne)] = df_3d_tsne
+df_wbm[list(df_each_err.add_suffix(" abs EACH error"))] = df_each_err.abs()
+
+
+# %% color points by the model's absolute error, capping the scale at mean + 1 std
+color_col = f"{model} abs EACH error"
+clr_range_max = df_wbm[color_col].mean() + df_wbm[color_col].std()
+
+
+# %% 2d t-SNE scatter plot colored by model error
+fig = px.scatter(
+    df_wbm,
+    x="2d t-SNE 1",
+    y="2d t-SNE 2",
+    color=color_col,
+    hover_name="material_id",
+    hover_data=("formula", each_true_col),
+    range_color=(0, clr_range_max),
+)
+fig.show()
+
+
+# %% 3d t-SNE scatter plot colored by model error, with a hand-written hover tooltip
+fig = px.scatter_3d(
+    df_wbm,
+    x="3d t-SNE 1",
+    y="3d t-SNE 2",
+    z="3d t-SNE 3",
+    color=color_col,
+    custom_data=["material_id", "formula", each_true_col, color_col],
+    range_color=(0, clr_range_max),
+)
+fig.data[0].hovertemplate = (  # customdata[i] indexes the custom_data list above
+    "<b>material_id: %{customdata[0]}</b><br><br>"
+    "t-SNE: (%{x:.2f}, %{y:.2f}, %{z:.2f})<br>"
+    "Formula: %{customdata[1]}<br>"
+    "E<sub>above hull</sub>: %{customdata[2]:.2f}<br>"
+    f"{color_col}: %{{customdata[3]:.2f}}<br>"  # {{ }} -> literal { } for plotly
+)
+fig.show()
diff --git a/scripts/analyze_model_failure_cases.py b/scripts/analyze_model_failure_cases.py
diff --git a/scripts/compute_projections.py b/scripts/compute_projections.py
@@ -3,6 +3,7 @@
 
 # %%
 import os
+from datetime import datetime
 from typing import Any, Literal
 
 import numpy as np
@@ -38,6 +39,8 @@
 print(f"{data_path=}")
 print(f"{out_dim=}")
 print(f"{projection_type=}")
+start_time = datetime.now()
+print(f"job started at {start_time:%Y-%m-%d %H:%M:%S}")
 df_in = pd.read_csv(data_path, na_filter=False).set_index("material_id")
 
 
@@ -61,13 +64,13 @@ def metric(
     projector = TSNE(
         n_components=out_dim, random_state=0, n_iter=250, n_iter_without_progress=50
     )
-    out_cols = [f"t-SNE {idx}" for idx in range(out_dim)]
+    out_cols = [f"{out_dim}d t-SNE {idx + 1}" for idx in range(out_dim)]
 elif projection_type == "umap":
     from umap import UMAP
 
     # TODO this execution path is untested (was never run yet)
     projector = UMAP(n_components=out_dim, random_state=0, metric=metric)
-    out_cols = [f"t-SNE {idx+1}" for idx in range(out_dim)]
+    out_cols = [f"{out_dim}d UMAP {idx + 1}" for idx in range(out_dim)]
 
 identity = np.eye(one_hot_dim)
 
@@ -78,17 +81,20 @@ def sum_one_hot_elem(formula: str) -> np.ndarray[Any, np.int64]:
 
 
 in_col = {"wbm": "formula", "mp": "formula_pretty"}[data_name]
-df_in[f"one_hot_{one_hot_dim}"] = [
-    sum_one_hot_elem(formula) for formula in tqdm(df_in[in_col])
-]
-
+one_hot_encoding = np.array(
+    [sum_one_hot_elem(formula) for formula in tqdm(df_in[in_col])]
+)
 
-one_hot_encoding = np.array(df_in[f"one_hot_{one_hot_dim}"].to_list())
 projections = projector.fit_transform(one_hot_encoding)
 
 df_in[out_cols] = projections
 
-out_path = f"{out_dir}/one-hot-{one_hot_dim}-composition-{out_dim}d.csv"
+out_path = f"{out_dir}/one-hot-{one_hot_dim}-composition-{out_dim}d.csv.gz"
 df_in[out_cols].to_csv(out_path)
 
 print(f"Wrote projections to {out_path!r}")
+end_time = datetime.now()
+print(
+    f"Job finished at {end_time:%Y-%m-%d %H:%M:%S} and took "
+    f"{int((end_time - start_time).total_seconds())} sec"  # .seconds drops days
+)
diff --git a/scripts/make_api_docs.py b/scripts/make_api_docs.py
@@ -6,12 +6,12 @@
 
 from lazydocs import generate_docs
 
-from matbench_discovery import ROOT
+SITE = f"{os.path.dirname(__file__)}/../site"
 
-with open(f"{ROOT}/site/package.json") as file:
+with open(f"{SITE}/package.json") as file:
     pkg = json.load(file)  # get repo URL from package.json
 
-out_path = f"{ROOT}/site/src/routes/api"
+out_path = f"{SITE}/src/routes/api"
 
 for path in glob(f"{out_path}/*.md"):
     os.remove(path)
diff --git a/site/package.json b/site/package.json
@@ -13,8 +13,7 @@
     "build": "vite build",
     "preview": "vite preview",
     "serve": "vite build && vite preview",
-    "check": "svelte-check",
-    "make-api-docs": "cd .. && python scripts/make_api_docs.py"
+    "check": "svelte-check"
   },
   "devDependencies": {
     "@iconify/svelte": "^3.1.0",
diff --git a/site/src/app.css b/site/src/app.css
@@ -15,6 +15,7 @@
   --toc-active-border-width: 0 0 0 1pt;
   --toc-active-bg: none;
   --toc-active-border-radius: 0;
+  --toc-max-height: 85vh;
 
   --zoo-github-corner-color: var(--night);
   --zoo-github-corner-bg: white;