add kwarg intersect_lines: str | Sequence[str] = () to precision_recall_vs_calc_count()

janosh · janosh · commit 39fa65a7422f · 2023-06-19T20:29:21.000-07:00
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,6 +20,7 @@ repos:
     rev: 4.0.1
     hooks:
       - id: flake8
+        additional_dependencies: [flake8-bugbear]
 
   - repo: https://github.com/asottile/pyupgrade
     rev: v2.34.0
@@ -56,13 +57,7 @@ repos:
         stages: [commit, commit-msg]
         exclude_types: [csv, html, json]
 
-  - repo: https://github.com/myint/autoflake
+  - repo: https://github.com/PyCQA/autoflake
     rev: v1.4
     hooks:
       - id: autoflake
-        args:
-          - --in-place
-          - --remove-unused-variables
-          - --remove-all-unused-imports
-          - --expand-star-imports
-          - --ignore-init-module-imports
diff --git a/mb_discovery/m3gnet/eda_wbm_pre_vs_post_m3gnet_relaxation.py b/mb_discovery/m3gnet/eda_wbm_pre_vs_post_m3gnet_relaxation.py
@@ -193,9 +193,11 @@
 df_m3gnet_is2re["m3gnet_energy_rs2re"] = df_m3gnet_rs2re.m3gnet_energy
 
 for task_type in ["is2re", "rs2re"]:
-    e_per_atom = df_m3gnet_is2re[f"m3gnet_energy_{task_type}"] / df_m3gnet_is2re.n_sites
+    energy_per_atom = (
+        df_m3gnet_is2re[f"m3gnet_energy_{task_type}"] / df_m3gnet_is2re.n_sites
+    )
 
-    df_m3gnet_is2re[f"e_m3gnet_per_atom_{task_type}"] = e_per_atom
+    df_m3gnet_is2re[f"e_m3gnet_per_atom_{task_type}"] = energy_per_atom
 
 fig = px.scatter(
     df_m3gnet_is2re,
diff --git a/mb_discovery/plot_scripts/plot_funcs.py b/mb_discovery/plot_scripts/plot_funcs.py
@@ -261,9 +261,30 @@ def precision_recall_vs_calc_count(
     # in eV / atom, usually 0 or 0.1 eV
     ax: plt.Axes = None,
     label: str = None,
+    intersect_lines: str | Sequence[str] = (),
     **kwargs: Any,
 ) -> plt.Axes:
-    """Precision and recall as a function of the number of calculations performed."""
+    """Precision and recall as a function of the number of calculations performed.
+
+    Args:
+        df (pd.DataFrame): Model predictions and target energy values.
+        residual_col (str, optional): Column name with residuals of model predictions,
+            i.e. residual = pred - target. Defaults to "residual".
+        e_above_hull_col (str, optional): Column name with convex hull distance values.
+            Defaults to "e_above_hull".
+        criterion (Literal['energy', 'std', 'neg_std'], optional): Whether to use
+            energy, energy+model_std, or energy-model_std as stability criterion.
+            Defaults to "energy".
+        stability_thresh (float, optional): Max distance from convex hull before
+            material is considered unstable. Defaults to 0.
+        label (str, optional): Model name used to identify its lines in the legend.
+            Defaults to None.
+        intersect_lines (str | Sequence[str], optional): precision_{x,y,xy} and/or
+            recall_{x,y,xy}, or "all" for both _xy lines. Defaults to (), i.e. none.
+
+    Returns:
+        plt.Axes: The matplotlib axes object.
+    """
     if ax is None:
         ax = plt.gca()
 
@@ -315,7 +336,7 @@ def precision_recall_vs_calc_count(
     rolling_recall_curve = scipy.interpolate.interp1d(xs, tpr[:end], kind="cubic")
 
     line_kwargs = dict(
-        linewidth=3,
+        linewidth=4,
         markevery=[-1],
         marker="x",
         markersize=14,
@@ -326,6 +347,22 @@ def precision_recall_vs_calc_count(
     ax.plot(xs, rolling_recall_curve(xs), linestyle=":", **line_kwargs)
     ax.plot((0, 0), (0, 0), label=label, **line_kwargs)
 
+    if intersect_lines == "all":
+        intersect_lines = ("precision_xy", "recall_xy")
+    for line_name in intersect_lines:
+        y_func = dict(
+            precision=precision_curve,
+            recall=rolling_recall_curve,
+        )[line_name.split("_")[0]]
+        intersect_kwargs = dict(
+            linestyle=":", alpha=0.4, color=kwargs.get("color", "gray")
+        )
+        # Add some visual guidelines
+        if "x" in line_name:
+            ax.plot((0, xs[-1]), (y_func(xs[-1]), y_func(xs[-1])), **intersect_kwargs)
+        if "y" in line_name:
+            ax.plot((xs[-1], xs[-1]), (0, y_func(xs[-1])), **intersect_kwargs)
+
     if not is_fresh_ax:
         # return earlier if all plot objects besides the line were already drawn by a
         # previous call
diff --git a/mb_discovery/plot_scripts/precision_recall_vs_calc_count.py b/mb_discovery/plot_scripts/precision_recall_vs_calc_count.py
@@ -23,8 +23,8 @@
 df_hull = pd.read_csv(
     f"{ROOT}/data/2022-06-11-from-rhys/wbm-e-above-mp-hull.csv"
 ).set_index("material_id")
-dfs: dict[str, pd.DataFrame] = {}
 
+dfs: dict[str, pd.DataFrame] = {}
 for model_name in ("Wren", "CGCNN", "Voronoi"):
     dfs[model_name] = pd.read_csv(
         f"{ROOT}/data/2022-06-11-from-rhys/{model_name}-mp-initial-structures.csv"
@@ -34,17 +34,21 @@
 #     f"{ROOT}/data/2022-08-16-m3gnet-wbm-relax-results-IS2RE.json.gz"
 # ).set_index("material_id")
 
-# dfs["Wrenformer"] = pd.read_csv(
-#     f"{ROOT}/data/2022-08-16-wrenformer-ensemble-predictions.csv.bz2"
-# ).set_index("material_id")
+dfs["Wrenformer"] = pd.read_csv(
+    f"{ROOT}/data/2022-08-16-wrenformer-ensemble-predictions.csv.bz2"
+).set_index("material_id")
+
 
-# dfs["Wrenformer"]["e_form_target"] = dfs["Wren"]["e_form_target"]
-# dfs["M3GNet"]["e_form_target"] = dfs["Wren"]["e_form_target"]
+# download wbm-steps-summary.csv (23.31 MB)
+df_summary = pd.read_csv(
+    "https://figshare.com/ndownloader/files/36714216?private_link=ff0ad14505f9624f0c05"
+).set_index("material_id")
 
 
 # %%
 for (model_name, df), color in zip(
-    dfs.items(), ("tab:blue", "tab:orange", "teal", "tab:pink", "black")
+    dfs.items(),
+    ("tab:blue", "tab:orange", "teal", "tab:pink", "black", "red", "turquoise"),
 ):
     df["e_above_mp_hull"] = df_hull.e_above_mp_hull
 
@@ -66,9 +70,11 @@
         if model_name == "M3GNet":
             model_preds = df.e_form_m3gnet
             targets = df.e_form_wbm
-        elif model_name == "Wrenformer":
-            model_preds = df.e_form_pred_ens
-            targets = df.e_form
+        elif "Wrenformer" in model_name:
+            df["e_form_per_atom_pred_ens"] = df.e_form_pred_ens / df.n_sites
+            df["e_form_per_atom"] = df.e_form / df.n_sites
+            model_preds = df.e_form_per_atom_pred_ens
+            targets = df.e_form_per_atom
         elif df.filter(regex=r"_pred_\d").shape[1] > 1:
             assert df.filter(regex=r"_pred_\d").shape[1] == 10
             model_preds = df.filter(regex=r"_pred_\d").mean(axis=1)
@@ -89,10 +95,11 @@
         e_above_hull_col="e_above_mp_hull",
         color=color,
         label=model_name,
+        intersect_lines="recall_xy",
+        # intersect_lines="all",
     )
 
-model_legend = ax.legend(frameon=False, loc="lower right")
-ax.add_artist(model_legend)
+ax.legend(frameon=False, loc="lower right")
 
 ax.figure.set_size_inches(10, 9)
 
diff --git a/readme.md b/readme.md
@@ -1 +1 @@
-# ML Stability
+# Matbench Discovery