| 1 | +"""Download all MP ionic steps on 2023-03-15.""" |
| 2 | + |
| 3 | + |
| 4 | +# %% |
| 5 | +import os |
| 6 | + |
| 7 | +import pandas as pd |
| 8 | +from emmet.core.tasks import TaskDoc |
| 9 | +from pymongo import MongoClient |
| 10 | +from pymongo.database import Database |
| 11 | +from tqdm import trange |
| 12 | + |
| 13 | +from matbench_discovery import ROOT, today |
| 14 | + |
| 15 | +__author__ = "Janosh Riebesell" |
| 16 | +__date__ = "2023-03-15" |
| 17 | + |
| 18 | +module_dir = os.path.dirname(__file__) |


# %% access mp_core database directly through pymongo instead of the API for speed
host = "knowhere.lbl.gov"
db_name = "mp_core"

# parse MongoDB credentials from .env (expects user=... and password=... lines)
with open(f"{ROOT}/site/.env") as file:
    text = file.read()
    user = text.split("user=")[1].split("\n")[0]
    password = text.split("password=")[1].split("\n")[0]

uri = f"mongodb://{user}:{password}@{host}/?authSource={db_name}"
db: Database[TaskDoc] = MongoClient(uri)[db_name]
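
# optional connection sanity check (a sketch, not part of the original script): make
# sure the tasks collection is reachable before the long-running queries below
print(f"{db['tasks'].estimated_document_count():,} docs in {db_name}.tasks")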


# %%
ids_path = f"{module_dir}/2023-03-15-mp-task-ids.csv.bz2"
fields = "task_id formula_pretty run_type nsites task_type tags completed_at".split()

if os.path.isfile(ids_path):
    print(f"Found existing list of task IDs to query at {ids_path=}")
    df_tasks = pd.read_csv(ids_path).set_index("task_id")
else:
    print(f"Querying all task docs from {db_name}\n{fields=}.\nThis takes a while...")
    task_docs = sorted(
        db["tasks"].find({}, fields), key=lambda doc: int(doc["task_id"].split("-")[1])
    )

    print(f"{today}: {len(task_docs) = :,}")

    df_tasks = pd.DataFrame(task_docs).drop(columns=["_id"]).set_index("task_id")
    df_tasks.task_type.value_counts(dropna=False).plot.pie()

    df_tasks.to_csv(f"{module_dir}/{today}-mp-task-ids.csv.bz2")
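
# quick index sanity check (a sketch; assumes every ID has the <prefix>-<number> form
# used by the sort key above): task IDs should be ordered by numeric suffix,
# e.g. mp-9 before mp-10, not lexicographically
id_numbers = df_tasks.index.map(lambda tid: int(tid.split("-")[1]))
assert id_numbers.is_monotonic_increasing, "task IDs not sorted by numeric suffix"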


# %% inspect schema of a single task doc
doc = db.tasks.find_one({"task_id": "mp-288"})
# the most relevant task data lives in the 1st calc's ionic steps, i.e. the
# relaxation trajectory frames with the highest rate of change:
# doc["calcs_reversed"][-1]["output"]["ionic_steps"]
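
# sketch of pulling those frames out of the doc fetched above (assumes mp-288 has at
# least one calc with ionic steps recorded)
if doc and doc.get("calcs_reversed"):
    ionic_steps = doc["calcs_reversed"][-1]["output"]["ionic_steps"]
    print(f"{doc['task_id']}: {len(ionic_steps)} ionic steps in 1st calc")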


# %%
batch_size = 10_000
task_ids = df_tasks.index.tolist()

os.makedirs(f"{module_dir}/mp-tasks", exist_ok=True)
# iterate over task_ids in batches
desc = "Loading MP task docs"
pbar = trange(0, len(task_ids), batch_size, desc=desc, unit_scale=batch_size)
for start_idx in pbar:
    # define start and end task IDs for this batch
    end_idx = min(start_idx + batch_size, len(task_ids))
    start_id = task_ids[start_idx]
    end_id = task_ids[end_idx - 1]
    batch_ids = task_ids[start_idx:end_idx]
    pbar.set_postfix_str(f"{start_id} to {end_id}")

    out_path = f"{module_dir}/mp-tasks/{start_id}__{end_id}.json.gz"

    # skip batch if its output file already exists
    if os.path.isfile(out_path):
        continue

    # query batch of task docs
    batch_docs = list(
        db["tasks"].find(
            {"task_id": {"$in": batch_ids}},
            [*fields, "calcs_reversed.output.ionic_steps"],
        )
    )

    # convert documents to DataFrame and save to file
    df_batch = pd.DataFrame(batch_docs).set_index("task_id").drop(columns=["_id"])
    # default_handler=str needed since MongoDB ObjectId is not JSON serializable
    df_batch.reset_index().to_json(out_path, default_handler=str)
    # don't accumulate df_batch across iterations to save memory

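
# %% sketch (not part of the original download flow): re-assemble all saved batches
# into one DataFrame, assuming every file in mp-tasks/ follows the
# {start_id}__{end_id}.json.gz naming used above
from glob import glob

batch_paths = sorted(glob(f"{module_dir}/mp-tasks/*.json.gz"))
df_all = pd.concat(pd.read_json(path).set_index("task_id") for path in batch_paths)
print(f"{len(df_all):,} task docs re-assembled from {len(batch_paths)} batch files")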


# %% inspect saved task docs for expected data
df_10k = pd.read_json(
    f"{module_dir}/mp-tasks/mp-1708653__mp-1735769.json.gz"
).set_index("task_id")
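
# sanity check (a sketch; assumes the calcs_reversed column survives the JSON round
# trip as lists of dicts): count ionic steps per task in this batch
n_steps = df_10k.calcs_reversed.map(
    lambda calcs: len(calcs[-1]["output"]["ionic_steps"])
)
print(f"{len(df_10k):,} tasks, {n_steps.mean():.1f} ionic steps per task on average")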