
Commit 83736ad
fix direct data file download URLs on /contribute page
add tqdm progress bars to upload_to_figshare.py
add prompt to delete new Figshare article if error occurs
use gzip CLI in get_mp_traj.py to check all files for archive corruption
rename compute_projections.py to project_compositions.py
1 parent cc08c78 commit 83736ad

File tree (7 files changed: +130 -47 lines)

- data/figshare/1.0.0.json
- data/mp/get_mp_traj.py
- matbench_discovery/data.py
- scripts/compute_projections.py → scripts/project_compositions.py
- scripts/upload_to_figshare.py
- site/src/routes/contribute/+page.md
- tests/test_data.py

data/figshare/1.0.0.json  +32 -8

@@ -1,10 +1,34 @@
 {
-  "mp_computed_structure_entries": "https://figshare.com/ndownloader/files/40344436",
-  "mp_elemental_ref_entries": "https://figshare.com/ndownloader/files/40387775",
-  "mp_energies": "https://figshare.com/ndownloader/files/40344448",
-  "mp_patched_phase_diagram": "https://figshare.com/ndownloader/files/40344451",
-  "wbm_computed_structure_entries": "https://figshare.com/ndownloader/files/40344463",
-  "wbm_initial_structures": "https://figshare.com/ndownloader/files/40344466",
-  "wbm_cses_plus_init_structs": "https://figshare.com/ndownloader/files/40344469",
-  "wbm_summary": "https://figshare.com/ndownloader/files/40407575"
+  "mp_computed_structure_entries": [
+    "https://figshare.com/ndownloader/files/40344436",
+    "2023-02-07-mp-computed-structure-entries.json.gz"
+  ],
+  "mp_elemental_ref_entries": [
+    "https://figshare.com/ndownloader/files/40387775",
+    "2023-02-07-mp-elemental-reference-entries.json.gz"
+  ],
+  "mp_energies": [
+    "https://figshare.com/ndownloader/files/40344448",
+    "2023-01-10-mp-energies.csv"
+  ],
+  "mp_patched_phase_diagram": [
+    "https://figshare.com/ndownloader/files/40344451",
+    "2023-02-07-ppd-mp.pkl.gz"
+  ],
+  "wbm_computed_structure_entries": [
+    "https://figshare.com/ndownloader/files/40344463",
+    "2022-10-19-wbm-computed-structure-entries.json.bz2"
+  ],
+  "wbm_initial_structures": [
+    "https://figshare.com/ndownloader/files/40344466",
+    "2022-10-19-wbm-init-structs.json.bz2"
+  ],
+  "wbm_cses_plus_init_structs": [
+    "https://figshare.com/ndownloader/files/40344469",
+    "2022-10-19-wbm-computed-structure-entries+init-structs.json.bz2"
+  ],
+  "wbm_summary": [
+    "https://figshare.com/ndownloader/files/40407575",
+    "2022-10-19-wbm-summary.csv"
+  ]
 }
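
Each data key now maps to a [download_url, file_name] pair instead of a bare URL string. A minimal sketch of how downstream code can consume the new format (the relative path and the chosen key are only for illustration, not prescribed by this commit):

    import json
    import urllib.request

    with open("data/figshare/1.0.0.json") as file:
        figshare_urls: dict[str, list[str]] = json.load(file)

    # each value is now a [download_url, original_file_name] pair
    url, file_name = figshare_urls["wbm_summary"]
    print(f"downloading {file_name} from {url}")
    urllib.request.urlretrieve(url, file_name)  # save under the original file name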

data/mp/get_mp_traj.py  +24 -6

@@ -1,14 +1,20 @@
-"""Download all MP ionic steps on 2023-03-15."""
+"""Download all MP ionic steps using direct read-access to the mp_core DB.
+
+Gzipped JSON is ~15GB.
+On a good connection, takes about 15 min per batch * 140 batches = 35 h.
+"""


 # %%
 import os
+import subprocess
+from glob import glob

 import pandas as pd
 from emmet.core.tasks import TaskDoc
 from pymongo import MongoClient
 from pymongo.database import Database
-from tqdm import trange
+from tqdm import tqdm, trange

 from matbench_discovery import ROOT, today

@@ -36,8 +42,8 @@
 fields = "task_id formula_pretty run_type nsites task_type tags completed_at".split()

 if os.path.isfile(ids_path):
-    print(f"Found existing list of task IDs to query at {ids_path=}")
-    df_tasks = pd.read_csv(ids_path).set_index("task_id")
+    print(f"Found existing list of task IDs to query at\n{ids_path=}")
+    df_tasks = pd.read_csv(ids_path, low_memory=False).set_index("task_id")
 else:
     print(f"Querying all task docs from {db_name}\n{fields=}.\nThis takes a while...")
     task_docs = sorted(

@@ -97,6 +103,18 @@


 # %% inspect saved task docs for expected data
-df_10k = pd.read_json(
-    f"{module_dir}/mp-tasks/mp-1708653__mp-1735769.json.gz"
+df_batch = pd.read_json(
+    f"{module_dir}/mp-tasks/mp-531529__mp-568116.json.gz"
 ).set_index("task_id")
+
+print(f"{len(df_batch)=}")
+df_batch.head()
+
+
+# %% use gzip CLI to check all files for archive corruption
+for path in tqdm(glob(f"{module_dir}/mp-tasks/*.json.gz")):
+    try:
+        subprocess.run(["gzip", "-t", path], check=True)
+    except subprocess.CalledProcessError as exc:
+        print(f"{path} raised {exc.stderr}")
+        # os.remove(path)
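
Note on the integrity check above: subprocess.run only populates stderr when output capture is requested, so the print in the except block will show None for a corrupt archive. A sketch of a variant that surfaces gzip's error message (the capture flags and directory path are assumptions for illustration, not part of this commit):

    import subprocess
    from glob import glob

    for path in glob("mp-tasks/*.json.gz"):  # hypothetical directory
        # capture_output=True + text=True make stderr available as a string
        result = subprocess.run(["gzip", "-t", path], capture_output=True, text=True)
        if result.returncode != 0:
            print(f"{path} failed integrity check: {result.stderr.strip()}")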

matbench_discovery/data.py  +1 -1

@@ -92,7 +92,7 @@ def load(

     cache_path = f"{cache_dir}/{file}"
     if not os.path.isfile(cache_path):  # download from Figshare URL
-        url = file_urls[data_key]
+        url = file_urls[data_key][0]
         print(f"Downloading {data_key!r} from {url}")
         try:
             # ensure directory exists

scripts/compute_projections.py → scripts/project_compositions.py  +1 -1

@@ -1,4 +1,4 @@
-"""Compute t-SNE and UMAP projections of the WBM and MP datasets."""
+"""Compute t-SNE or UMAP projections of WBM and MP compositions."""


 # %%

scripts/upload_to_figshare.py  +29 -19

@@ -13,6 +13,7 @@
 import requests
 import tomllib  # needs python 3.11
 from requests.exceptions import HTTPError
+from tqdm.auto import tqdm

 from matbench_discovery import ROOT
 from matbench_discovery.data import DATA_FILES

@@ -21,6 +22,7 @@
 __date__ = "2023-04-27"

 with open(f"{ROOT}/site/.env") as file:
+    # TOKEN: length 128, alphanumeric (e.g. 271431c6a94ff7...)
     TOKEN = file.read().split("figshare_token=")[1].split("\n")[0]

 BASE_URL = "https://api.figshare.com/v2"

@@ -51,7 +53,7 @@

 def make_request(method: str, url: str, data: Any = None, binary: bool = False) -> Any:
     """Make a token-authorized HTTP request to the Figshare API."""
-    headers = {"Authorization": "token " + TOKEN}
+    headers = {"Authorization": f"token {TOKEN}"}
     if data is not None and not binary:
         data = json.dumps(data)
     response = requests.request(method, url, headers=headers, data=data)

@@ -95,22 +97,20 @@ def upload_file_to_figshare(article_id: int, file_path: str) -> int:
     data = dict(name=os.path.basename(file_path), md5=md5, size=size)
     endpoint = f"{BASE_URL}/account/articles/{article_id}/files"
     result = make_request("POST", endpoint, data=data)
-    print(f"Initiated file upload: {result['location']}\n")
     file_info = make_request("GET", result["location"])

     # Upload parts
     url = file_info["upload_url"]
     result = make_request("GET", url)
     with open(file_path, "rb") as file:
-        for part in result["parts"]:
+        for part in tqdm(result["parts"], desc=file_path):
             # Upload part
             u_data = file_info.copy()
             u_data.update(part)
-            url = f'{u_data["upload_url"]}/{part["partNo"]}'
+            url = f"{u_data['upload_url']}/{part['partNo']}"
             file.seek(part["startOffset"])
             chunk = file.read(part["endOffset"] - part["startOffset"] + 1)
             make_request("PUT", url, data=chunk, binary=True)
-            print(f'\tUploaded part {part["partNo"]}')

     # Complete upload
     make_request("POST", f"{endpoint}/{file_info['id']}")

@@ -127,20 +127,30 @@ def main() -> int:
         "categories": list(CATEGORIES),
         "references": REFERENCES,
     }
-    article_id = create_article(metadata)
-    uploaded_files: dict[str, str] = {}
-    for key, file_path in DATA_FILES.items():
-        file_id = upload_file_to_figshare(article_id, file_path)
-        file_url = f"https://figshare.com/ndownloader/files/{file_id}"
-        uploaded_files[key] = file_url
-
-    print("\nUploaded files:")
-    for file_path, file_url in uploaded_files.items():
-        print(f"{file_path}: {file_url}")
-
-    # write to JSON file
-    with open(file_urls_out_path, "w") as file:
-        json.dump(uploaded_files, file)
+    try:
+        article_id = create_article(metadata)
+        uploaded_files: dict[str, tuple[str, str]] = {}
+        pbar = tqdm(DATA_FILES.items(), desc="Uploading to Figshare")
+        for key, file_path in pbar:
+            pbar.set_postfix(file=key)
+            file_id = upload_file_to_figshare(article_id, file_path)
+            file_url = f"https://figshare.com/ndownloader/files/{file_id}"
+            uploaded_files[key] = (file_url, file_path.split("/")[-1])
+
+        print("\nUploaded files:")
+        for file_path, (file_url, _) in uploaded_files.items():
+            print(f"{file_path}: {file_url}")
+
+        # write uploaded file keys mapped to their URLs to JSON
+        with open(file_urls_out_path, "w") as file:
+            json.dump(uploaded_files, file)
+    except Exception as exc:  # prompt to delete article if something went wrong
+        answer = ""
+        print(f"Encountered {exc=}")
+        while answer not in ("y", "n"):
+            answer = input("Delete article? [y/n] ")
+        if answer == "y":
+            make_request("DELETE", f"{BASE_URL}/account/articles/{article_id}")

     return 0
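
upload_file_to_figshare initiates each upload with the file's md5 and size, a computation that sits outside this diff. A minimal sketch of how it could be done (helper name and chunk size are my own assumptions, not taken from the script):

    import hashlib
    import os

    def get_file_hash_and_size(file_path: str, chunk_size: int = 10_000_000) -> tuple[str, int]:
        """Return a file's MD5 checksum (hex digest) and size in bytes, reading in chunks."""
        md5 = hashlib.md5()
        with open(file_path, "rb") as file:
            while chunk := file.read(chunk_size):
                md5.update(chunk)
        return md5.hexdigest(), os.path.getsize(file_path)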

site/src/routes/contribute/+page.md  +41 -10

@@ -1,5 +1,32 @@
 <script>
-  import { name, repository as repo, homepage } from "$site/package.json";
+  import { name, repository as repo, homepage } from "$site/package.json"
+  import figshare_urls from "$root/data/figshare/1.0.0.json"
+  import { Tooltip } from 'svelte-zoo'
+
+  const ppd_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/analysis/phase_diagram.py#L1480-L1814`
+  const ppd_link = `<a href=${ppd_doc_url}>PatchedPhaseDiagram</a>`
+  const cse_doc_url = `https://github.com/materialsproject/pymatgen/blob/v2023.5.10/pymatgen/entries/computed_entries.py#L579-L722`
+  const cse_link = `<a href=${cse_doc_url}>ComputedStructureEntry</a>`
+
+  const descriptions = {
+    mp_computed_structure_entries:
+      `JSON-Serialized MP ${cse_link} objects containing relaxed structures and DFT final energies`,
+    mp_elemental_ref_entries: `Minimum energy ComputedEntry for each element in MP`,
+    mp_energies: `Materials Project formation energies and energies above convex hull`,
+    mp_patched_phase_diagram:
+      `${ppd_link} constructed from all MP ComputedStructureEntries`,
+    wbm_computed_structure_entries: `Materials Project computed structure entries`,
+    wbm_initial_structures: `Unrelaxed WBM structures`,
+    wbm_cses_plus_init_structs: `Both unrelaxed and DFT-relaxed WBM structures, the latter stored with their final VASP energies as ${cse_link}`,
+    wbm_summary:
+      `Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.`,
+  }
+  const desc_keys = Object.keys(descriptions).sort()
+  const figshare_keys = Object.keys(figshare_urls).sort()
+  const missing = figshare_keys.filter((key) => !desc_keys.includes(key))
+  if (missing.length > 0) {
+    console.error(`descriptions must contain all figshare_urls keys, missing=${missing}`)
+  }
 </script>

 # How to contribute

@@ -75,15 +102,19 @@ assert list(df_wbm) == [

 ## 📥 &thinsp; Direct Download

-You can also download the data files directly from GitHub:
-
-1. [`2022-10-19-wbm-summary.csv`]({repo}/blob/-/data/wbm/2022-10-19-wbm-summary.csv): Computed material properties only, no structures. Available properties are VASP energy, formation energy, energy above the convex hull, volume, band gap, number of sites per unit cell, and more.
-1. [`2022-10-19-wbm-init-structs.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-init-structs.json): Unrelaxed WBM structures
-1. [`2022-10-19-wbm-cses.json`]({repo}/blob/-/data/wbm/2022-10-19-wbm-cses.json): Relaxed WBM structures along with final VASP energies
-1. [`2023-01-10-mp-energies.json.gz`]({repo}/blob/-/data/mp/2023-01-10-mp-energies.json.gz): Materials Project formation energies and energies above convex hull
-1. [`2023-02-07-mp-computed-structure-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-computed-structure-entries.json.gz): Materials Project computed structure entries
-1. [`2023-02-07-ppd-mp.pkl.gz`]({repo}/blob/-/data/mp/2023-02-07-ppd-mp.pkl.gz): [PatchedPhaseDiagram](https://pymatgen.org/pymatgen.analysis.phase_diagram.html#pymatgen.analysis.phase_diagram.PatchedPhaseDiagram) constructed from all MP ComputedStructureEntries
-1. [`2023-02-07-mp-elemental-reference-entries.json.gz`]({repo}/blob/-/data/mp/2023-02-07-mp-elemental-reference-entries.json.gz): Minimum energy PDEntries for each element present in the Materials Project
+You can also download the data files directly from Figshare:
+
+<ol>
+  {#each Object.entries(figshare_urls) as [key, lst]}
+    {@const [href, file_name] = lst}
+    <li>
+      <Tooltip text={file_name}>
+        <a {href}>{key}</a>:
+      </Tooltip>
+      {@html descriptions[key]}
+    </li>
+  {/each}
+</ol>

 [wbm paper]: https://nature.com/articles/s41524-020-00481-6

tests/test_data.py  +2 -2

@@ -67,7 +67,7 @@ def test_load(

     stdout, _stderr = capsys.readouterr()

-    assert f"Downloading {data_key!r} from {figshare_urls[data_key]}" in stdout
+    assert f"Downloading {data_key!r} from {figshare_urls[data_key][0]}" in stdout

     # check we called read_csv/read_json once for each data_name
     assert urlretrieve.call_count == 1

@@ -172,7 +172,7 @@ def test_load_no_mock(
     rel_path = getattr(type(DATA_FILES), file_key)
     cache_path = f"{tmp_path}/{rel_path}"
     assert (
-        f"Downloading {file_key!r} from {figshare_urls[file_key]}\nCached "
+        f"Downloading {file_key!r} from {figshare_urls[file_key][0]}\nCached "
         f"{file_key!r} to {cache_path!r}" in stdout
     )

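Beyond updating the existing assertions, the new (url, file_name) pairs could also be validated directly. A hedged sketch of such a test (file path, test name, and the extension list are assumptions based on the 1.0.0.json contents above, not part of this commit):

    import json

    import pytest

    with open("data/figshare/1.0.0.json") as file:
        figshare_urls = json.load(file)

    @pytest.mark.parametrize("data_key", figshare_urls)
    def test_figshare_url_file_name_pair(data_key: str) -> None:
        url, file_name = figshare_urls[data_key]  # each entry is a [url, file_name] pair
        assert url.startswith("https://figshare.com/ndownloader/files/")
        assert file_name.endswith((".csv", ".json.gz", ".json.bz2", ".pkl.gz"))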