Skip to content

Commit e097290

Browse files
authored
Use datasets from GitHub instead of CloudFront. (#3427)
* Use datasets from GitHub instead of CloudFront. * Fix download of MNIST datasets from GitHub.
1 parent 6b5173c commit e097290

File tree

10 files changed

+68
-43
lines changed

10 files changed

+68
-43
lines changed

examples/baseball.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
"""
5353

5454
logging.basicConfig(format="%(message)s", level=logging.INFO)
55-
DATA_URL = "https://d2hg8soec8ck9v.cloudfront.net/datasets/EfronMorrisBB.txt"
55+
DATA_URL = "https://github.com/pyro-ppl/datasets/blob/master/EfronMorrisBB.txt?raw=true"
5656

5757

5858
# ===================================

examples/mixed_hmm/seal_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def download_seal_data(filename):
1414
"""download the preprocessed seal data and save it to filename"""
15-
url = "https://d2hg8soec8ck9v.cloudfront.net/datasets/prep_seal_data.csv"
15+
url = "https://github.com/pyro-ppl/datasets/blob/master/prep_seal_data.csv?raw=true"
1616
with open(filename, "wb") as f:
1717
f.write(urlopen(url).read())
1818

examples/sparse_gamma_def.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ def main(args):
216216
raise
217217
pass
218218
wget.download(
219-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/faces_training.csv",
219+
"https://github.com/pyro-ppl/datasets/blob/master/faces_training.csv?raw=true",
220220
dataset_path,
221221
)
222222
data = torch.tensor(np.loadtxt(dataset_path, delimiter=",")).float()

pyro/contrib/examples/finance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
DATA = get_data_directory(__file__)
1212

1313
# https://finance.yahoo.com/quote/%5EGSPC/history/
14-
CACHE_URL = "https://d2hg8soec8ck9v.cloudfront.net/datasets/snp500.csv.bz2"
14+
CACHE_URL = "https://github.com/pyro-ppl/datasets/blob/master/snp500.csv.bz2?raw=true"
1515

1616

1717
def load_snp500():

pyro/contrib/examples/polyphonic_data_loader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,25 +31,25 @@
3131

3232
JSB_CHORALES = dset(
3333
"jsb_chorales",
34-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/jsb_chorales.pickle",
34+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/jsb_chorales.pickle?raw=true",
3535
"jsb_chorales.pkl",
3636
)
3737

3838
PIANO_MIDI = dset(
3939
"piano_midi",
40-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/piano_midi.pickle",
40+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/piano_midi.pickle?raw=true",
4141
"piano_midi.pkl",
4242
)
4343

4444
MUSE_DATA = dset(
4545
"muse_data",
46-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/muse_data.pickle",
46+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/muse_data.pickle?raw=true",
4747
"muse_data.pkl",
4848
)
4949

5050
NOTTINGHAM = dset(
5151
"nottingham",
52-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/nottingham.pickle",
52+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/nottingham.pickle?raw=true",
5353
"nottingham.pkl",
5454
)
5555

pyro/contrib/examples/util.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,34 @@
1010

1111

1212
class MNIST(datasets.MNIST):
13-
mirrors = [
14-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/mnist/"
15-
] + datasets.MNIST.mirrors
13+
mirrors = ["https://github.com/pyro-ppl/datasets/blob/master/mnist/"]
14+
15+
def download(self) -> None:
16+
"""Download the MNIST data if it doesn't exist already."""
17+
18+
if self._check_exists():
19+
return
20+
21+
os.makedirs(self.raw_folder, exist_ok=True)
22+
23+
# download files
24+
for filename, md5 in self.resources:
25+
errors = []
26+
for mirror in self.mirrors:
27+
url = f"{mirror}{filename}?raw=true"
28+
try:
29+
datasets.utils.download_and_extract_archive(
30+
url, download_root=self.raw_folder, filename=filename, md5=md5
31+
)
32+
except datasets.URLError as e:
33+
errors.append(e)
34+
continue
35+
break
36+
else:
37+
s = f"Error downloading {filename}:\n"
38+
for mirror, err in zip(self.mirrors, errors):
39+
s += f"Tried {mirror}, got:\n{str(err)}\n"
40+
raise RuntimeError(s)
1641

1742

1843
def get_data_loader(

tutorial/source/bayesian_regression.ipynb

Lines changed: 8 additions & 8 deletions
Large diffs are not rendered by default.

tutorial/source/bayesian_regression_ii.ipynb

Lines changed: 7 additions & 7 deletions
Large diffs are not rendered by default.

tutorial/source/elections.ipynb

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@
3030
"outputs": [],
3131
"source": [
3232
"# Data path\n",
33-
"BASE_URL = \"https://d2hg8soec8ck9v.cloudfront.net/datasets/us_elections/\""
33+
"BASE_URL = \"https://github.com/pyro-ppl/datasets/blob/master/us_elections/\""
3434
]
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": 4,
38+
"execution_count": null,
3939
"metadata": {},
4040
"outputs": [
4141
{
@@ -57,7 +57,7 @@
5757
"import torch\n",
5858
"from urllib.request import urlopen\n",
5959
"\n",
60-
"electoral_college_votes = pd.read_pickle(urlopen(BASE_URL + \"electoral_college_votes.pickle\"))\n",
60+
"electoral_college_votes = pd.read_pickle(urlopen(BASE_URL + \"electoral_college_votes.pickle?raw=true\"))\n",
6161
"print(electoral_college_votes.head())\n",
6262
"ec_votes_tensor = torch.tensor(electoral_college_votes.values, dtype=torch.float).squeeze()"
6363
]
@@ -106,7 +106,7 @@
106106
},
107107
{
108108
"cell_type": "code",
109-
"execution_count": 7,
109+
"execution_count": null,
110110
"metadata": {},
111111
"outputs": [
112112
{
@@ -125,7 +125,7 @@
125125
}
126126
],
127127
"source": [
128-
"frame = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_historical.pickle\"))\n",
128+
"frame = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_historical.pickle?raw=true\"))\n",
129129
"print(frame[[1976, 1980, 1984]].head())"
130130
]
131131
},
@@ -652,11 +652,11 @@
652652
},
653653
{
654654
"cell_type": "code",
655-
"execution_count": 23,
655+
"execution_count": null,
656656
"metadata": {},
657657
"outputs": [],
658658
"source": [
659-
"test_data = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_test.pickle\"))\n",
659+
"test_data = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_test.pickle?raw=true\"))\n",
660660
"results_2016 = torch.tensor(test_data.values, dtype=torch.float)\n",
661661
"true_alpha = torch.log(results_2016[..., 0] / results_2016[..., 1])"
662662
]

tutorial/source/intro_long.ipynb

Lines changed: 10 additions & 10 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)