Skip to content

Commit e097290

Browse files
authored
Use datasets from GitHub instead of CloudFront. (#3427)
* Use datasets from GitHub instead of CloudFront. * Fix download of MNIST datasets from GitHub.
1 parent 6b5173c commit e097290

File tree

10 files changed

+68
-43
lines changed

10 files changed

+68
-43
lines changed

examples/baseball.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
"""
5353

5454
logging.basicConfig(format="%(message)s", level=logging.INFO)
55-
DATA_URL = "https://d2hg8soec8ck9v.cloudfront.net/datasets/EfronMorrisBB.txt"
55+
DATA_URL = "https://github.com/pyro-ppl/datasets/blob/master/EfronMorrisBB.txt?raw=true"
5656

5757

5858
# ===================================

examples/mixed_hmm/seal_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
def download_seal_data(filename):
1414
"""download the preprocessed seal data and save it to filename"""
15-
url = "https://d2hg8soec8ck9v.cloudfront.net/datasets/prep_seal_data.csv"
15+
url = "https://github.com/pyro-ppl/datasets/blob/master/prep_seal_data.csv?raw=true"
1616
with open(filename, "wb") as f:
1717
f.write(urlopen(url).read())
1818

examples/sparse_gamma_def.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ def main(args):
216216
raise
217217
pass
218218
wget.download(
219-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/faces_training.csv",
219+
"https://github.com/pyro-ppl/datasets/blob/master/faces_training.csv?raw=true",
220220
dataset_path,
221221
)
222222
data = torch.tensor(np.loadtxt(dataset_path, delimiter=",")).float()

pyro/contrib/examples/finance.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
DATA = get_data_directory(__file__)
1212

1313
# https://finance.yahoo.com/quote/%5EGSPC/history/
14-
CACHE_URL = "https://d2hg8soec8ck9v.cloudfront.net/datasets/snp500.csv.bz2"
14+
CACHE_URL = "https://github.com/pyro-ppl/datasets/blob/master/snp500.csv.bz2?raw=true"
1515

1616

1717
def load_snp500():

pyro/contrib/examples/polyphonic_data_loader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,25 +31,25 @@
3131

3232
JSB_CHORALES = dset(
3333
"jsb_chorales",
34-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/jsb_chorales.pickle",
34+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/jsb_chorales.pickle?raw=true",
3535
"jsb_chorales.pkl",
3636
)
3737

3838
PIANO_MIDI = dset(
3939
"piano_midi",
40-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/piano_midi.pickle",
40+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/piano_midi.pickle?raw=true",
4141
"piano_midi.pkl",
4242
)
4343

4444
MUSE_DATA = dset(
4545
"muse_data",
46-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/muse_data.pickle",
46+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/muse_data.pickle?raw=true",
4747
"muse_data.pkl",
4848
)
4949

5050
NOTTINGHAM = dset(
5151
"nottingham",
52-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/polyphonic/nottingham.pickle",
52+
"https://github.com/pyro-ppl/datasets/blob/master/polyphonic/nottingham.pickle?raw=true",
5353
"nottingham.pkl",
5454
)
5555

pyro/contrib/examples/util.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,34 @@
1010

1111

1212
class MNIST(datasets.MNIST):
13-
mirrors = [
14-
"https://d2hg8soec8ck9v.cloudfront.net/datasets/mnist/"
15-
] + datasets.MNIST.mirrors
13+
mirrors = ["https://github.com/pyro-ppl/datasets/blob/master/mnist/"]
14+
15+
def download(self) -> None:
16+
"""Download the MNIST data if it doesn't exist already."""
17+
18+
if self._check_exists():
19+
return
20+
21+
os.makedirs(self.raw_folder, exist_ok=True)
22+
23+
# download files
24+
for filename, md5 in self.resources:
25+
errors = []
26+
for mirror in self.mirrors:
27+
url = f"{mirror}{filename}?raw=true"
28+
try:
29+
datasets.utils.download_and_extract_archive(
30+
url, download_root=self.raw_folder, filename=filename, md5=md5
31+
)
32+
except datasets.URLError as e:
33+
errors.append(e)
34+
continue
35+
break
36+
else:
37+
s = f"Error downloading {filename}:\n"
38+
for mirror, err in zip(self.mirrors, errors):
39+
s += f"Tried {mirror}, got:\n{str(err)}\n"
40+
raise RuntimeError(s)
1641

1742

1843
def get_data_loader(

tutorial/source/bayesian_regression.ipynb

Lines changed: 8 additions & 8 deletions
Large diffs are not rendered by default.

tutorial/source/bayesian_regression_ii.ipynb

Lines changed: 7 additions & 7 deletions
Large diffs are not rendered by default.

tutorial/source/elections.ipynb

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@
3030
"outputs": [],
3131
"source": [
3232
"# Data path\n",
33-
"BASE_URL = \"https://d2hg8soec8ck9v.cloudfront.net/datasets/us_elections/\""
33+
"BASE_URL = \"https://github.com/pyro-ppl/datasets/blob/master/us_elections/\""
3434
]
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": 4,
38+
"execution_count": null,
3939
"metadata": {},
4040
"outputs": [
4141
{
@@ -57,7 +57,7 @@
5757
"import torch\n",
5858
"from urllib.request import urlopen\n",
5959
"\n",
60-
"electoral_college_votes = pd.read_pickle(urlopen(BASE_URL + \"electoral_college_votes.pickle\"))\n",
60+
"electoral_college_votes = pd.read_pickle(urlopen(BASE_URL + \"electoral_college_votes.pickle?raw=true\"))\n",
6161
"print(electoral_college_votes.head())\n",
6262
"ec_votes_tensor = torch.tensor(electoral_college_votes.values, dtype=torch.float).squeeze()"
6363
]
@@ -106,7 +106,7 @@
106106
},
107107
{
108108
"cell_type": "code",
109-
"execution_count": 7,
109+
"execution_count": null,
110110
"metadata": {},
111111
"outputs": [
112112
{
@@ -125,7 +125,7 @@
125125
}
126126
],
127127
"source": [
128-
"frame = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_historical.pickle\"))\n",
128+
"frame = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_historical.pickle?raw=true\"))\n",
129129
"print(frame[[1976, 1980, 1984]].head())"
130130
]
131131
},
@@ -652,11 +652,11 @@
652652
},
653653
{
654654
"cell_type": "code",
655-
"execution_count": 23,
655+
"execution_count": null,
656656
"metadata": {},
657657
"outputs": [],
658658
"source": [
659-
"test_data = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_test.pickle\"))\n",
659+
"test_data = pd.read_pickle(urlopen(BASE_URL + \"us_presidential_election_data_test.pickle?raw=true\"))\n",
660660
"results_2016 = torch.tensor(test_data.values, dtype=torch.float)\n",
661661
"true_alpha = torch.log(results_2016[..., 0] / results_2016[..., 1])"
662662
]

tutorial/source/intro_long.ipynb

Lines changed: 10 additions & 10 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)