@@ -27,9 +27,10 @@ always viable -- we're in Python land so why not grab the filenames using
27
27
``` {python}
28
28
import urllib3
29
29
30
- http = urllib3.PoolManager()
30
+ url = "https://raw.githubusercontent.com/pypi-data/data/main/links/dataset.txt"
31
31
32
- resp = http.request("GET", "https://github.com/pypi-data/data/raw/main/links/dataset.txt")
32
+ with urllib3.PoolManager() as http:
33
+ resp = http.request("GET", url)
33
34
34
35
parquet_files = resp.data.decode().split()
35
36
parquet_files
@@ -87,7 +88,7 @@ We can follow Seth's lead and look for things:
87
88
``` {python}
88
89
expr = pypi.filter(
89
90
[
90
- _.path.re_search(r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0- 2}(?:or)?|go)$"),
91
+ _.path.re_search(r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0, 2}(?:or)?|go)$"),
91
92
~_.path.re_search(r"(^|/)test(|s|ing)"),
92
93
~_.path.contains("/site-packages/"),
93
94
]
@@ -144,10 +145,12 @@ We'll do a few things:
144
145
``` {python}
145
146
collapse_names = expr.mutate(
146
147
ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
148
+ .re_replace("^f.*$", "Fortran")
147
149
.replace("rs", "Rust")
148
150
.replace("go", "Go")
149
- .replace("asm", "Assembly"),
150
- )
151
+ .replace("asm", "Assembly")
152
+ .nullif(""),
153
+ ).dropna("ext")
151
154
152
155
collapse_names
153
156
```
@@ -202,7 +205,7 @@ Now that the data are tidied, we can pass our expression directly to Altair and
202
205
import altair as alt
203
206
204
207
chart = (
205
- alt.Chart(collapse_names)
208
+ alt.Chart(collapse_names.to_pandas() )
206
209
.mark_line()
207
210
.encode(x="month", y="project_count", color="ext")
208
211
.properties(width=600, height=300)
@@ -235,7 +238,7 @@ full_query = (
235
238
pypi.filter(
236
239
[
237
240
_.path.re_search(
238
- r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0- 2}(?:or)?|go)$"
241
+ r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0, 2}(?:or)?|go)$"
239
242
),
240
243
~_.path.re_search(r"(^|/)test(|s|ing)"),
241
244
~_.path.contains("/site-packages/"),
@@ -249,15 +252,18 @@ full_query = (
249
252
.order_by(_.month.desc())
250
253
.mutate(
251
254
ext=_.ext.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
255
+ .re_replace("^f.*$", "Fortran")
252
256
.replace("rs", "Rust")
253
257
.replace("go", "Go")
254
- .replace("asm", "Assembly"),
258
+ .replace("asm", "Assembly")
259
+ .nullif(""),
255
260
)
261
+ .dropna("ext")
256
262
.group_by(["month", "ext"])
257
263
.aggregate(project_count=flatten(_.projects.collect()).unique().length())
258
264
)
259
265
chart = (
260
- alt.Chart(full_query)
266
+ alt.Chart(full_query.to_pandas() )
261
267
.mark_line()
262
268
.encode(x="month", y="project_count", color="ext")
263
269
.properties(width=600, height=300)
0 commit comments