Skip to content

Commit a7fd32b

Browse files
committed
docs(blog-post): add blog post comparing ibis to pandas and dask
1 parent ed47c74 commit a7fd32b

File tree

11 files changed

+1095
-14
lines changed

11 files changed

+1095
-14
lines changed

docs/_freeze/posts/pydata-performance/index/execute-results/html.json

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
5+
import dask.dataframe as dd
6+
from dask.distributed import Client
7+
8+
if __name__ == "__main__":
    # Count, per month and per source-language extension, how many distinct
    # PyPI projects uploaded files with that extension, using Dask.
    #
    # Only ERROR-level (and above) messages from the Dask client are logged.
    client = Client(silence_logs=logging.ERROR)
    try:
        # Read just the three columns the query needs.
        df = dd.read_parquet(
            "/data/pypi-parquet/*.parquet",
            columns=["path", "uploaded_on", "project_name"],
            split_row_groups=True,
        )

        # Keep paths that end in one of the listed source-file extensions ...
        has_source_ext = df.path.str.contains(
            r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
        )
        # ... and drop paths with a component starting with "test", as well as
        # anything located under a "/site-packages/" directory.
        is_test_path = df.path.str.contains(r"(?:^|/)test(?:|s|ing)")
        is_vendored = df.path.str.contains("/site-packages/")
        df = df[has_source_ext & ~is_test_path & ~is_vendored]

        # Truncate the upload timestamp to the start of its month.
        month = df.uploaded_on.dt.to_period("M").dt.to_timestamp()

        # Extract the trailing extension and map it to a language label.
        # The replacements run in order, so the C/C++ alternation is applied
        # before the Fortran catch-all ("^f.*$").
        # NOTE(review): the extraction pattern is lowercase-only, while the
        # filter above also accepts an uppercase "F" extension; such paths
        # will not match here -- confirm this is intended.
        ext = (
            df.path.str.extract(r"\.([a-z0-9]+)$", 0, expand=False)
            .str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
            .str.replace("^f.*$", "Fortran", regex=True)
            .str.replace("rs", "Rust")
            .str.replace("go", "Go")
            .str.replace("asm", "Assembly")
        )

        # Distinct projects per (month, ext); materialize the result, then
        # sort it with both keys descending.
        result = (
            df.assign(month=month, ext=ext)
            .groupby(["month", "ext"])
            .project_name.nunique()
            .rename("project_count")
            .compute()
            .reset_index()
            .sort_values(["month", "project_count"], ascending=False)
        )
        print(result)
    finally:
        # Shut the client down even when reading or computing fails, so a
        # failed run does not leave it running.
        client.shutdown()

0 commit comments

Comments
 (0)