Skip to content

Commit 36e1db5

Browse files
committed
docs(blog-post): pydata performance part 2; polars and datafusion
1 parent a3c1c07 commit 36e1db5

File tree

10 files changed

+492
-9
lines changed

10 files changed

+492
-9
lines changed

docs/_freeze/posts/pydata-performance-part2/index/execute-results/html.json

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
from __future__ import annotations

import ibis
from ibis import _

# Select the DataFusion backend before any table is created.
ibis.set_backend("datafusion")

# Row predicates, built as deferred expressions on the `path` column.
# Path ends in one of the listed source-file extensions.
has_source_ext = _.path.re_search(
    r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
# Path has a segment that starts with "test" (at the start or after a "/").
test_like_path = _.path.re_search(r"(^|/)test(|s|ing)")
# Path contains a "/site-packages/" segment.
in_site_packages = _.path.contains("/site-packages/")

# Grouping keys.
upload_month = _.uploaded_on.truncate("M")
# Trailing lowercase/digit extension, rewritten step by step into a label;
# an empty result becomes NULL so that `dropna("ext")` can discard it.
ext_label = (
    _.path.re_extract(r"\.([a-z0-9]+)$", 1)
    .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
    .re_replace("^f.*$", "Fortran")
    .replace("rs", "Rust")
    .replace("go", "Go")
    .replace("asm", "Assembly")
    .nullif("")
)

pypi_files = ibis.read_parquet("/data/pypi-parquet/*.parquet")

# Distinct project count per (month, ext), newest month first and then
# largest count first.
expr = (
    pypi_files.filter([has_source_ext, ~test_like_path, ~in_site_packages])
    .group_by(month=upload_month, ext=ext_label)
    .aggregate(project_count=_.project_name.nunique())
    .dropna("ext")
    .order_by([_.month.desc(), _.project_count.desc()])
)

df = expr.to_pandas()
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
from __future__ import annotations

import datafusion

# Read the SQL text; the path is relative to the current working directory.
with open("./datafusion_native.sql") as sql_file:
    query = sql_file.read()

ctx = datafusion.SessionContext()
# Register the parquet files under the table name "pypi".
ctx.register_parquet(name="pypi", path="/data/pypi-parquet/*.parquet")

expr = ctx.sql(query)
df = expr.to_pandas()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
-- Distinct project count per upload month and extension label,
-- newest month first, then largest count first.
WITH labelled_files AS (
    -- One row per file that passes the path filters, carrying its project,
    -- its upload month and a label derived from the file extension.
    SELECT
        project_name,
        DATE_TRUNC('month', uploaded_on) AS month,
        -- The pattern is wrapped in an extra outer group by CONCAT, so
        -- element [2] is the inner capture: the extension without the dot.
        -- The surrounding calls rewrite it into a label, and NULLIF maps an
        -- empty string to NULL.
        NULLIF(
            REPLACE(
                REPLACE(
                    REPLACE(
                        REGEXP_REPLACE(
                            REGEXP_REPLACE(
                                REGEXP_MATCH(path, CONCAT('(', '\.([a-z0-9]+)$', ')'))[2],
                                'cxx|cpp|cc|c|hpp|h',
                                'C/C++',
                                'g'
                            ),
                            '^f.*$',
                            'Fortran',
                            'g'
                        ),
                        'rs',
                        'Rust'
                    ),
                    'go',
                    'Go'
                ),
                'asm',
                'Assembly'
            ),
            ''
        ) AS ext
    FROM pypi
    WHERE
        -- Keep paths ending in one of the listed extensions. COALESCE turns
        -- a NULL comparison result (no match) into FALSE.
        COALESCE(
            ARRAY_LENGTH(
                REGEXP_MATCH(path, '\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$')
            ) > 0,
            FALSE
        )
        -- Drop paths with a segment that starts with "test".
        AND NOT COALESCE(ARRAY_LENGTH(REGEXP_MATCH(path, '(^|/)test(|s|ing)')) > 0, FALSE)
        -- Drop paths that contain "/site-packages/".
        AND NOT STRPOS(path, '/site-packages/') > 0
)

SELECT
    month,
    ext,
    COUNT(DISTINCT project_name) AS project_count
FROM labelled_files
WHERE ext IS NOT NULL
GROUP BY month, ext
ORDER BY month DESC, project_count DESC
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
from __future__ import annotations

import ibis
from ibis import _

# Row predicates, built as deferred expressions on the `path` column.
# Path ends in one of the listed source-file extensions.
has_source_ext = _.path.re_search(
    r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
# Path has a segment that starts with "test" (at the start or after a "/").
test_like_path = _.path.re_search(r"(^|/)test(|s|ing)")
# Path contains a "/site-packages/" segment.
in_site_packages = _.path.contains("/site-packages/")

# Grouping keys.
upload_month = _.uploaded_on.truncate("M")
# Trailing lowercase/digit extension, rewritten step by step into a label;
# an empty result becomes NULL so that `dropna("ext")` can discard it.
ext_label = (
    _.path.re_extract(r"\.([a-z0-9]+)$", 1)
    .re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
    .re_replace("^f.*$", "Fortran")
    .replace("rs", "Rust")
    .replace("go", "Go")
    .replace("asm", "Assembly")
    .nullif("")
)

pypi_files = ibis.read_parquet("/data/pypi-parquet/*.parquet")

# Distinct project count per (month, ext).
expr = (
    pypi_files.filter([has_source_ext, ~test_like_path, ~in_site_packages])
    .group_by(month=upload_month, ext=ext_label)
    .aggregate(project_count=_.project_name.nunique())
    .dropna("ext")
    .order_by([_.month.desc(), _.project_count.desc()])  # <1>
)

df = expr.to_pandas()

0 commit comments

Comments
 (0)