@@ -93,7 +93,7 @@ Alright, let's jump into some data!
93
93
94
94
95
95
``` {python}
96
- jobs = con.tables.jobs[_.started_at < "2023-01-09"]
96
+ jobs = con.tables.jobs
97
97
jobs
98
98
```
99
99
@@ -211,7 +211,7 @@ stats = stats.mutate(
211
211
.else_("NA")
212
212
.end()
213
213
),
214
- team_plan=ibis.where (_.raw_improvements > 1, "Poetry + Team Plan", "None"),
214
+ team_plan=ibis.ifelse (_.raw_improvements > 1, "Poetry + Team Plan", "None"),
215
215
)
216
216
stats
217
217
```
@@ -221,7 +221,7 @@ Finally, we can summarize by averaging the different durations, grouping on the
221
221
``` {python}
222
222
USECS_PER_MIN = 60_000_000
223
223
224
- agged = stats.group_by([ _.started_date, _.improvements, _.team_plan] ).agg(
224
+ agged = stats.group_by(_.started_date, _.improvements, _.team_plan).agg(
225
225
job=_.job_duration.div(USECS_PER_MIN).mean(),
226
226
workflow=_.workflow_duration.div(USECS_PER_MIN).mean(),
227
227
queueing_time=_.queueing_time.div(USECS_PER_MIN).mean(),
@@ -242,23 +242,22 @@ Ibis doesn't have builtin plotting support, so we need to pull our results into
242
242
243
243
Here I'm using `plotnine` (a Python port of `ggplot2`), which has great integration with pandas DataFrames.
244
244
245
- ``` {python}
246
- raw_df = agged.execute()
247
- raw_df
248
- ```
249
-
250
- Generally, ` plotnine ` works with long, tidy data so let's use ` pandas.melt ` to get there.
245
+ Generally, `plotnine` works with long, tidy data so let's use Ibis's
246
+ [`pivot_longer`](../../reference/expression-tables.qmd#ibis.expr.types.relations.Table.pivot_longer)
247
+ to get there.
251
248
252
249
253
250
``` {python}
254
- import pandas as pd
255
-
256
- df = pd.melt(
257
- raw_df ,
258
- id_vars=["started_date", "improvements", "team_plan"] ,
259
- var_name="entity",
260
- value_name="duration",
251
+ agged_pivoted = (
252
+ agged.pivot_longer(
253
+ ("job", "workflow", "queueing_time"),
254
+ names_to="entity" ,
255
+ values_to="duration" ,
256
+ )
257
+ .mutate(started_date=_.started_date.cast("timestamp").truncate("D"))
261
258
)
259
+
260
+ df = agged_pivoted.execute()
262
261
df.head()
263
262
```
264
263
@@ -286,12 +285,16 @@ import logging
286
285
287
286
# without this, findfont logging spams the notebook making it unusable
288
287
logging.getLogger('matplotlib.font_manager').disabled = True
288
+ logging.getLogger('plotnine').disabled = True
289
289
```
290
290
291
291
Here we show job durations, coloring the points differently depending on whether they have no improvements, poetry, or poetry + team plan.
292
292
293
293
``` {python}
294
- (
294
+ import pandas as pd
295
+
296
+
297
+ g = (
295
298
ggplot(
296
299
df.loc[df.entity == "job"].reset_index(drop=True),
297
300
aes(x="started_date", y="duration", color="factor(improvements)"),
@@ -307,8 +310,8 @@ Here we show job durations, coloring the points differently depending on whether
307
310
type='qual',
308
311
limits=["None", "Poetry", "Poetry + Team Plan"],
309
312
)
310
- + geom_text(x=POETRY_MERGED_DATE, label=poetry_label, y=15 , color="blue")
311
- + geom_text(x=TEAMIZATION_DATE, label=team_label, y=10 , color="blue")
313
+ + geom_text(aes("x", "y"), label=poetry_label, data=pd.DataFrame({"x": [POETRY_MERGED_DATE], "y": [15]}) , color="blue")
314
+ + geom_text(aes("x", "y"), label=team_label, data=pd.DataFrame({"x": [TEAMIZATION_DATE], "y": [10]}) , color="blue")
312
315
+ stat_smooth(method="lm")
313
316
+ labs(x="Date", y="Duration (minutes)")
314
317
+ ggtitle("Job Duration")
@@ -318,6 +321,7 @@ Here we show job durations, coloring the points differently depending on whether
318
321
legend_direction="vertical",
319
322
)
320
323
)
324
+ g.show()
321
325
```
322
326
323
327
## Result #1: Job Duration
@@ -331,7 +335,7 @@ A few things pop out to me right away:
331
335
- Moving to the team plan had little to no effect on job run duration.
332
336
333
337
``` {python}
334
- (
338
+ g = (
335
339
ggplot(
336
340
df.loc[df.entity != "job"].reset_index(drop=True),
337
341
aes(x="started_date", y="duration", color="factor(improvements)"),
@@ -347,8 +351,8 @@ A few things pop out to me right away:
347
351
type='qual',
348
352
limits=["None", "Poetry", "Poetry + Team Plan"],
349
353
)
350
- + geom_text(x=POETRY_MERGED_DATE, label=poetry_label, y=75 , color="blue")
351
- + geom_text(x=TEAMIZATION_DATE, label=team_label, y=50 , color="blue")
354
+ + geom_text(aes("x", "y"), label=poetry_label, data=pd.DataFrame({"x": [POETRY_MERGED_DATE], "y": [75]}) , color="blue")
355
+ + geom_text(aes("x", "y"), label=team_label, data=pd.DataFrame({"x": [TEAMIZATION_DATE], "y": [50]}) , color="blue")
352
356
+ stat_smooth(method="lm")
353
357
+ labs(x="Date", y="Duration (minutes)")
354
358
+ ggtitle("Workflow Duration")
@@ -358,6 +362,7 @@ A few things pop out to me right away:
358
362
legend_direction="vertical",
359
363
)
360
364
)
365
+ g.show()
361
366
```
362
367
363
368
## Result #2: Workflow Duration and Queueing Time
@@ -377,15 +382,16 @@ Another interesting result.
377
382
In the next plot we'll look at that correlation.
378
383
379
384
``` {python}
380
- (
381
- ggplot(raw_df , aes(x="workflow", y="queueing_time"))
385
+ g = (
386
+ ggplot(agged.execute() , aes(x="workflow", y="queueing_time"))
382
387
+ geom_point()
383
388
+ geom_rug()
384
389
+ facet_grid(". ~ team_plan")
385
390
+ labs(x="Workflow Duration (minutes)", y="Queueing Time (minutes)")
386
391
+ ggtitle("Workflow Duration vs. Queueing Time")
387
392
+ theme(figure_size=(22, 6))
388
393
)
394
+ g.show()
389
395
```
390
396
391
397
## Result #3: Workflow Duration and Queueing Duration are correlated
0 commit comments