Skip to content

Commit eca076e

Browse files
authored
Retry pr_time_benchmarks when it fails (#6005)
A request from @laithsakka: we want to retry this job on a different runner, as it can sometimes fail flakily.
1 parent 17204ff commit eca076e

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

torchci/lib/bot/retryBot.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ import { CachedConfigTracker } from "./utils";
55
const SUCCESS_CONCLUSIONS = ["success"];
66
const FAILURE_CONCLUSIONS = ["failure", "cancelled", "timed_out"];
77

8+
// If these jobs fail, they will always be retried
9+
const ALWAYS_RETRY_JOBS = [
10+
// From @laithsakka, we want to retry this job in a different runner as it could
11+
// fail flakily sometimes
12+
"pr_time_benchmarks",
13+
];
14+
815
async function getFlakyJobsFromPreviousWorkflow(
916
owner: string,
1017
repo: string,
@@ -120,6 +127,14 @@ async function retryCurrentWorkflow(
120127
return false;
121128
}
122129

130+
for (const flakyJobName of ALWAYS_RETRY_JOBS) {
131+
// if the job is a known flaky one, we want to retry it whenever it fails,
132+
// even if the failed step is a test step
133+
if (job.name.toLocaleLowerCase().includes(flakyJobName)) {
134+
return true;
135+
}
136+
}
137+
123138
// if no test steps failed, can rerun
124139
return !doesLookLikeUserFailure(job, (step) =>
125140
step.name.toLowerCase().includes("test")

torchci/test/retryBot.test.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,46 @@ describe("retry-bot", () => {
172172
handleScope(scope);
173173
});
174174

175+
test("rerun known flaky jobs", async () => {
176+
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
177+
event.payload.workflow_run.name = "pull";
178+
const workflow_jobs = requireDeepCopy("./fixtures/workflow_jobs.json");
179+
workflow_jobs.jobs[4].name = `${workflow_jobs.jobs[0].name} (pr_time_benchmarks)`;
180+
workflow_jobs.jobs[4].conclusion = "failure";
181+
workflow_jobs.jobs[4].steps[0].conclusion = "failure";
182+
183+
const owner = event.payload.repository.owner.login;
184+
const repo = event.payload.repository.name;
185+
const attempt_number = event.payload.workflow_run.run_attempt;
186+
const run_id = event.payload.workflow_run.id;
187+
188+
const scope = nock("https://api.github.com")
189+
.get(
190+
`/repos/${owner}/${repo}/actions/runs/${run_id}/attempts/${attempt_number}/jobs?page=1&per_page=100`
191+
)
192+
.reply(200, workflow_jobs)
193+
.get(
194+
`/repos/${owner}/${repo}/contents/${encodeURIComponent(
195+
".github/pytorch-probot.yml"
196+
)}`
197+
)
198+
.reply(
199+
200,
200+
'{retryable_workflows: ["pull", "trunk", "linux-binary", "windows-binary"]}'
201+
)
202+
.post(
203+
`/repos/${owner}/${repo}/actions/jobs/${workflow_jobs.jobs[4].id}/rerun`
204+
)
205+
.reply(200);
206+
207+
const mock = jest.spyOn(clickhouse, "queryClickhouseSaved");
208+
mock.mockImplementation(() => Promise.resolve([]));
209+
210+
await probot.receive(event);
211+
212+
handleScope(scope);
213+
});
214+
175215
test("rerun previous workflow if it has more than one flaky jobs in trunk", async () => {
176216
const event = requireDeepCopy("./fixtures/workflow_run.completed.json");
177217
event.payload.workflow_run.name = "pull";

0 commit comments

Comments
 (0)