Skip to content

Commit 4f72723

Browse files
authored
fix: retry is_job_done on ConnectionError (#1930)
1 parent c3f7b23 commit 4f72723

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

google/cloud/bigquery/retry.py

+8
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,14 @@ def _job_should_retry(exc):
119119
if isinstance(exc, exceptions.RetryError):
120120
exc = exc.cause
121121

122+
# Per https://github.com/googleapis/python-bigquery/issues/1929, sometimes
123+
# retriable errors make their way here. Because of the separate
124+
# `restart_query_job` logic to make sure we aren't restarting non-failed
125+
# jobs, it should be safe to continue and not totally fail our attempt at
126+
# waiting for the query to complete.
127+
if _should_retry(exc):
128+
return True
129+
122130
if not hasattr(exc, "errors") or len(exc.errors) == 0:
123131
return False
124132

tests/unit/test_job_retry.py

+117
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import google.api_core.exceptions
2222
import google.api_core.retry
2323
import freezegun
24+
import requests.exceptions
2425

2526
from google.cloud.bigquery.client import Client
2627
from google.cloud.bigquery import _job_helpers
@@ -126,6 +127,122 @@ def api_request(method, path, query_params=None, data=None, **kw):
126127
assert job.job_id == orig_job_id
127128

128129

130+
def test_retry_connection_error_with_default_retries_and_successful_first_job(
131+
monkeypatch, client
132+
):
133+
"""
134+
Make sure ConnectionError can be retried at `is_job_done` level, even if
135+
retries are exhausted by API-level retry.
136+
137+
Note: Because restart_query_job is set to True only in the case of a
138+
confirmed job failure, this should be safe to do even when a job is not
139+
idempotent.
140+
141+
Regression test for issue
142+
https://github.com/googleapis/python-bigquery/issues/1929
143+
"""
144+
job_counter = 0
145+
146+
def make_job_id(*args, **kwargs):
147+
nonlocal job_counter
148+
job_counter += 1
149+
return f"{job_counter}"
150+
151+
monkeypatch.setattr(_job_helpers, "make_job_id", make_job_id)
152+
conn = client._connection = make_connection()
153+
project = client.project
154+
job_reference_1 = {"projectId": project, "jobId": "1", "location": "test-loc"}
155+
NUM_API_RETRIES = 2
156+
157+
with freezegun.freeze_time(
158+
"2024-01-01 00:00:00",
159+
# Note: because of exponential backoff and a bit of jitter,
160+
# NUM_API_RETRIES will get less accurate the greater the value.
161+
# We add 1 because we know there will be at least some additional
162+
# calls to fetch the time / sleep before the retry deadline is hit.
163+
auto_tick_seconds=(
164+
google.cloud.bigquery.retry._DEFAULT_RETRY_DEADLINE / NUM_API_RETRIES
165+
)
166+
+ 1,
167+
):
168+
conn.api_request.side_effect = [
169+
# jobs.insert
170+
{"jobReference": job_reference_1, "status": {"state": "PENDING"}},
171+
# jobs.get
172+
{"jobReference": job_reference_1, "status": {"state": "RUNNING"}},
173+
# jobs.getQueryResults x2
174+
requests.exceptions.ConnectionError(),
175+
requests.exceptions.ConnectionError(),
176+
# jobs.get
177+
# Job actually succeeded, so we shouldn't be restarting the job,
178+
# even though we are retrying at the `is_job_done` level.
179+
{"jobReference": job_reference_1, "status": {"state": "DONE"}},
180+
# jobs.getQueryResults
181+
{"jobReference": job_reference_1, "jobComplete": True},
182+
]
183+
184+
job = client.query("select 1")
185+
rows_iter = job.result()
186+
187+
assert job.done() # Shouldn't make any additional API calls.
188+
assert rows_iter is not None
189+
190+
# Should only have created one job, even though we did call job_retry.
191+
assert job_counter == 1
192+
193+
# Double-check that we made the API calls we expected to make.
194+
conn.api_request.assert_has_calls(
195+
[
196+
# jobs.insert
197+
mock.call(
198+
method="POST",
199+
path="/projects/PROJECT/jobs",
200+
data={
201+
"jobReference": {"jobId": "1", "projectId": "PROJECT"},
202+
"configuration": {
203+
"query": {"useLegacySql": False, "query": "select 1"}
204+
},
205+
},
206+
timeout=None,
207+
),
208+
# jobs.get
209+
mock.call(
210+
method="GET",
211+
path="/projects/PROJECT/jobs/1",
212+
query_params={"location": "test-loc"},
213+
timeout=None,
214+
),
215+
# jobs.getQueryResults x2
216+
mock.call(
217+
method="GET",
218+
path="/projects/PROJECT/queries/1",
219+
query_params={"maxResults": 0, "location": "test-loc"},
220+
timeout=None,
221+
),
222+
mock.call(
223+
method="GET",
224+
path="/projects/PROJECT/queries/1",
225+
query_params={"maxResults": 0, "location": "test-loc"},
226+
timeout=None,
227+
),
228+
# jobs.get -- is_job_done checking again
229+
mock.call(
230+
method="GET",
231+
path="/projects/PROJECT/jobs/1",
232+
query_params={"location": "test-loc"},
233+
timeout=None,
234+
),
235+
# jobs.getQueryResults
236+
mock.call(
237+
method="GET",
238+
path="/projects/PROJECT/queries/1",
239+
query_params={"maxResults": 0, "location": "test-loc"},
240+
timeout=120,
241+
),
242+
],
243+
)
244+
245+
129246
def test_query_retry_with_default_retry_and_ambiguous_errors_only_retries_with_failed_job(
130247
client, monkeypatch
131248
):

0 commit comments

Comments
 (0)