Skip to content

Commit 6249032

Browse files
authored
fix: change load_table_from_json autodetect logic (#1804)
1 parent 1298594 commit 6249032

File tree

3 files changed

+255
-5
lines changed

3 files changed

+255
-5
lines changed

google/cloud/bigquery/client.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -2833,8 +2833,22 @@ def load_table_from_json(
28332833

28342834
new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON
28352835

2836-
if new_job_config.schema is None:
2837-
new_job_config.autodetect = True
2836+
# In specific conditions, we check if the table already exists, and/or
2837+
# set the autodetect value for the user. For exact conditions, see the table at
2838+
# https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
2839+
if new_job_config.schema is None and new_job_config.autodetect is None:
2840+
if new_job_config.write_disposition in (
2841+
job.WriteDisposition.WRITE_TRUNCATE,
2842+
job.WriteDisposition.WRITE_EMPTY,
2843+
):
2844+
new_job_config.autodetect = True
2845+
else:
2846+
try:
2847+
self.get_table(destination)
2848+
except core_exceptions.NotFound:
2849+
new_job_config.autodetect = True
2850+
else:
2851+
new_job_config.autodetect = False
28382852

28392853
if project is None:
28402854
project = self.project

tests/system/test_client.py

+39
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,45 @@ def test_load_table_from_json_schema_autodetect(self):
994994
self.assertEqual(tuple(table.schema), table_schema)
995995
self.assertEqual(table.num_rows, 2)
996996

997+
# Autodetect makes best effort to infer the schema, but situations exist
998+
# when the detected schema is wrong and does not match the existing schema.
999+
# Thus the client sets autodetect = False when table exists and just uses
1000+
# the existing schema. This test case uses a special case where the backend has
1001+
# no way to distinguish int from string.
1002+
def test_load_table_from_json_schema_autodetect_table_exists(self):
1003+
json_rows = [
1004+
{"name": "123", "age": 18, "birthday": "2001-10-15", "is_awesome": False},
1005+
{"name": "456", "age": 79, "birthday": "1940-03-10", "is_awesome": True},
1006+
]
1007+
1008+
dataset_id = _make_dataset_id("bq_system_test")
1009+
self.temp_dataset(dataset_id)
1010+
table_id = "{}.{}.load_table_from_json_basic_use".format(
1011+
Config.CLIENT.project, dataset_id
1012+
)
1013+
1014+
# Use schema with NULLABLE fields, because schema autodetection
1015+
# defaults to field mode NULLABLE.
1016+
table_schema = (
1017+
bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
1018+
bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
1019+
bigquery.SchemaField("birthday", "DATE", mode="NULLABLE"),
1020+
bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"),
1021+
)
1022+
# create the table before loading so that it already exists (and the column order is predictable)
1023+
table = helpers.retry_403(Config.CLIENT.create_table)(
1024+
Table(table_id, schema=table_schema)
1025+
)
1026+
self.to_delete.insert(0, table)
1027+
1028+
# do not pass an explicit job config; because the table already exists, the client sets autodetect = False and uses the existing schema
1029+
load_job = Config.CLIENT.load_table_from_json(json_rows, table_id)
1030+
load_job.result()
1031+
1032+
table = Config.CLIENT.get_table(table)
1033+
self.assertEqual(tuple(table.schema), table_schema)
1034+
self.assertEqual(table.num_rows, 2)
1035+
9971036
def test_load_avro_from_uri_then_dump_table(self):
9981037
from google.cloud.bigquery.job import CreateDisposition
9991038
from google.cloud.bigquery.job import SourceFormat

tests/unit/test_client.py

+200-3
Original file line numberDiff line numberDiff line change
@@ -8951,6 +8951,8 @@ def test_load_table_from_dataframe_w_higher_scale_decimal128_datatype(self):
89518951
SchemaField("x", "BIGNUMERIC", "NULLABLE", None),
89528952
)
89538953

8954+
# With autodetect specified, we pass the value as is. For more info, see
8955+
# https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
89548956
def test_load_table_from_json_basic_use(self):
89558957
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
89568958
from google.cloud.bigquery import job
@@ -8962,12 +8964,28 @@ def test_load_table_from_json_basic_use(self):
89628964
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
89638965
]
89648966

8967+
job_config = job.LoadJobConfig(autodetect=True)
8968+
89658969
load_patch = mock.patch(
89668970
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
89678971
)
89688972

8969-
with load_patch as load_table_from_file:
8970-
client.load_table_from_json(json_rows, self.TABLE_REF)
8973+
# mock: remote table already exists
8974+
get_table_reference = {
8975+
"projectId": "project_id",
8976+
"datasetId": "test_dataset",
8977+
"tableId": "test_table",
8978+
}
8979+
get_table_patch = mock.patch(
8980+
"google.cloud.bigquery.client.Client.get_table",
8981+
autospec=True,
8982+
return_value=mock.Mock(table_reference=get_table_reference),
8983+
)
8984+
8985+
with load_patch as load_table_from_file, get_table_patch:
8986+
client.load_table_from_json(
8987+
json_rows, self.TABLE_REF, job_config=job_config
8988+
)
89718989

89728990
load_table_from_file.assert_called_once_with(
89738991
client,
@@ -9066,6 +9084,174 @@ def test_load_table_from_json_w_invalid_job_config(self):
90669084
err_msg = str(exc.value)
90679085
assert "Expected an instance of LoadJobConfig" in err_msg
90689086

9087+
# When all of the following are true:
9088+
# (1) no schema provided;
9089+
# (2) no autodetect value provided;
9090+
# (3) writeDisposition == WRITE_APPEND or None;
9091+
# (4) table already exists,
9092+
# client sets autodetect == False
9093+
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
9094+
def test_load_table_from_json_wo_schema_wo_autodetect_write_append_w_table(self):
9095+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
9096+
from google.cloud.bigquery import job
9097+
from google.cloud.bigquery.job import WriteDisposition
9098+
9099+
client = self._make_client()
9100+
9101+
json_rows = [
9102+
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
9103+
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
9104+
]
9105+
9106+
job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)
9107+
9108+
load_patch = mock.patch(
9109+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
9110+
)
9111+
9112+
# mock: remote table already exists
9113+
get_table_reference = {
9114+
"projectId": "project_id",
9115+
"datasetId": "test_dataset",
9116+
"tableId": "test_table",
9117+
}
9118+
get_table_patch = mock.patch(
9119+
"google.cloud.bigquery.client.Client.get_table",
9120+
autospec=True,
9121+
return_value=mock.Mock(table_reference=get_table_reference),
9122+
)
9123+
9124+
with load_patch as load_table_from_file, get_table_patch:
9125+
client.load_table_from_json(
9126+
json_rows, self.TABLE_REF, job_config=job_config
9127+
)
9128+
9129+
load_table_from_file.assert_called_once_with(
9130+
client,
9131+
mock.ANY,
9132+
self.TABLE_REF,
9133+
size=mock.ANY,
9134+
num_retries=_DEFAULT_NUM_RETRIES,
9135+
job_id=mock.ANY,
9136+
job_id_prefix=None,
9137+
location=client.location,
9138+
project=client.project,
9139+
job_config=mock.ANY,
9140+
timeout=DEFAULT_TIMEOUT,
9141+
)
9142+
9143+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
9144+
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
9145+
assert sent_config.schema is None
9146+
assert not sent_config.autodetect
9147+
9148+
# When all of the following are true:
9149+
# (1) no schema provided;
9150+
# (2) no autodetect value provided;
9151+
# (3) writeDisposition == WRITE_APPEND or None;
9152+
# (4) table does NOT exist,
9153+
# client sets autodetect == True
9154+
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
9155+
def test_load_table_from_json_wo_schema_wo_autodetect_write_append_wo_table(self):
9156+
import google.api_core.exceptions as core_exceptions
9157+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
9158+
from google.cloud.bigquery import job
9159+
from google.cloud.bigquery.job import WriteDisposition
9160+
9161+
client = self._make_client()
9162+
9163+
json_rows = [
9164+
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
9165+
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
9166+
]
9167+
9168+
job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)
9169+
9170+
load_patch = mock.patch(
9171+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
9172+
)
9173+
9174+
# mock: remote table doesn't exist
9175+
get_table_patch = mock.patch(
9176+
"google.cloud.bigquery.client.Client.get_table",
9177+
autospec=True,
9178+
side_effect=core_exceptions.NotFound(""),
9179+
)
9180+
9181+
with load_patch as load_table_from_file, get_table_patch:
9182+
client.load_table_from_json(
9183+
json_rows, self.TABLE_REF, job_config=job_config
9184+
)
9185+
9186+
load_table_from_file.assert_called_once_with(
9187+
client,
9188+
mock.ANY,
9189+
self.TABLE_REF,
9190+
size=mock.ANY,
9191+
num_retries=_DEFAULT_NUM_RETRIES,
9192+
job_id=mock.ANY,
9193+
job_id_prefix=None,
9194+
location=client.location,
9195+
project=client.project,
9196+
job_config=mock.ANY,
9197+
timeout=DEFAULT_TIMEOUT,
9198+
)
9199+
9200+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
9201+
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
9202+
assert sent_config.schema is None
9203+
assert sent_config.autodetect
9204+
9205+
# When all of the following are true:
9206+
# (1) no schema provided;
9207+
# (2) no autodetect value provided;
9208+
# (3) writeDisposition == WRITE_TRUNCATE or WRITE_EMPTY;
9209+
# client sets autodetect == True
9210+
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
9211+
def test_load_table_from_json_wo_schema_wo_autodetect_others(self):
9212+
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
9213+
from google.cloud.bigquery import job
9214+
from google.cloud.bigquery.job import WriteDisposition
9215+
9216+
client = self._make_client()
9217+
9218+
json_rows = [
9219+
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
9220+
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
9221+
]
9222+
9223+
job_config = job.LoadJobConfig(
9224+
write_disposition=WriteDisposition.WRITE_TRUNCATE
9225+
)
9226+
9227+
load_patch = mock.patch(
9228+
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
9229+
)
9230+
9231+
with load_patch as load_table_from_file:
9232+
client.load_table_from_json(
9233+
json_rows, self.TABLE_REF, job_config=job_config
9234+
)
9235+
9236+
load_table_from_file.assert_called_once_with(
9237+
client,
9238+
mock.ANY,
9239+
self.TABLE_REF,
9240+
size=mock.ANY,
9241+
num_retries=_DEFAULT_NUM_RETRIES,
9242+
job_id=mock.ANY,
9243+
job_id_prefix=None,
9244+
location=client.location,
9245+
project=client.project,
9246+
job_config=mock.ANY,
9247+
timeout=DEFAULT_TIMEOUT,
9248+
)
9249+
9250+
sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
9251+
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
9252+
assert sent_config.schema is None
9253+
assert sent_config.autodetect
9254+
90699255
def test_load_table_from_json_w_explicit_job_config_override(self):
90709256
from google.cloud.bigquery import job
90719257
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
@@ -9190,8 +9376,19 @@ def test_load_table_from_json_unicode_emoji_data_case(self):
91909376
load_patch = mock.patch(
91919377
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
91929378
)
9379+
# mock: remote table already exists
9380+
get_table_reference = {
9381+
"projectId": "project_id",
9382+
"datasetId": "test_dataset",
9383+
"tableId": "test_table",
9384+
}
9385+
get_table_patch = mock.patch(
9386+
"google.cloud.bigquery.client.Client.get_table",
9387+
autospec=True,
9388+
return_value=mock.Mock(table_reference=get_table_reference),
9389+
)
91939390

9194-
with load_patch as load_table_from_file:
9391+
with load_patch as load_table_from_file, get_table_patch:
91959392
client.load_table_from_json(json_rows, self.TABLE_REF)
91969393

91979394
load_table_from_file.assert_called_once_with(

0 commit comments

Comments
 (0)