|
97 | 97 | ),
|
98 | 98 | ]
|
99 | 99 |
|
# GCS fixtures for the referenceFileSchemaUri system tests. Each format has
# three files whose schemas shrink from a- to c- (see the comments in the
# tests); the a- file doubles as the reference schema file.
_REFERENCE_FILE_SCHEMA_PREFIX = (
    "gs://cloud-samples-data/bigquery/federated-formats-reference-file-schema"
)
SOURCE_URIS_AVRO = [
    "{}/{}-twitter.avro".format(_REFERENCE_FILE_SCHEMA_PREFIX, stem)
    for stem in ("a", "b", "c")
]
SOURCE_URIS_PARQUET = [
    "{}/{}-twitter.parquet".format(_REFERENCE_FILE_SCHEMA_PREFIX, stem)
    for stem in ("a", "b", "c")
]
REFERENCE_FILE_SCHEMA_URI_AVRO = SOURCE_URIS_AVRO[0]
REFERENCE_FILE_SCHEMA_URI_PARQUET = SOURCE_URIS_PARQUET[0]
| 112 | + |
| 113 | + |
100 | 114 | # The VPC-SC team maintains a mirror of the GCS bucket used for code
|
101 | 115 | # samples. The public bucket crosses the configured security boundary.
|
102 | 116 | # See: https://github.com/googleapis/google-cloud-python/issues/8550
|
@@ -1052,6 +1066,195 @@ def test_load_table_from_file_w_explicit_location(self):
|
1052 | 1066 | table_ref, "gs://{}/letters-us.csv".format(bucket_name), location="US"
|
1053 | 1067 | ).result()
|
1054 | 1068 |
|
| 1069 | + def test_create_external_table_with_reference_file_schema_uri_avro(self): |
| 1070 | + client = Config.CLIENT |
| 1071 | + dataset_id = _make_dataset_id("external_reference_file_avro") |
| 1072 | + self.temp_dataset(dataset_id) |
| 1073 | + dataset_ref = bigquery.DatasetReference(client.project, dataset_id) |
| 1074 | + table_id = "test_ref_file_avro" |
| 1075 | + table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id) |
| 1076 | + |
| 1077 | + expected_schema = [ |
| 1078 | + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), |
| 1079 | + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), |
| 1080 | + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), |
| 1081 | + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), |
| 1082 | + ] |
| 1083 | + |
| 1084 | + # By default, the table should have the c-twitter schema because it is lexicographically last |
| 1085 | + # in the `SOURCE_URIs` list: |
| 1086 | + # a-twitter schema: (username, tweet, timestamp, likes) |
| 1087 | + # b-twitter schema: (username, tweet, timestamp) |
| 1088 | + # c-twitter schema: (username, tweet) |
| 1089 | + |
| 1090 | + # Because `referenceFileSchemaUri` is set as a-twitter, the table will have a-twitter schema |
| 1091 | + |
| 1092 | + # Create external data configuration |
| 1093 | + external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.AVRO) |
| 1094 | + external_config.source_uris = SOURCE_URIS_AVRO |
| 1095 | + external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO |
| 1096 | + |
| 1097 | + table = bigquery.Table(table_ref) |
| 1098 | + table.external_data_configuration = external_config |
| 1099 | + |
| 1100 | + table = client.create_table(table) |
| 1101 | + |
| 1102 | + # Get table created by the create_table API call |
| 1103 | + generated_table = client.get_table(table_ref) |
| 1104 | + |
| 1105 | + self.assertEqual(generated_table.schema, expected_schema) |
| 1106 | + self.assertEqual( |
| 1107 | + generated_table.external_data_configuration._properties[ |
| 1108 | + "referenceFileSchemaUri" |
| 1109 | + ], |
| 1110 | + REFERENCE_FILE_SCHEMA_URI_AVRO, |
| 1111 | + ) |
| 1112 | + |
| 1113 | + # Clean up test |
| 1114 | + self.to_delete.insert(0, generated_table) |
| 1115 | + |
| 1116 | + def test_load_table_from_uri_with_reference_file_schema_uri_avro(self): |
| 1117 | + dataset_id = _make_dataset_id("test_reference_file_avro") |
| 1118 | + self.temp_dataset(dataset_id) |
| 1119 | + client = Config.CLIENT |
| 1120 | + dataset_ref = bigquery.DatasetReference(client.project, dataset_id) |
| 1121 | + table_id = "test_ref_file_avro" |
| 1122 | + table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id) |
| 1123 | + |
| 1124 | + expected_schema = [ |
| 1125 | + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), |
| 1126 | + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), |
| 1127 | + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), |
| 1128 | + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), |
| 1129 | + ] |
| 1130 | + |
| 1131 | + # By default, the table should have the c-twitter schema because it is lexicographically last |
| 1132 | + # in the `SOURCE_URIS` list: |
| 1133 | + # a-twitter schema: (username, tweet, timestamp, likes) |
| 1134 | + # b-twitter schema: (username, tweet, timestamp) |
| 1135 | + # c-twitter schema: (username, tweet) |
| 1136 | + |
| 1137 | + # Because `referenceFileSchemaUri` is set as a-twitter, the table will have a-twitter schema |
| 1138 | + |
| 1139 | + # Create load job configuration |
| 1140 | + load_job_config = bigquery.LoadJobConfig( |
| 1141 | + source_format=bigquery.SourceFormat.AVRO |
| 1142 | + ) |
| 1143 | + load_job_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_AVRO |
| 1144 | + |
| 1145 | + load_job = client.load_table_from_uri( |
| 1146 | + source_uris=SOURCE_URIS_AVRO, |
| 1147 | + destination=table_ref, |
| 1148 | + job_config=load_job_config, |
| 1149 | + ) |
| 1150 | + # Wait for load job to complete |
| 1151 | + result = load_job.result() |
| 1152 | + |
| 1153 | + # Get table created by the load job |
| 1154 | + generated_table = client.get_table(table_ref) |
| 1155 | + self.assertEqual(generated_table.schema, expected_schema) |
| 1156 | + self.assertEqual( |
| 1157 | + result._properties["configuration"]["load"]["referenceFileSchemaUri"], |
| 1158 | + REFERENCE_FILE_SCHEMA_URI_AVRO, |
| 1159 | + ) |
| 1160 | + |
| 1161 | + # Clean up test |
| 1162 | + self.to_delete.insert(0, generated_table) |
| 1163 | + |
| 1164 | + def test_create_external_table_with_reference_file_schema_uri_parquet(self): |
| 1165 | + client = Config.CLIENT |
| 1166 | + dataset_id = _make_dataset_id("external_table_ref_file_parquet") |
| 1167 | + self.temp_dataset(dataset_id) |
| 1168 | + dataset_ref = bigquery.DatasetReference(client.project, dataset_id) |
| 1169 | + table_id = "test_ref_file_parquet" |
| 1170 | + table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id) |
| 1171 | + |
| 1172 | + expected_schema = [ |
| 1173 | + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), |
| 1174 | + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), |
| 1175 | + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), |
| 1176 | + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), |
| 1177 | + ] |
| 1178 | + |
| 1179 | + # By default, the table should have the c-twitter schema because it is lexicographically last |
| 1180 | + # in the `SOURCE_URIS` list: |
| 1181 | + # a-twitter schema: (username, tweet, timestamp, likes) |
| 1182 | + # b-twitter schema: (username, tweet, timestamp) |
| 1183 | + # c-twitter schema: (username, tweet) |
| 1184 | + |
| 1185 | + # Because `referenceFileSchemaUri` is set as a-twitter, the table will have a-twitter schema |
| 1186 | + |
| 1187 | + # Create external data configuration |
| 1188 | + external_config = bigquery.ExternalConfig(bigquery.ExternalSourceFormat.PARQUET) |
| 1189 | + external_config.source_uris = SOURCE_URIS_PARQUET |
| 1190 | + external_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_PARQUET |
| 1191 | + |
| 1192 | + table = bigquery.Table(table_ref) |
| 1193 | + table.external_data_configuration = external_config |
| 1194 | + |
| 1195 | + table = client.create_table(table) |
| 1196 | + |
| 1197 | + # Get table created by the create_table API call |
| 1198 | + generated_table = client.get_table(table_ref) |
| 1199 | + self.assertEqual(generated_table.schema, expected_schema) |
| 1200 | + self.assertEqual( |
| 1201 | + generated_table.external_data_configuration._properties[ |
| 1202 | + "referenceFileSchemaUri" |
| 1203 | + ], |
| 1204 | + REFERENCE_FILE_SCHEMA_URI_PARQUET, |
| 1205 | + ) |
| 1206 | + |
| 1207 | + # Clean up test |
| 1208 | + self.to_delete.insert(0, generated_table) |
| 1209 | + |
| 1210 | + def test_load_table_from_uri_with_reference_file_schema_uri_parquet(self): |
| 1211 | + dataset_id = _make_dataset_id("test_reference_file_parquet") |
| 1212 | + self.temp_dataset(dataset_id) |
| 1213 | + client = Config.CLIENT |
| 1214 | + dataset_ref = bigquery.DatasetReference(client.project, dataset_id) |
| 1215 | + table_id = "test_ref_file_parquet" |
| 1216 | + table_ref = bigquery.TableReference(dataset_ref=dataset_ref, table_id=table_id) |
| 1217 | + |
| 1218 | + expected_schema = [ |
| 1219 | + bigquery.SchemaField("username", "STRING", mode="NULLABLE"), |
| 1220 | + bigquery.SchemaField("tweet", "STRING", mode="NULLABLE"), |
| 1221 | + bigquery.SchemaField("timestamp", "STRING", mode="NULLABLE"), |
| 1222 | + bigquery.SchemaField("likes", "INTEGER", mode="NULLABLE"), |
| 1223 | + ] |
| 1224 | + |
| 1225 | + # By default, the table should have the c-twitter schema because it is lexicographically last |
| 1226 | + # in the `SOURCE_URIS` list: |
| 1227 | + # a-twitter schema: (username, tweet, timestamp, likes) |
| 1228 | + # b-twitter schema: (username, tweet, timestamp) |
| 1229 | + # c-twitter schema: (username, tweet) |
| 1230 | + |
| 1231 | + # Because `referenceFileSchemaUri` is set as a-twitter, the table will have a-twitter schema |
| 1232 | + |
| 1233 | + # Create load job configuration |
| 1234 | + load_job_config = bigquery.LoadJobConfig( |
| 1235 | + source_format=bigquery.SourceFormat.PARQUET |
| 1236 | + ) |
| 1237 | + load_job_config.reference_file_schema_uri = REFERENCE_FILE_SCHEMA_URI_PARQUET |
| 1238 | + |
| 1239 | + load_job = client.load_table_from_uri( |
| 1240 | + source_uris=SOURCE_URIS_PARQUET, |
| 1241 | + destination=table_ref, |
| 1242 | + job_config=load_job_config, |
| 1243 | + ) |
| 1244 | + # Wait for load job to complete |
| 1245 | + result = load_job.result() |
| 1246 | + |
| 1247 | + # Get table created by the load job |
| 1248 | + generated_table = client.get_table(table_ref) |
| 1249 | + self.assertEqual(generated_table.schema, expected_schema) |
| 1250 | + self.assertEqual( |
| 1251 | + result._properties["configuration"]["load"]["referenceFileSchemaUri"], |
| 1252 | + REFERENCE_FILE_SCHEMA_URI_PARQUET, |
| 1253 | + ) |
| 1254 | + |
| 1255 | + # Clean up test |
| 1256 | + self.to_delete.insert(0, generated_table) |
| 1257 | + |
1055 | 1258 | def _write_csv_to_storage(self, bucket_name, blob_name, header_row, data_rows):
|
1056 | 1259 | from google.cloud._testing import _NamedTemporaryFile
|
1057 | 1260 |
|
|
0 commit comments