Skip to content

Commit 90d9d2f

Browse files
aaronsteers authored and jatinyadav-cc committed
AirbyteLib: improve json schema type detection (airbytehq#35263)
1 parent a4baf28 commit 90d9d2f

File tree

3 files changed

+74
-6
lines changed

3 files changed

+74
-6
lines changed

airbyte-lib/airbyte_lib/types.py

+20-5
Original file line number | Diff line number | Diff line change
@@ -31,8 +31,8 @@ class SQLTypeConversionError(Exception):
3131
"""An exception to be raised when a type conversion fails."""
3232

3333

34-
def _get_airbyte_type(
35-
json_schema_property_def: dict[str, str | dict],
34+
def _get_airbyte_type( # noqa: PLR0911 # Too many return statements
35+
json_schema_property_def: dict[str, str | dict | list],
3636
) -> tuple[str, str | None]:
3737
"""Get the airbyte type and subtype from a JSON schema property definition.
3838
@@ -45,6 +45,13 @@ def _get_airbyte_type(
4545
json_schema_type = json_schema_property_def.get("type", None)
4646
json_schema_format = json_schema_property_def.get("format", None)
4747

48+
# if json_schema_type is an array of two strings with one of them being null, pick the other one
49+
# this strategy is often used by connectors to indicate a field might not be set all the time
50+
if isinstance(json_schema_type, list):
51+
non_null_types = [t for t in json_schema_type if t != "null"]
52+
if len(non_null_types) == 1:
53+
json_schema_type = non_null_types[0]
54+
4855
if json_schema_type == "string":
4956
if json_schema_format == "date":
5057
return "date", None
@@ -58,9 +65,17 @@ def _get_airbyte_type(
5865
if json_schema_type in ["string", "number", "boolean", "integer"]:
5966
return cast(str, json_schema_type), None
6067

61-
if json_schema_type == "object" and "properties" in json_schema_property_def:
68+
if json_schema_type == "object":
6269
return "object", None
6370

71+
if json_schema_type == "array":
72+
items_def = json_schema_property_def.get("items", None)
73+
if isinstance(items_def, dict):
74+
subtype, _ = _get_airbyte_type(items_def)
75+
return "array", subtype
76+
77+
return "array", None
78+
6479
err_msg = f"Could not determine airbyte type from JSON schema type: {json_schema_property_def}"
6580
raise SQLTypeConversionError(err_msg)
6681

@@ -81,11 +96,11 @@ def get_failover_type() -> sqlalchemy.types.TypeEngine:
8196

8297
def to_sql_type(
8398
self,
84-
json_schema_property_def: dict[str, str | dict],
99+
json_schema_property_def: dict[str, str | dict | list],
85100
) -> sqlalchemy.types.TypeEngine:
86101
"""Convert a value to a SQL type."""
87102
try:
88-
airbyte_type, airbyte_subtype = _get_airbyte_type(json_schema_property_def)
103+
airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
89104
return self.conversion_map[airbyte_type]()
90105
except SQLTypeConversionError:
91106
print(f"Could not determine airbyte type from JSON schema: {json_schema_property_def}")

airbyte-lib/pyproject.toml

+1
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,7 @@ ignore = [
135135
"PIE790", # Allow unnecessary 'pass' (sometimes useful for readability)
136136
"PERF203", # exception handling in loop
137137
"S", # flake8-bandit (noisy, security related)
138+
"SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
138139
"TD002", # Require author for TODOs
139140
"TRIO", # flake8-trio (opinionated, noisy)
140141
"INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.

airbyte-lib/tests/unit_tests/test_type_translation.py

+53-1
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,17 @@
22

33
import pytest
44
from sqlalchemy import types
5-
from airbyte_lib.types import SQLTypeConverter
5+
from airbyte_lib.types import SQLTypeConverter, _get_airbyte_type
6+
67

78
@pytest.mark.parametrize(
89
"json_schema_property_def, expected_sql_type",
910
[
1011
({"type": "string"}, types.VARCHAR),
12+
({"type": ["boolean", "null"]}, types.BOOLEAN),
13+
({"type": ["null", "boolean"]}, types.BOOLEAN),
14+
({"type": "string"}, types.VARCHAR),
15+
({"type": ["null", "string"]}, types.VARCHAR),
1116
({"type": "boolean"}, types.BOOLEAN),
1217
({"type": "string", "format": "date"}, types.DATE),
1318
({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, types.TIMESTAMP),
@@ -25,3 +30,50 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type):
2530
converter = SQLTypeConverter()
2631
sql_type = converter.to_sql_type(json_schema_property_def)
2732
assert isinstance(sql_type, expected_sql_type)
33+
34+
35+
@pytest.mark.parametrize(
36+
"json_schema_property_def, expected_airbyte_type",
37+
[
38+
({"type": "string"}, "string"),
39+
({"type": ["boolean", "null"]}, "boolean"),
40+
({"type": ["null", "boolean"]}, "boolean"),
41+
({"type": "string"}, "string"),
42+
({"type": ["null", "string"]}, "string"),
43+
({"type": "boolean"}, "boolean"),
44+
({"type": "string", "format": "date"}, "date"),
45+
({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, "timestamp_without_timezone"),
46+
({"type": "string", "format": "date-time", "airbyte_type": "timestamp_with_timezone"}, "timestamp_with_timezone"),
47+
({"type": "string", "format": "time", "airbyte_type": "time_without_timezone"}, "time_without_timezone"),
48+
({"type": "string", "format": "time", "airbyte_type": "time_with_timezone"}, "time_with_timezone"),
49+
({"type": "integer"}, "integer"),
50+
({"type": "number", "airbyte_type": "integer"}, "integer"),
51+
({"type": "number"}, "number"),
52+
({"type": "array"}, "array"),
53+
({"type": "object"}, "object"),
54+
],
55+
)
56+
def test_to_airbyte_type(json_schema_property_def, expected_airbyte_type):
57+
airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
58+
assert airbyte_type == expected_airbyte_type
59+
60+
61+
@pytest.mark.parametrize(
62+
"json_schema_property_def, expected_airbyte_type, expected_airbyte_subtype",
63+
[
64+
({"type": "string"}, "string", None),
65+
({"type": "number"}, "number", None),
66+
({"type": "array"}, "array", None),
67+
({"type": "object"}, "object", None),
68+
({"type": "array", "items": {"type": ["null", "string"]}}, "array", "string"),
69+
({"type": "array", "items": {"type": ["boolean"]}}, "array", "boolean"),
70+
],
71+
)
72+
def test_to_airbyte_subtype(
73+
json_schema_property_def,
74+
expected_airbyte_type,
75+
expected_airbyte_subtype,
76+
):
77+
airbyte_type, subtype = _get_airbyte_type(json_schema_property_def)
78+
assert airbyte_type == expected_airbyte_type
79+
assert subtype == expected_airbyte_subtype

0 commit comments

Comments
 (0)