Skip to content

AirbyteLib: improve json schema type detection #35263

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions airbyte-lib/airbyte_lib/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ class SQLTypeConversionError(Exception):
"""An exception to be raised when a type conversion fails."""


def _get_airbyte_type(
json_schema_property_def: dict[str, str | dict],
def _get_airbyte_type( # noqa: PLR0911 # Too many return statements
json_schema_property_def: dict[str, str | dict | list],
) -> tuple[str, str | None]:
"""Get the airbyte type and subtype from a JSON schema property definition.

Expand All @@ -45,6 +45,13 @@ def _get_airbyte_type(
json_schema_type = json_schema_property_def.get("type", None)
json_schema_format = json_schema_property_def.get("format", None)

# If json_schema_type is a list containing exactly one non-null type (e.g. ["null", "string"]),
# use that type. Connectors often use this form to indicate a field might not be set all the time.
if isinstance(json_schema_type, list):
non_null_types = [t for t in json_schema_type if t != "null"]
if len(non_null_types) == 1:
json_schema_type = non_null_types[0]

if json_schema_type == "string":
if json_schema_format == "date":
return "date", None
Expand All @@ -58,9 +65,17 @@ def _get_airbyte_type(
if json_schema_type in ["string", "number", "boolean", "integer"]:
return cast(str, json_schema_type), None

if json_schema_type == "object" and "properties" in json_schema_property_def:
if json_schema_type == "object":
return "object", None

if json_schema_type == "array":
items_def = json_schema_property_def.get("items", None)
if isinstance(items_def, dict):
subtype, _ = _get_airbyte_type(items_def)
return "array", subtype

return "array", None

err_msg = f"Could not determine airbyte type from JSON schema type: {json_schema_property_def}"
raise SQLTypeConversionError(err_msg)

Expand All @@ -81,11 +96,11 @@ def get_failover_type() -> sqlalchemy.types.TypeEngine:

def to_sql_type(
self,
json_schema_property_def: dict[str, str | dict],
json_schema_property_def: dict[str, str | dict | list],
) -> sqlalchemy.types.TypeEngine:
"""Convert a value to a SQL type."""
try:
airbyte_type, airbyte_subtype = _get_airbyte_type(json_schema_property_def)
airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
return self.conversion_map[airbyte_type]()
except SQLTypeConversionError:
print(f"Could not determine airbyte type from JSON schema: {json_schema_property_def}")
Expand Down
1 change: 1 addition & 0 deletions airbyte-lib/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ ignore = [
"PIE790", # Allow unnecessary 'pass' (sometimes useful for readability)
"PERF203", # exception handling in loop
"S", # flake8-bandit (noisy, security related)
"SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
"TD002", # Require author for TODOs
"TRIO", # flake8-trio (opinionated, noisy)
"INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.
Expand Down
54 changes: 53 additions & 1 deletion airbyte-lib/tests/unit_tests/test_type_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@

import pytest
from sqlalchemy import types
from airbyte_lib.types import SQLTypeConverter
from airbyte_lib.types import SQLTypeConverter, _get_airbyte_type


@pytest.mark.parametrize(
"json_schema_property_def, expected_sql_type",
[
({"type": "string"}, types.VARCHAR),
({"type": ["boolean", "null"]}, types.BOOLEAN),
({"type": ["null", "boolean"]}, types.BOOLEAN),
({"type": "string"}, types.VARCHAR),
({"type": ["null", "string"]}, types.VARCHAR),
({"type": "boolean"}, types.BOOLEAN),
({"type": "string", "format": "date"}, types.DATE),
({"type": "string", "format": "date-time", "airbyte_type": "timestamp_without_timezone"}, types.TIMESTAMP),
Expand All @@ -25,3 +30,50 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type):
converter = SQLTypeConverter()
sql_type = converter.to_sql_type(json_schema_property_def)
assert isinstance(sql_type, expected_sql_type)


@pytest.mark.parametrize(
    "json_schema_property_def, expected_airbyte_type",
    [
        # Plain scalar types.
        ({"type": "string"}, "string"),
        ({"type": "boolean"}, "boolean"),
        ({"type": "integer"}, "integer"),
        ({"type": "number"}, "number"),
        # Nullable unions: the single non-null member is expected, in either order.
        ({"type": ["boolean", "null"]}, "boolean"),
        ({"type": ["null", "boolean"]}, "boolean"),
        ({"type": ["null", "string"]}, "string"),
        # Strings refined by "format" and/or "airbyte_type".
        ({"type": "string", "format": "date"}, "date"),
        (
            {
                "type": "string",
                "format": "date-time",
                "airbyte_type": "timestamp_without_timezone",
            },
            "timestamp_without_timezone",
        ),
        (
            {
                "type": "string",
                "format": "date-time",
                "airbyte_type": "timestamp_with_timezone",
            },
            "timestamp_with_timezone",
        ),
        (
            {"type": "string", "format": "time", "airbyte_type": "time_without_timezone"},
            "time_without_timezone",
        ),
        (
            {"type": "string", "format": "time", "airbyte_type": "time_with_timezone"},
            "time_with_timezone",
        ),
        # A number whose "airbyte_type" is expected to narrow it to an integer.
        ({"type": "number", "airbyte_type": "integer"}, "integer"),
        # Container types without any further detail.
        ({"type": "array"}, "array"),
        ({"type": "object"}, "object"),
    ],
)
def test_to_airbyte_type(json_schema_property_def, expected_airbyte_type):
    """Check the airbyte type resolved from a JSON schema property definition.

    Only the first element of the tuple returned by ``_get_airbyte_type`` (the
    airbyte type) is asserted here; the second element (the subtype) is ignored.
    """
    airbyte_type, _ = _get_airbyte_type(json_schema_property_def)
    assert airbyte_type == expected_airbyte_type


@pytest.mark.parametrize(
    "json_schema_property_def, expected_airbyte_type, expected_airbyte_subtype",
    [
        # Scalars and bare containers carry no subtype.
        ({"type": "string"}, "string", None),
        ({"type": "number"}, "number", None),
        ({"type": "array"}, "array", None),
        ({"type": "object"}, "object", None),
        # Arrays with an "items" definition report the item type as the subtype.
        ({"type": "array", "items": {"type": ["null", "string"]}}, "array", "string"),
        ({"type": "array", "items": {"type": ["boolean"]}}, "array", "boolean"),
    ],
)
def test_to_airbyte_subtype(
    json_schema_property_def,
    expected_airbyte_type,
    expected_airbyte_subtype,
):
    """Check both the airbyte type and subtype resolved from a property definition."""
    resolved_type, resolved_subtype = _get_airbyte_type(json_schema_property_def)
    expected_pair = (expected_airbyte_type, expected_airbyte_subtype)
    assert (resolved_type, resolved_subtype) == expected_pair