|
| 1 | +# Copyright (c) 2024 Airbyte, Inc., all rights reserved. |
| 2 | + |
| 3 | +from pathlib import Path |
| 4 | +from typing import List, Set, Union |
| 5 | + |
| 6 | +import yaml |
| 7 | +from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver |
| 8 | +from airbyte_protocol.models import AirbyteCatalog, AirbyteStream # type: ignore # missing library stubs or py.typed marker |
| 9 | +from erd.relationships import Relationships |
| 10 | +from pydbml import Database # type: ignore # missing library stubs or py.typed marker |
| 11 | +from pydbml.classes import Column, Index, Reference, Table # type: ignore # missing library stubs or py.typed marker |
| 12 | + |
| 13 | + |
class Source:
    """Filesystem view of a connector source (its manifest and/or `schemas` folder)."""

    def __init__(self, source_folder: Path, source_technical_name: str) -> None:
        """
        Args:
            source_folder: Root folder containing the connector code.
            source_technical_name: Connector technical name, e.g. `source-faker` (dashes, not underscores).
        """
        self._source_folder = source_folder
        self._source_technical_name = source_technical_name

    def is_dynamic(self, stream_name: str) -> bool:
        """
        Very flaky heuristic to decide whether a stream's schema is dynamic.

        A stream is considered *static* (i.e. NOT dynamic) when either:
        * Its name matches a JSON schema file in the `schemas` folder, or
        * It is declared in the manifest with an `InlineSchemaLoader`.

        Every other stream is treated as dynamic.
        """
        manifest_static_streams: Set[str] = set()
        if self._has_manifest():
            with open(self._get_manifest_path()) as manifest_file:
                resolved_manifest = ManifestReferenceResolver().preprocess_manifest(yaml.safe_load(manifest_file))
            for stream in resolved_manifest["streams"]:
                if "schema_loader" not in stream:
                    # Stream is assumed to have `DefaultSchemaLoader`, which will show in the schemas folder, so we can skip it
                    continue
                if stream["schema_loader"]["type"] == "InlineSchemaLoader":
                    # `$parameters` may be absent entirely, hence the defaulted lookup; reuse `name` below instead of
                    # recomputing it (the previous recomputation could crash when `$parameters` was missing)
                    name = stream.get("name") or stream.get("$parameters", {}).get("name")
                    if not name:
                        print(f"Could not retrieve name for this stream: {stream}")
                        continue
                    manifest_static_streams.add(name)

        return stream_name not in manifest_static_streams | self._get_streams_from_schemas_folder()

    def _get_streams_from_schemas_folder(self) -> Set[str]:
        """Return stream names inferred from the JSON schema files in the `schemas` folder (empty if none)."""
        schemas_folder = self._source_folder / self._source_technical_name.replace("-", "_") / "schemas"
        if not schemas_folder.exists():
            return set()
        return {entry.name.replace(".json", "") for entry in schemas_folder.iterdir() if entry.is_file()}

    def _get_manifest_path(self) -> Path:
        """Path where a low-code connector's manifest would live (may not exist)."""
        return self._source_folder / self._source_technical_name.replace("-", "_") / "manifest.yaml"

    def _has_manifest(self) -> bool:
        """True when the connector ships a low-code `manifest.yaml`."""
        return self._get_manifest_path().exists()
| 51 | + |
| 52 | + |
class DbmlAssembler:
    """Assembles a DBML `Database` from a discovered Airbyte catalog and a set of stream relationships."""

    def assemble(self, source: Source, discovered_catalog: AirbyteCatalog, relationships: Relationships) -> Database:
        """
        Build the DBML database: one table per static stream, then references for `relationships`.

        Dynamic streams are skipped because their schemas are not stable enough to diagram.
        """
        database = Database()
        for stream in discovered_catalog.streams:
            if source.is_dynamic(stream.name):
                print(f"Skipping stream {stream.name} as it is dynamic")
                continue

            database.add(self._create_table(stream))

        self._add_references(source, database, relationships)

        return database

    def _create_table(self, stream: AirbyteStream) -> Table:
        """Create a DBML table for `stream`, with one column per top-level JSON schema property."""
        dbml_table = Table(stream.name)
        # `properties` can be missing from the JSON schema; default to an empty mapping instead of crashing on None
        for property_name, property_information in (stream.json_schema.get("properties") or {}).items():
            try:
                dbml_table.add_column(
                    Column(
                        name=property_name,
                        type=self._extract_type(property_information["type"]),
                        pk=self._is_pk(stream, property_name),
                    )
                )
            except (KeyError, ValueError) as exception:
                # Best-effort: one malformed property should not prevent the rest of the table from being built
                print(f"Ignoring field {property_name}: {exception}")
                continue

        if stream.source_defined_primary_key and len(stream.source_defined_primary_key) > 1:
            # Composite primary keys are modeled as a `pk` index over the participating columns
            if any(len(key) != 1 for key in stream.source_defined_primary_key):
                raise ValueError(f"Does not support nested key as part of primary key `{stream.source_defined_primary_key}`")

            composite_key_columns = [
                column for key in stream.source_defined_primary_key for column in dbml_table.columns if column.name in key
            ]
            if len(composite_key_columns) < len(stream.source_defined_primary_key):
                raise ValueError("Unexpected error: missing PK column from dbml table")

            dbml_table.add_index(
                Index(
                    subjects=composite_key_columns,
                    pk=True,
                )
            )
        return dbml_table

    def _add_references(self, source: Source, database: Database, relationships: Relationships) -> None:
        """Add (many-to-many) references between tables based on `relationships`, skipping dynamic streams."""
        for stream in relationships["streams"]:
            # `is_dynamic` re-reads the manifest each call, so check the source stream once instead of once per relation
            if source.is_dynamic(stream["name"]):
                print(f"Skipping relationship as stream {stream['name']} from relationship is dynamic")
                continue

            for column_name, relationship in stream["relations"].items():
                try:
                    # we support the field names having dots but not stream name hence we split on the first dot only
                    target_table_name, target_column_name = relationship.split(".", 1)
                except ValueError as exception:
                    raise ValueError(f"Could not handle relationship {relationship}") from exception

                if source.is_dynamic(target_table_name):
                    print(f"Skipping relationship as target stream {target_table_name} is dynamic")
                    continue

                try:
                    database.add_reference(
                        Reference(
                            type="<>",  # we don't have the information of which relationship type it is so we assume many-to-many for now
                            col1=self._get_column(database, stream["name"], column_name),
                            col2=self._get_column(database, target_table_name, target_column_name),
                        )
                    )
                except ValueError as exception:
                    print(f"Skipping relationship: {exception}")

    def _extract_type(self, property_type: Union[str, List[str]]) -> str:
        """
        Reduce a JSON schema `type` to a single DBML column type.

        Raises:
            ValueError: if more than one non-null type is declared.
        """
        if isinstance(property_type, str):
            return property_type

        types = list(property_type)
        if "null" in types:
            # As we flag everything as nullable (except PK and cursor field), there is little value in keeping the information in order to
            # show this in DBML
            types.remove("null")
        if len(types) != 1:
            raise ValueError(f"Expected only one type apart from `null` but got {len(types)}: {property_type}")
        return types[0]

    def _is_pk(self, stream: AirbyteStream, property_name: str) -> bool:
        """True when `property_name` is the single, non-composite source-defined primary key."""
        return stream.source_defined_primary_key == [[property_name]]

    def _get_column(self, database: Database, table_name: str, column_name: str) -> Column:
        """
        Look up a column by table and column name.

        Raises:
            ValueError: if the table or column is missing, or ambiguously duplicated.
        """
        matching_tables = [dbml_table for dbml_table in database.tables if dbml_table.name == table_name]
        if len(matching_tables) == 0:
            raise ValueError(f"Could not find table {table_name}")
        elif len(matching_tables) > 1:
            raise ValueError(f"Unexpected error: many tables found with name {table_name}")

        table: Table = matching_tables[0]
        matching_columns = [column for column in table.columns if column.name == column_name]
        if len(matching_columns) == 0:
            raise ValueError(f"Could not find column {column_name} in table {table_name}. Columns are: {table.columns}")
        elif len(matching_columns) > 1:
            raise ValueError(f"Unexpected error: many columns found with name {column_name} for table {table_name}")

        return matching_columns[0]
0 commit comments