Skip to content

Commit 9603c3c

Browse files
artem1205maxi297
andauthored
feat(Airbyte-ci): add command generate-erd-schema (#43310)
Signed-off-by: Artem Inzhyyants <[email protected]> Co-authored-by: maxi297 <[email protected]> Co-authored-by: Maxime Carbonneau-Leclerc <[email protected]>
1 parent 7ba3e2d commit 9603c3c

File tree

21 files changed

+2906
-0
lines changed

21 files changed

+2906
-0
lines changed

.github/workflows/airbyte-ci-tests.yml

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ jobs:
3939
- airbyte-ci/connectors/connector_ops/**
4040
- airbyte-ci/connectors/connectors_qa/**
4141
- airbyte-ci/connectors/ci_credentials/**
42+
- airbyte-ci/connectors/erd/**
4243
- airbyte-ci/connectors/metadata_service/lib/**
4344
- airbyte-ci/connectors/metadata_service/orchestrator/**
4445
- airbyte-cdk/python/**

airbyte-ci/connectors/erd/README.md

+39
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# erd
2+
3+
A collection of utilities for generating ERDs.
4+
5+
# Setup
6+
7+
## Installation
8+
9+
`erd` tools use [Poetry](https://github.com/python-poetry/poetry) to manage dependencies,
10+
and targets Python 3.10 and higher.
11+
12+
Assuming you're in Airbyte repo root:
13+
14+
```bash
15+
cd airbyte-ci/connectors/erd
16+
poetry install
17+
```
18+
19+
## Usage
20+
21+
Pre-requisites:
22+
* Env variable `GENAI_API_KEY`. Can be found at URL https://aistudio.google.com/app/apikey
23+
24+
`poetry run erd --source-path <source path> --source-technical-name <for example, 'source-facebook-marketing'>`
25+
26+
The script supports skipping the LLM-based relationship generation by passing the parameter `--skip-llm-relationships`
27+
28+
## Contributing to `erd`
29+
30+
### Running tests
31+
32+
To run tests locally:
33+
34+
```bash
35+
poetry run pytest
36+
```
37+
38+
## Changelog
39+
- 0.1.0: Initial commit

airbyte-ci/connectors/erd/poetry.lock

+2,001
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
[build-system]
2+
requires = ["poetry-core>=1.0.0"]
3+
build-backend = "poetry.core.masonry.api"
4+
5+
[tool.poetry]
6+
name = "erd"
7+
version = "0.1.0"
8+
description = "Contains utilities for generating ERDs."
9+
authors = ["Airbyte <[email protected]>"]
10+
license = "MIT"
11+
homepage = "https://github.com/airbytehq/airbyte"
12+
readme = "README.md"
13+
packages = [
14+
{ include = "erd", from = "src" },
15+
]
16+
17+
[tool.poetry.dependencies]
18+
python = "^3.10,<3.12"
19+
airbyte-cdk = "*"
20+
click = "^8.1.3"
21+
dpath = "^2.1.6"
22+
google-generativeai = "^0.7.2"
23+
markdown-it-py = ">=2.2.0"
24+
pydbml = "^1.1.0"
25+
pytest = "^8.1.1"
26+
pyyaml = "^6.0"
27+
28+
[tool.poetry.group.dev.dependencies]
29+
ruff = "^0.3.0"
30+
mypy = "^1.8.0"
31+
types-pyyaml = "^6.0.12.20240311"
32+
33+
[tool.ruff.lint]
34+
select = ["I", "F"]
35+
36+
[tool.ruff.lint.isort]
37+
known-first-party = ["erd"]
38+
39+
[tool.poe.tasks]
40+
test = "pytest tests"
41+
type_check = "mypy src --disallow-untyped-defs"
42+
pre-push = []
43+
44+
[tool.poetry.scripts]
45+
erd = "erd.cli:main"
46+
47+
[tool.airbyte_ci]
48+
python_versions = ["3.10"]
49+
poe_tasks = ["type_check", "test"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
from pathlib import Path
4+
from typing import List, Set, Union
5+
6+
import yaml
7+
from airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver import ManifestReferenceResolver
8+
from airbyte_protocol.models import AirbyteCatalog, AirbyteStream # type: ignore # missing library stubs or py.typed marker
9+
from erd.relationships import Relationships
10+
from pydbml import Database # type: ignore # missing library stubs or py.typed marker
11+
from pydbml.classes import Column, Index, Reference, Table # type: ignore # missing library stubs or py.typed marker
12+
13+
14+
class Source:
    """Represents a connector source folder on disk and answers whether a given stream is dynamic."""

    def __init__(self, source_folder: Path, source_technical_name: str) -> None:
        """
        :param source_folder: root folder of the connector (e.g. `.../source-facebook-marketing`)
        :param source_technical_name: connector technical name, e.g. `source-facebook-marketing`
        """
        self._source_folder = source_folder
        self._source_technical_name = source_technical_name

    def is_dynamic(self, stream_name: str) -> bool:
        """
        This method is a very flaky heuristic to know if a stream is dynamic or not. A stream will be considered static if:
        * The stream name is in the schemas folder
        * The stream is within the manifest and the schema definition is `InlineSchemaLoader`
        """
        manifest_static_streams = set()
        if self._has_manifest():
            with open(self._get_manifest_path()) as manifest_file:
                resolved_manifest = ManifestReferenceResolver().preprocess_manifest(yaml.safe_load(manifest_file))
                for stream in resolved_manifest["streams"]:
                    if "schema_loader" not in stream:
                        # stream is assumed to have `DefaultSchemaLoader` which will show in the schemas folder so we can skip
                        continue
                    if stream["schema_loader"]["type"] == "InlineSchemaLoader":
                        # Default `$parameters` to `{}`: `stream.get("$parameters")` is None when the key is
                        # absent, and `.get` on None would raise AttributeError.
                        name = stream["name"] if "name" in stream else stream.get("$parameters", {}).get("name", None)
                        if not name:
                            print(f"Could not retrieve name for this stream: {stream}")
                            continue
                        # Reuse the already-validated `name` instead of re-evaluating the unsafe expression.
                        manifest_static_streams.add(name)

        return stream_name not in manifest_static_streams | self._get_streams_from_schemas_folder()

    def _get_streams_from_schemas_folder(self) -> Set[str]:
        """Return the stream names implied by the JSON schema files in the connector's `schemas` folder."""
        schemas_folder = self._source_folder / self._source_technical_name.replace("-", "_") / "schemas"
        return {p.name.replace(".json", "") for p in schemas_folder.iterdir() if p.is_file()} if schemas_folder.exists() else set()

    def _get_manifest_path(self) -> Path:
        """Return the expected path of the low-code connector manifest."""
        return self._source_folder / self._source_technical_name.replace("-", "_") / "manifest.yaml"

    def _has_manifest(self) -> bool:
        """Return True if this connector is a low-code connector with a manifest file."""
        return self._get_manifest_path().exists()
51+
52+
53+
class DbmlAssembler:
    """Builds a DBML `Database` from an Airbyte discovered catalog and a set of stream relationships."""

    def assemble(self, source: Source, discovered_catalog: AirbyteCatalog, relationships: Relationships) -> Database:
        """
        Create a DBML database with one table per static stream and one reference per relationship.

        Dynamic streams (see `Source.is_dynamic`) are skipped entirely.
        """
        database = Database()
        for stream in discovered_catalog.streams:
            if source.is_dynamic(stream.name):
                print(f"Skipping stream {stream.name} as it is dynamic")
                continue

            database.add(self._create_table(stream))

        self._add_references(source, database, relationships)

        return database

    def _create_table(self, stream: AirbyteStream) -> Table:
        """
        Create a DBML table for `stream`, marking single-column PKs on the column and
        composite PKs through an index.

        :raises ValueError: on nested composite keys or a PK column missing from the schema
        """
        dbml_table = Table(stream.name)
        # Default to an empty mapping so a schema without declared properties does not crash
        # with `AttributeError: 'NoneType' object has no attribute 'items'`.
        for property_name, property_information in stream.json_schema.get("properties", {}).items():
            try:
                dbml_table.add_column(
                    Column(
                        name=property_name,
                        type=self._extract_type(property_information["type"]),
                        pk=self._is_pk(stream, property_name),
                    )
                )
            except (KeyError, ValueError) as exception:
                # Best-effort: a field with a missing or ambiguous type is dropped, not fatal.
                print(f"Ignoring field {property_name}: {exception}")
                continue

        if stream.source_defined_primary_key and len(stream.source_defined_primary_key) > 1:
            if any(map(lambda key: len(key) != 1, stream.source_defined_primary_key)):
                raise ValueError(f"Does not support nested key as part of primary key `{stream.source_defined_primary_key}`")

            composite_key_columns = [
                column for key in stream.source_defined_primary_key for column in dbml_table.columns if column.name in key
            ]
            if len(composite_key_columns) < len(stream.source_defined_primary_key):
                raise ValueError("Unexpected error: missing PK column from dbml table")

            dbml_table.add_index(
                Index(
                    subjects=composite_key_columns,
                    pk=True,
                )
            )
        return dbml_table

    def _add_references(self, source: Source, database: Database, relationships: Relationships) -> None:
        """Add a many-to-many reference for each relationship whose source and target streams are static."""
        for stream in relationships["streams"]:
            # Hoisted out of the per-column loop: the answer only depends on the stream, so checking
            # once avoids redundant `is_dynamic` calls (and duplicated log lines) for every column.
            if source.is_dynamic(stream["name"]):
                print(f"Skipping relationship as stream {stream['name']} from relationship is dynamic")
                continue

            for column_name, relationship in stream["relations"].items():
                try:
                    target_table_name, target_column_name = relationship.split(
                        ".", 1
                    )  # we support the field names having dots but not stream name hence we split on the first dot only
                except ValueError as exception:
                    raise ValueError(f"Could not handle relationship {relationship}") from exception

                if source.is_dynamic(target_table_name):
                    print(f"Skipping relationship as target stream {target_table_name} is dynamic")
                    continue

                try:
                    database.add_reference(
                        Reference(
                            type="<>",  # we don't have the information of which relationship type it is so we assume many-to-many for now
                            col1=self._get_column(database, stream["name"], column_name),
                            col2=self._get_column(database, target_table_name, target_column_name),
                        )
                    )
                except ValueError as exception:
                    print(f"Skipping relationship: {exception}")

    def _extract_type(self, property_type: Union[str, List[str]]) -> str:
        """
        Return the single non-null JSON schema type for a property.

        :raises ValueError: if more than one non-null type (or none at all) is declared
        """
        if isinstance(property_type, str):
            return property_type

        types = list(property_type)
        if "null" in types:
            # As we flag everything as nullable (except PK and cursor field), there is little value in keeping the information in order to
            # show this in DBML
            types.remove("null")
        if len(types) != 1:
            raise ValueError(f"Expected only one type apart from `null` but got {len(types)}: {property_type}")
        return types[0]

    def _is_pk(self, stream: AirbyteStream, property_name: str) -> bool:
        """Return True if `property_name` is the single, non-composite source-defined primary key."""
        return stream.source_defined_primary_key == [[property_name]]

    def _get_column(self, database: Database, table_name: str, column_name: str) -> Column:
        """
        Look up a column by table name and column name.

        :raises ValueError: if the table or column cannot be found, or the name is ambiguous
        """
        matching_tables = list(filter(lambda dbml_table: dbml_table.name == table_name, database.tables))
        if len(matching_tables) == 0:
            raise ValueError(f"Could not find table {table_name}")
        elif len(matching_tables) > 1:
            raise ValueError(f"Unexpected error: many tables found with name {table_name}")

        table: Table = matching_tables[0]
        matching_columns = list(filter(lambda column: column.name == column_name, table.columns))
        if len(matching_columns) == 0:
            raise ValueError(f"Could not find column {column_name} in table {table_name}. Columns are: {table.columns}")
        elif len(matching_columns) > 1:
            raise ValueError(f"Unexpected error: many columns found with name {column_name} for table {table_name}")

        return matching_columns[0]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
import copy
4+
import json
5+
from pathlib import Path
6+
from typing import Any
7+
8+
import dpath
9+
import google.generativeai as genai # type: ignore # missing library stubs or py.typed marker
10+
from airbyte_protocol.models import AirbyteCatalog # type: ignore # missing library stubs or py.typed marker
11+
from erd.dbml_assembler import DbmlAssembler, Source
12+
from erd.relationships import Relationships, RelationshipsMerger
13+
from markdown_it import MarkdownIt
14+
from pydbml.renderer.dbml.default import DefaultDBMLRenderer # type: ignore # missing library stubs or py.typed marker
15+
16+
17+
class ErdService:
    """Generates ERD artifacts for a connector: LLM-estimated relationships and a DBML file."""

    def __init__(self, source_technical_name: str, source_path: Path) -> None:
        """
        :param source_technical_name: connector technical name, e.g. `source-facebook-marketing`
        :param source_path: connector folder; must contain `erd/discovered_catalog.json`
        :raises ValueError: if the discovered catalog file is missing
        """
        self._source_technical_name = source_technical_name
        self._source_path = source_path

        # Fail fast before instantiating the LLM client: without the discovered catalog no useful
        # work can be done.
        if not self._discovered_catalog_path.exists():
            raise ValueError(f"Could not find discovered catalog at path {self._discovered_catalog_path}")

        self._model = genai.GenerativeModel("gemini-1.5-flash")

    def generate_estimated_relationships(self) -> None:
        """Ask Gemini for likely foreign-key relationships and persist them as JSON."""
        normalized_catalog = self._normalize_schema_catalog(self._get_catalog())
        estimated_relationships = self._get_relations_from_gemini(source_name=self._source_path.name, catalog=normalized_catalog)
        with open(self._estimated_relationships_file, "w") as estimated_relationship_file:
            json.dump(estimated_relationships, estimated_relationship_file, indent=4)

    def write_dbml_file(self) -> None:
        """Assemble the DBML database from the catalog and the merged (estimated + confirmed) relationships."""
        database = DbmlAssembler().assemble(
            Source(self._source_path, self._source_technical_name),
            self._get_catalog(),
            RelationshipsMerger().merge(
                self._get_relationships(self._estimated_relationships_file), self._get_relationships(self._confirmed_relationships_file)
            ),
        )

        with open(self._erd_folder / "source.dbml", "w") as f:
            f.write(DefaultDBMLRenderer.render_db(database))

    @staticmethod
    def _normalize_schema_catalog(catalog: AirbyteCatalog) -> dict[str, Any]:
        """
        Foreign key cannot be of type object or array, therefore, we can remove these properties.
        :param catalog: discovered catalog whose streams' json_schema are in draft7
        :return: streams with json_schema in draft7 with TOP level properties only.
        """
        streams = copy.deepcopy(catalog.model_dump())["streams"]
        for stream in streams:
            # Collect every top-level property whose type mentions "array" or "object" and drop it.
            to_rem = dpath.search(
                stream["json_schema"]["properties"],
                ["**"],
                afilter=lambda x: isinstance(x, dict) and ("array" in str(x.get("type", "")) or "object" in str(x.get("type", ""))),
            )
            for key in to_rem:
                stream["json_schema"]["properties"].pop(key)
        return streams  # type: ignore # as this comes from an AirbyteCatalog dump, the format should be fine

    def _get_relations_from_gemini(self, source_name: str, catalog: dict[str, Any]) -> Relationships:
        """
        Prompt Gemini to infer foreign-key relationships between the streams of the catalog.

        :param source_name: name of the API service, used to contextualize the prompt
        :param catalog: normalized catalog (top-level scalar properties only)
        :return: {"streams":[{'name': 'ads', 'relations': {'account_id': 'ad_account.id', 'campaign_id': 'campaigns.id', 'adset_id': 'ad_sets.id'}}, ...]}
        """
        system = "You are an Database developer in charge of communicating well to your users."

        source_desc = """
You are working on the {source_name} API service.

The current JSON Schema format is as follows:
{current_schema}, where "streams" has a list of streams, which represents database tables, and list of properties in each, which in turn, represent DB columns. Streams presented in list are the only available ones.
Generate and add a `foreign_key` with reference for each field in top level of properties that is helpful in understanding what the data represents and how are streams related to each other. Pay attention to fields ends with '_id'.
""".format(
            source_name=source_name, current_schema=catalog
        )
        task = """
Please provide answer in the following format:
{streams: [{"name": "<stream_name>", "relations": {"<foreign_key>": "<ref_table.column_name>"} }]}
Pay extra attention that in <ref_table.column_name>" "ref_table" should be one of the list of streams, and "column_name" should be one of the property in respective reference stream.
Limitations:
- Not all tables should have relations
- Reference should point to 1 table only.
- table cannot reference on itself, on other words, e.g. `ad_account` cannot have relations with "ad_account" as a "ref_table"
"""
        response = self._model.generate_content(f"{system} {source_desc} {task}")
        # Gemini wraps the JSON answer in markdown; parse it and take the first token's content.
        md = MarkdownIt("commonmark")
        tokens = md.parse(response.text)
        response_json = json.loads(tokens[0].content)
        return response_json  # type: ignore # we blindly assume Gemini returns a response with the Relationships format as asked

    @staticmethod
    def _get_relationships(path: Path) -> Relationships:
        """Load a relationships JSON file, defaulting to no streams when the file does not exist."""
        if not path.exists():
            return {"streams": []}

        with open(path, "r") as file:
            return json.load(file)  # type: ignore # we assume the content of the file matches Relationships

    def _get_catalog(self) -> AirbyteCatalog:
        """
        Load and validate the discovered catalog from disk.

        :raises ValueError: if the file does not contain valid JSON
        """
        with open(self._discovered_catalog_path, "r") as file:
            try:
                return AirbyteCatalog.model_validate(json.loads(file.read()))
            except json.JSONDecodeError as error:
                # Chain the decode error so the root cause stays visible in the traceback.
                raise ValueError(
                    f"Could not read json file {self._discovered_catalog_path}: {error}. Please ensure that it is a valid JSON."
                ) from error

    @property
    def _erd_folder(self) -> Path:
        """
        Note: if this folder change, make sure to update the exported folder in the pipeline
        """
        path = self._source_path / "erd"
        if not path.exists():
            path.mkdir()
        return path

    @property
    def _estimated_relationships_file(self) -> Path:
        # LLM-generated relationships, regenerated by `generate_estimated_relationships`.
        return self._erd_folder / "estimated_relationships.json"

    @property
    def _confirmed_relationships_file(self) -> Path:
        # Human-confirmed relationships; take precedence when merged with the estimated ones.
        return self._erd_folder / "confirmed_relationships.json"

    @property
    def _discovered_catalog_path(self) -> Path:
        """
        Note: if this folder change, make sure to update the exported folder in the pipeline
        """
        return self._source_path / "erd" / "discovered_catalog.json"

0 commit comments

Comments
 (0)