Skip to content

Commit 32c45a9

Browse files
authored
Create record obfuscator and use it in live tests (#43318)
1 parent eb74cba commit 32c45a9

File tree

6 files changed

+122
-4
lines changed

6 files changed

+122
-4
lines changed

airbyte-ci/connectors/live-tests/src/live_tests/commons/connector_runner.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import datetime
77
import json
88
import logging
9+
import os
10+
import subprocess
911
import uuid
1012
from pathlib import Path
1113
from typing import Optional
@@ -23,11 +25,13 @@ class ConnectorRunner:
2325
IN_CONTAINER_CONFIGURED_CATALOG_PATH = "/data/catalog.json"
2426
IN_CONTAINER_STATE_PATH = "/data/state.json"
2527
IN_CONTAINER_OUTPUT_PATH = "/output.txt"
28+
IN_CONTAINER_OBFUSCATOR_PATH = "/user/local/bin/record_obfuscator.py"
2629

2730
def __init__(
2831
self,
2932
dagger_client: dagger.Client,
3033
execution_inputs: ExecutionInputs,
34+
is_airbyte_ci: bool,
3135
http_proxy: Optional[Proxy] = None,
3236
):
3337
self.connector_under_test = execution_inputs.connector_under_test
@@ -45,6 +49,11 @@ def __init__(
4549
self.http_proxy = http_proxy
4650
self.logger = logging.getLogger(f"{self.connector_under_test.name}-{self.connector_under_test.version}")
4751
self.dagger_client = dagger_client.pipeline(f"{self.connector_under_test.name}-{self.connector_under_test.version}")
52+
if is_airbyte_ci:
53+
self.host_obfuscator_path = "/tmp/record_obfuscator.py"
54+
else:
55+
repo_root = Path(subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).strip().decode())
56+
self.host_obfuscator_path = f"{repo_root}/tools/bin/record_obfuscator.py"
4857

4958
@property
5059
def _connector_under_test_container(self) -> dagger.Container:
@@ -109,6 +118,12 @@ async def _run(
109118
container = self._connector_under_test_container
110119
# Do not cache downstream dagger layers
111120
container = container.with_env_variable("CACHEBUSTER", str(uuid.uuid4()))
121+
expanded_host_executable_path = os.path.expanduser(self.host_obfuscator_path)
122+
123+
container = container.with_file(
124+
self.IN_CONTAINER_OBFUSCATOR_PATH,
125+
self.dagger_client.host().file(expanded_host_executable_path),
126+
)
112127
for env_var_name, env_var_value in self.environment_variables.items():
113128
container = container.with_env_variable(env_var_name, env_var_value)
114129
if self.config:
@@ -134,7 +149,8 @@ async def _run(
134149
[
135150
"sh",
136151
"-c",
137-
" ".join(airbyte_command) + f" > {self.IN_CONTAINER_OUTPUT_PATH} 2>&1 | tee -a {self.IN_CONTAINER_OUTPUT_PATH}",
152+
" ".join(airbyte_command)
153+
+ f"| {self.IN_CONTAINER_OBFUSCATOR_PATH} > {self.IN_CONTAINER_OUTPUT_PATH} 2>&1 | tee -a {self.IN_CONTAINER_OUTPUT_PATH}",
138154
],
139155
skip_entrypoint=True,
140156
)

airbyte-ci/connectors/live-tests/src/live_tests/commons/models.py

+27-1
Original file line numberDiff line numberDiff line change
@@ -321,10 +321,36 @@ def generate_stream_schemas(self) -> dict[str, Any]:
321321
stream_schema_builder = SchemaBuilder()
322322
stream_schema_builder.add_schema({"type": "object", "properties": {}})
323323
stream_builders[stream] = stream_schema_builder
324-
stream_builders[stream].add_object(record.record.data)
324+
stream_builders[stream].add_object(self.get_obfuscated_types(record.record.data))
325325
self.logger.info("Stream schemas generated")
326326
return {stream: sort_dict_keys(stream_builders[stream].to_schema()) for stream in stream_builders}
327327

328+
@staticmethod
def get_obfuscated_types(data: dict[str, Any]) -> dict[str, Any]:
    """
    Convert an obfuscated record into a record whose values have the same type as the original values.

    Obfuscated values are strings tagged with a type prefix (``string_``, ``integer_``, ``number_``,
    ``boolean_``, ``null_``, ``array_``, ``object_``). Each tagged value is replaced by a placeholder
    of the corresponding Python type so that schema inference / validation sees the original type.

    Values that are not strings, or strings that carry none of the known prefixes, are returned
    unchanged. This keeps the helper safe to call on records that were not obfuscated.

    :param data: mapping of field name to (possibly obfuscated) value.
    :return: a new mapping with the same keys and type-representative placeholder values.
    """
    # Factories rather than literal values so every field gets its own fresh list / dict.
    placeholder_factories = (
        ("string_", lambda: "a"),
        ("integer_", lambda: 0),
        ("number_", lambda: 0.1),
        ("boolean_", lambda: True),
        ("null_", lambda: None),
        ("array_", lambda: []),
        ("object_", lambda: {}),
    )

    types: dict[str, Any] = {}
    for k, v in data.items():
        if not isinstance(v, str):
            # Only strings can carry a type prefix; calling ``startswith`` on anything else would raise.
            types[k] = v
            continue
        for prefix, make_placeholder in placeholder_factories:
            if v.startswith(prefix):
                types[k] = make_placeholder()
                break
        else:
            # Unrecognised string: keep it as is.
            types[k] = v

    return types
353+
328354
def get_records_per_stream(self, stream: str) -> Iterator[AirbyteMessage]:
329355
assert self.backend is not None, "Backend must be set to get records per stream"
330356
self.logger.info(f"Reading records for stream {stream}")

airbyte-ci/connectors/live-tests/src/live_tests/conftest.py

+20
Original file line numberDiff line numberDiff line change
@@ -467,12 +467,14 @@ def spec_control_execution_inputs(
467467

468468
@pytest.fixture(scope="session")
def spec_control_connector_runner(
    request: SubRequest,
    dagger_client: dagger.Client,
    spec_control_execution_inputs: ExecutionInputs,
) -> ConnectorRunner:
    """Session-scoped runner built from the spec control execution inputs.

    The third positional argument is the flag stored in the pytest config stash
    under ``stash_keys.RUN_IN_AIRBYTE_CI``.
    """
    is_airbyte_ci = request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI]
    return ConnectorRunner(dagger_client, spec_control_execution_inputs, is_airbyte_ci)
478480

@@ -507,12 +509,14 @@ def spec_target_execution_inputs(
507509

508510
@pytest.fixture(scope="session")
def spec_target_connector_runner(
    request: SubRequest,
    dagger_client: dagger.Client,
    spec_target_execution_inputs: ExecutionInputs,
) -> ConnectorRunner:
    """Session-scoped runner built from the spec target execution inputs.

    The third positional argument is the flag stored in the pytest config stash
    under ``stash_keys.RUN_IN_AIRBYTE_CI``.
    """
    is_airbyte_ci = request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI]
    return ConnectorRunner(dagger_client, spec_target_execution_inputs, is_airbyte_ci)
518522

@@ -551,6 +555,7 @@ def check_control_execution_inputs(
551555

552556
@pytest.fixture(scope="session")
553557
async def check_control_connector_runner(
558+
request: SubRequest,
554559
dagger_client: dagger.Client,
555560
check_control_execution_inputs: ExecutionInputs,
556561
connection_id: str,
@@ -560,6 +565,7 @@ async def check_control_connector_runner(
560565
runner = ConnectorRunner(
561566
dagger_client,
562567
check_control_execution_inputs,
568+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
563569
http_proxy=proxy,
564570
)
565571
yield runner
@@ -600,6 +606,7 @@ def check_target_execution_inputs(
600606

601607
@pytest.fixture(scope="session")
602608
async def check_target_connector_runner(
609+
request: SubRequest,
603610
check_control_execution_result: ExecutionResult,
604611
dagger_client: dagger.Client,
605612
check_target_execution_inputs: ExecutionInputs,
@@ -614,6 +621,7 @@ async def check_target_connector_runner(
614621
runner = ConnectorRunner(
615622
dagger_client,
616623
check_target_execution_inputs,
624+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
617625
http_proxy=proxy,
618626
)
619627
yield runner
@@ -685,6 +693,7 @@ def discover_target_execution_inputs(
685693

686694
@pytest.fixture(scope="session")
687695
async def discover_control_connector_runner(
696+
request: SubRequest,
688697
dagger_client: dagger.Client,
689698
discover_control_execution_inputs: ExecutionInputs,
690699
connection_id: str,
@@ -694,13 +703,15 @@ async def discover_control_connector_runner(
694703
yield ConnectorRunner(
695704
dagger_client,
696705
discover_control_execution_inputs,
706+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
697707
http_proxy=proxy,
698708
)
699709
await proxy.clear_cache_volume()
700710

701711

702712
@pytest.fixture(scope="session")
703713
async def discover_target_connector_runner(
714+
request: SubRequest,
704715
dagger_client: dagger.Client,
705716
discover_control_execution_result: ExecutionResult,
706717
discover_target_execution_inputs: ExecutionInputs,
@@ -716,6 +727,7 @@ async def discover_target_connector_runner(
716727
yield ConnectorRunner(
717728
dagger_client,
718729
discover_target_execution_inputs,
730+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
719731
http_proxy=proxy,
720732
)
721733
await proxy.clear_cache_volume()
@@ -776,6 +788,7 @@ def read_target_execution_inputs(
776788

777789
@pytest.fixture(scope="session")
778790
async def read_control_connector_runner(
791+
request: SubRequest,
779792
dagger_client: dagger.Client,
780793
read_control_execution_inputs: ExecutionInputs,
781794
connection_id: str,
@@ -785,6 +798,7 @@ async def read_control_connector_runner(
785798
yield ConnectorRunner(
786799
dagger_client,
787800
read_control_execution_inputs,
801+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
788802
http_proxy=proxy,
789803
)
790804
await proxy.clear_cache_volume()
@@ -806,6 +820,7 @@ async def read_control_execution_result(
806820

807821
@pytest.fixture(scope="session")
808822
async def read_target_connector_runner(
823+
request: SubRequest,
809824
dagger_client: dagger.Client,
810825
read_target_execution_inputs: ExecutionInputs,
811826
read_control_execution_result: ExecutionResult,
@@ -821,6 +836,7 @@ async def read_target_connector_runner(
821836
yield ConnectorRunner(
822837
dagger_client,
823838
read_target_execution_inputs,
839+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
824840
http_proxy=proxy,
825841
)
826842
await proxy.clear_cache_volume()
@@ -890,6 +906,7 @@ def read_with_state_target_execution_inputs(
890906

891907
@pytest.fixture(scope="session")
892908
async def read_with_state_control_connector_runner(
909+
request: SubRequest,
893910
dagger_client: dagger.Client,
894911
read_with_state_control_execution_inputs: ExecutionInputs,
895912
connection_id: str,
@@ -899,6 +916,7 @@ async def read_with_state_control_connector_runner(
899916
yield ConnectorRunner(
900917
dagger_client,
901918
read_with_state_control_execution_inputs,
919+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
902920
http_proxy=proxy,
903921
)
904922
await proxy.clear_cache_volume()
@@ -922,6 +940,7 @@ async def read_with_state_control_execution_result(
922940

923941
@pytest.fixture(scope="session")
924942
async def read_with_state_target_connector_runner(
943+
request: SubRequest,
925944
dagger_client: dagger.Client,
926945
read_with_state_target_execution_inputs: ExecutionInputs,
927946
read_with_state_control_execution_result: ExecutionResult,
@@ -936,6 +955,7 @@ async def read_with_state_target_connector_runner(
936955
yield ConnectorRunner(
937956
dagger_client,
938957
read_with_state_target_execution_inputs,
958+
request.config.stash[stash_keys.RUN_IN_AIRBYTE_CI],
939959
http_proxy=proxy,
940960
)
941961
await proxy.clear_cache_volume()

airbyte-ci/connectors/live-tests/src/live_tests/validation_tests/test_read.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ async def test_read(
5858

5959
for record in records:
6060
has_records = True
61-
if not conforms_to_schema(record.record.data, stream.schema()):
61+
if not conforms_to_schema(read_target_execution_result.get_obfuscated_types(record.record.data), stream.schema()):
6262
errors.append(f"A record was encountered that does not conform to the schema. stream={stream.stream.name} record={record}")
6363
if primary_key:
6464
if _extract_primary_key_value(record.dict(), primary_key) is None:

airbyte-ci/connectors/pipelines/pipelines/airbyte_ci/connectors/test/steps/common.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -649,7 +649,10 @@ async def _run(self, connector_under_test_container: Container) -> StepResult:
649649

650650
exit_code, stdout, stderr = await get_exec_result(container)
651651

652-
if "report.html" not in await container.directory(f"{tests_artifacts_dir}/session_{self.run_id}").entries():
652+
if (
653+
f"session_{self.run_id}" not in await container.directory(f"{tests_artifacts_dir}").entries()
654+
or "report.html" not in await container.directory(f"{tests_artifacts_dir}/session_{self.run_id}").entries()
655+
):
653656
main_logger.exception(
654657
"The report file was not generated, an unhandled error likely happened during regression test execution, please check the step stderr and stdout for more details"
655658
)
@@ -686,6 +689,10 @@ async def _build_test_container(self, target_container_id: str) -> Container:
686689
# Enable dagger-in-dagger
687690
.with_unix_socket("/var/run/docker.sock", self.dagger_client.host().unix_socket("/var/run/docker.sock"))
688691
.with_env_variable("RUN_IN_AIRBYTE_CI", "1")
692+
.with_file(
693+
"/tmp/record_obfuscator.py",
694+
self.context.get_repo_dir("tools/bin", include=["record_obfuscator.py"]).file("record_obfuscator.py"),
695+
)
689696
# The connector being tested is already built and is stored in a location accessible to an inner dagger kicked off by
690697
# regression tests. The connector can be found if you know the container ID, so we write the container ID to a file and put
691698
# it in the regression test container. This way regression tests will use the already-built connector instead of trying to

tools/bin/record_obfuscator.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#!/usr/bin/env python3
2+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3+
4+
5+
import hashlib
6+
import json
7+
import sys
8+
from typing import Any
9+
10+
11+
def _generate_hash(value: Any) -> str:
    """Return the hexadecimal SHA-256 digest of ``str(value)``."""
    return hashlib.sha256(str(value).encode()).hexdigest()


def obfuscate(value: Any) -> Any:
    """
    Replace a JSON value with a string that hides its content but keeps its type and size.

    The result has the form ``<type>_[len-<n>_]<sha256 hex digest>`` where ``<type>`` is one of
    ``string``, ``integer``, ``number``, ``boolean``, ``null``, ``array`` or ``object``. Equal inputs
    always produce equal outputs, so obfuscated records can still be compared with each other.

    :param value: any value produced by ``json.loads``.
    :return: the obfuscated string.
    :raises ValueError: if ``value`` is not one of the JSON-compatible Python types.
    """
    # ``bool`` is a subclass of ``int``, so it must be checked first; otherwise booleans
    # would be tagged as integers and the boolean branch could never be reached.
    if isinstance(value, bool):
        return "boolean_" + _generate_hash(value)
    if isinstance(value, str):
        return f"string_len-{len(value)}_" + _generate_hash(value)
    # A ``_`` separates the length from the digest for every sized type; without it the length
    # digits run straight into the hex digest and cannot be told apart from it.
    if isinstance(value, int):
        return f"integer_len-{len(str(value))}_" + _generate_hash(value)
    if isinstance(value, float):
        return f"number_len-{len(str(value))}_" + _generate_hash(value)
    if value is None:
        return "null_" + _generate_hash(value)
    if isinstance(value, list):
        # ``sort_keys=True`` makes the digest independent of key order in nested objects.
        return f"array_len-{len(value)}_" + _generate_hash(json.dumps(value, sort_keys=True).encode())
    if isinstance(value, dict):
        return f"object_len-{len(value)}_" + _generate_hash(json.dumps(value, sort_keys=True).encode())
    raise ValueError(f"Unsupported data type: {type(value)}")
34+
35+
36+
if __name__ == "__main__":
    # Filter mode: read one JSON message per line from stdin, obfuscate the data of RECORD
    # messages, and write every message back to stdout as one JSON document per line.
    for line in sys.stdin:
        line = line.strip()
        try:
            data = json.loads(line)
        except (ValueError, RecursionError):
            # We don't expect invalid json so if we see it, it will go to stderr
            sys.stderr.write(f"{line}\n")
            continue

        if not isinstance(data, dict):
            # Valid JSON that is not an object (a bare number, string, list, ...) has no "type"
            # field; route it to stderr like any other unexpected line instead of crashing.
            sys.stderr.write(f"{line}\n")
            continue

        if data.get("type") == "RECORD":
            record_data = data["record"].get("data", {})
            data["record"]["data"] = {k: obfuscate(v) for k, v in record_data.items()}
        sys.stdout.write(f"{json.dumps(data)}\n")

0 commit comments

Comments
 (0)