Skip to content

Commit caec5f2

Browse files
authored
feat(registry): add remove stale partition job (#38165)
## What
Add a job that lets us remove partition keys that no longer exist.

## Why
We have more than 10,000 partitions, one for every metadata file ever created. Likely only about 500 of those reference files that still exist. Adding this job should let us clean out the noise.

## Future
If it works, I'll add it to a nightly job.
1 parent 116b3df commit caec5f2

File tree

4 files changed

+129
-1
lines changed

4 files changed

+129
-1
lines changed

airbyte-ci/connectors/metadata_service/orchestrator/orchestrator/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from orchestrator.jobs.metadata import generate_stale_gcs_latest_metadata_file
2323
from orchestrator.jobs.registry import (
2424
add_new_metadata_partitions,
25+
remove_stale_metadata_partitions,
2526
generate_cloud_registry,
2627
generate_oss_registry,
2728
generate_registry_entry,
@@ -184,6 +185,7 @@
184185
generate_registry_entry,
185186
generate_nightly_reports,
186187
add_new_metadata_partitions,
188+
remove_stale_metadata_partitions,
187189
generate_stale_gcs_latest_metadata_file,
188190
]
189191

airbyte-ci/connectors/metadata_service/orchestrator/orchestrator/jobs/registry.py

+26
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,32 @@
2424
)
2525

2626

27+
@op(required_resource_keys={"all_metadata_file_blobs"})
def remove_stale_metadata_partitions_op(context):
    """
    Delete dynamic partition keys (etags) that no longer match any existing metadata file blob.

    The op reads the blobs exposed by the `all_metadata_file_blobs` resource, collects their etags,
    and compares them with the keys currently registered on the Dagster instance for the metadata
    partitions definition. Every registered key without a matching blob etag is deleted.
    """
    all_metadata_file_blobs = context.resources.all_metadata_file_blobs
    partition_name = registry_entry.metadata_partitions_def.name

    # A set gives constant-time membership checks; with thousands of registered partitions a list
    # would make the stale filter below quadratic.
    all_fresh_etags = {blob.etag for blob in all_metadata_file_blobs}

    all_etag_partitions = context.instance.get_dynamic_partitions(partition_name)

    # Compute the full list of stale keys up front so we never delete while iterating the live listing.
    stale_etags = [etag for etag in all_etag_partitions if etag not in all_fresh_etags]

    for stale_etag in stale_etags:
        context.log.info(f"Removing stale etag: {stale_etag}")
        context.instance.delete_dynamic_partition(partition_name, stale_etag)
        context.log.info(f"Removed stale etag: {stale_etag}")
43+
44+
45+
@job(tags={"dagster/priority": HIGH_QUEUE_PRIORITY})
def remove_stale_metadata_partitions():
    """
    Job that runs `remove_stale_metadata_partitions_op` to prune metadata partitions whose
    metadata file (or file version) no longer exists.
    """
    remove_stale_metadata_partitions_op()
51+
52+
2753
@op(required_resource_keys={"slack", "all_metadata_file_blobs"})
2854
def add_new_metadata_partitions_op(context):
2955
"""

airbyte-ci/connectors/metadata_service/orchestrator/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "orchestrator"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
description = ""
55
authors = ["Ben Church <[email protected]>"]
66
readme = "README.md"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
from unittest import mock
4+
5+
from dagster import build_op_context
6+
from google.cloud.storage import Blob
7+
from orchestrator.assets import registry_entry
8+
from orchestrator.jobs.registry import add_new_metadata_partitions_op, remove_stale_metadata_partitions_op
9+
10+
11+
def test_basic_partition():
    """Check that dynamic partitions added through an op context's instance can be read back."""
    instance = build_op_context().instance
    partitions_def_name = "test_partition_key"

    # Nothing is registered under this name before we add anything.
    before = instance.get_dynamic_partitions(partitions_def_name)
    assert len(before) == 0

    instance.add_dynamic_partitions(partitions_def_name, ["partition_1", "partition_2"])

    after = instance.get_dynamic_partitions(partitions_def_name)
    assert len(after) == 2
20+
21+
22+
def test_metadata_partition_remove():
    """The remove op must drop the partition whose etag has no blob and keep the one that still has a blob."""

    def fake_blob(etag, name):
        # Autospec'd Blob so attribute access stays faithful to the real class.
        blob = mock.create_autospec(Blob, instance=True)
        blob.etag = etag
        blob.name = name
        return blob

    fresh_one = fake_blob("fresh_etag_1", "fresh_metadata")
    fresh_two = fake_blob("fresh_etag_2", "fresh_metadata")
    stale = fake_blob("stale_etag", "stale_metadata")

    # Only the fresh blobs are visible to the op; the stale one exists solely as a partition key.
    context = build_op_context(resources={"all_metadata_file_blobs": [fresh_one, fresh_two]})
    instance = context.instance
    partition_key = registry_entry.metadata_partitions_def.name

    assert len(instance.get_dynamic_partitions(partition_key)) == 0

    instance.add_dynamic_partitions(partition_key, [fresh_one.etag, stale.etag])
    assert len(instance.get_dynamic_partitions(partition_key)) == 2

    remove_stale_metadata_partitions_op(context)

    remaining = instance.get_dynamic_partitions(partition_key)
    assert len(remaining) == 1
    assert stale.etag not in remaining
55+
56+
57+
def test_metadata_partition_add():
    """
    The add op must register the etags of blobs that are not yet partitions while leaving
    already-registered partitions (including ones with no matching blob) untouched.
    """
    mock_fresh_blob_1 = mock.create_autospec(Blob, instance=True)
    mock_fresh_blob_1.etag = "fresh_etag_1"
    mock_fresh_blob_1.name = "fresh_metadata"

    mock_fresh_blob_2 = mock.create_autospec(Blob, instance=True)
    mock_fresh_blob_2.etag = "fresh_etag_2"
    mock_fresh_blob_2.name = "fresh_metadata"

    mock_existing_blob = mock.create_autospec(Blob, instance=True)
    mock_existing_blob.etag = "existing_etag"
    mock_existing_blob.name = "existing_metadata"

    mock_stale_blob = mock.create_autospec(Blob, instance=True)
    mock_stale_blob.etag = "stale_etag"
    mock_stale_blob.name = "stale_metadata"

    # Only the fresh blobs are returned by the resource; the other two exist solely as partition keys.
    mock_metadata_file_blobs = [mock_fresh_blob_1, mock_fresh_blob_2]

    mock_slack = mock.MagicMock()
    mock_slack.get_client = mock.MagicMock()
    chat_postMessage = mock.MagicMock()
    mock_slack.get_client.return_value = chat_postMessage

    resources = {"slack": mock_slack, "all_metadata_file_blobs": mock_metadata_file_blobs}

    context = build_op_context(resources=resources)

    partition_key = registry_entry.metadata_partitions_def.name

    existing_partitions = context.instance.get_dynamic_partitions(partition_key)
    assert len(existing_partitions) == 0

    context.instance.add_dynamic_partitions(partition_key, [mock_stale_blob.etag, mock_existing_blob.etag])
    existing_partitions = context.instance.get_dynamic_partitions(partition_key)
    assert len(existing_partitions) == 2

    add_new_metadata_partitions_op(context)

    existing_partitions = context.instance.get_dynamic_partitions(partition_key)
    expected_partitions = [mock_fresh_blob_1.etag, mock_fresh_blob_2.etag, mock_existing_blob.etag, mock_stale_blob.etag]

    # Exact, order-insensitive comparison: every expected partition is present and nothing else is.
    # A plain "all expected are present" check would still pass if the op registered unexpected keys.
    assert sorted(existing_partitions) == sorted(expected_partitions)

0 commit comments

Comments
 (0)