Skip to content

Commit 3e128f4

Browse files
authored
chore(structured-properties): add cli validation for entity types (datahub-project#11863)
1 parent 17c9fcf commit 3e128f4

File tree

5 files changed

Lines changed: 169 additions & 101 deletions

metadata-ingestion/examples/structured_properties/structured_properties.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
entity_types:
88
- dataset # or urn:li:entityType:datahub.dataset
99
- dataFlow
10-
description: "Retention Time is used to figure out how long to retain records in a dataset"
10+
description: 'Retention Time is used to figure out how long to retain records in a dataset'
1111
allowed_values:
1212
- value: 30
1313
description: 30 days, usually reserved for datasets that are ephemeral and contain pii
@@ -18,7 +18,7 @@
1818
- id: io.acryl.dataManagement.replicationSLA
1919
type: number
2020
display_name: Replication SLA
21-
description: "SLA for how long data can be delayed before replicating to the destination cluster"
21+
description: 'SLA for how long data can be delayed before replicating to the destination cluster'
2222
entity_types:
2323
- dataset
2424
- id: io.acryl.dataManagement.deprecationDate

metadata-ingestion/src/datahub/api/entities/structuredproperties/structuredproperties.py

Lines changed: 138 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import logging
2+
from contextlib import contextmanager
23
from enum import Enum
34
from pathlib import Path
4-
from typing import List, Optional
5+
from typing import Generator, List, Optional
56

67
import yaml
78
from pydantic import validator
@@ -20,6 +21,28 @@
2021
logger = logging.getLogger(__name__)
2122

2223

24+
class StructuredPropertiesConfig:
25+
"""Configuration class to hold the graph client"""
26+
27+
_graph: Optional[DataHubGraph] = None
28+
29+
@classmethod
30+
@contextmanager
31+
def use_graph(cls, graph: DataHubGraph) -> Generator[None, None, None]:
32+
"""Context manager to temporarily set a custom graph"""
33+
previous_graph = cls._graph
34+
cls._graph = graph
35+
try:
36+
yield
37+
finally:
38+
cls._graph = previous_graph
39+
40+
@classmethod
41+
def get_graph(cls) -> DataHubGraph:
42+
"""Get the current graph, falling back to default if none set"""
43+
return cls._graph if cls._graph is not None else get_default_graph()
44+
45+
2346
class AllowedTypes(Enum):
2447
STRING = "string"
2548
RICH_TEXT = "rich_text"
@@ -41,25 +64,28 @@ class AllowedValue(ConfigModel):
4164
description: Optional[str] = None
4265

4366

67+
VALID_ENTITY_TYPES_PREFIX_STRING = ", ".join(
68+
[
69+
f"urn:li:entityType:datahub.{x}"
70+
for x in ["dataset", "dashboard", "dataFlow", "schemaField"]
71+
]
72+
)
73+
VALID_ENTITY_TYPES_STRING = f"Valid entity type urns are {VALID_ENTITY_TYPES_PREFIX_STRING}, etc... Ensure that the entity type is valid."
74+
75+
4476
class TypeQualifierAllowedTypes(ConfigModel):
4577
allowed_types: List[str]
4678

47-
@validator("allowed_types")
79+
@validator("allowed_types", each_item=True)
4880
def validate_allowed_types(cls, v):
49-
validated_entity_type_urns = []
5081
if v:
51-
with get_default_graph() as graph:
52-
for et in v:
53-
validated_urn = Urn.make_entity_type_urn(et)
54-
if graph.exists(validated_urn):
55-
validated_entity_type_urns.append(validated_urn)
56-
else:
57-
logger.warn(
58-
f"Input {et} is not a valid entity type urn. Skipping."
59-
)
60-
v = validated_entity_type_urns
61-
if not v:
62-
logger.warn("No allowed_types given within type_qualifier.")
82+
graph = StructuredPropertiesConfig.get_graph()
83+
validated_urn = Urn.make_entity_type_urn(v)
84+
if not graph.exists(validated_urn):
85+
raise ValueError(
86+
f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
87+
)
88+
v = str(validated_urn)
6389
return v
6490

6591

@@ -77,6 +103,18 @@ class StructuredProperties(ConfigModel):
77103
type_qualifier: Optional[TypeQualifierAllowedTypes] = None
78104
immutable: Optional[bool] = False
79105

106+
@validator("entity_types", each_item=True)
107+
def validate_entity_types(cls, v):
108+
if v:
109+
graph = StructuredPropertiesConfig.get_graph()
110+
validated_urn = Urn.make_entity_type_urn(v)
111+
if not graph.exists(validated_urn):
112+
raise ValueError(
113+
f"Input {v} is not a valid entity type urn. {VALID_ENTITY_TYPES_STRING}"
114+
)
115+
v = str(validated_urn)
116+
return v
117+
80118
@property
81119
def fqn(self) -> str:
82120
assert self.urn is not None
@@ -97,93 +135,99 @@ def urn_must_be_present(cls, v, values):
97135
@staticmethod
98136
def create(file: str, graph: Optional[DataHubGraph] = None) -> None:
99137
emitter: DataHubGraph = graph if graph else get_default_graph()
100-
101-
with open(file) as fp:
102-
structuredproperties: List[dict] = yaml.safe_load(fp)
103-
for structuredproperty_raw in structuredproperties:
104-
structuredproperty = StructuredProperties.parse_obj(
105-
structuredproperty_raw
106-
)
107-
if not structuredproperty.type.islower():
108-
structuredproperty.type = structuredproperty.type.lower()
109-
logger.warn(
110-
f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
138+
with StructuredPropertiesConfig.use_graph(emitter):
139+
print("Using graph")
140+
with open(file) as fp:
141+
structuredproperties: List[dict] = yaml.safe_load(fp)
142+
for structuredproperty_raw in structuredproperties:
143+
structuredproperty = StructuredProperties.parse_obj(
144+
structuredproperty_raw
111145
)
112-
if not AllowedTypes.check_allowed_type(structuredproperty.type):
113-
raise ValueError(
114-
f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
115-
)
116-
mcp = MetadataChangeProposalWrapper(
117-
entityUrn=structuredproperty.urn,
118-
aspect=StructuredPropertyDefinitionClass(
119-
qualifiedName=structuredproperty.fqn,
120-
valueType=Urn.make_data_type_urn(structuredproperty.type),
121-
displayName=structuredproperty.display_name,
122-
description=structuredproperty.description,
123-
entityTypes=[
124-
Urn.make_entity_type_urn(entity_type)
125-
for entity_type in structuredproperty.entity_types or []
126-
],
127-
cardinality=structuredproperty.cardinality,
128-
immutable=structuredproperty.immutable,
129-
allowedValues=(
130-
[
131-
PropertyValueClass(
132-
value=v.value, description=v.description
133-
)
134-
for v in structuredproperty.allowed_values
135-
]
136-
if structuredproperty.allowed_values
137-
else None
138-
),
139-
typeQualifier=(
140-
{
141-
"allowedTypes": structuredproperty.type_qualifier.allowed_types
142-
}
143-
if structuredproperty.type_qualifier
144-
else None
146+
if not structuredproperty.type.islower():
147+
structuredproperty.type = structuredproperty.type.lower()
148+
logger.warn(
149+
f"Structured property type should be lowercase. Updated to {structuredproperty.type}"
150+
)
151+
if not AllowedTypes.check_allowed_type(structuredproperty.type):
152+
raise ValueError(
153+
f"Type {structuredproperty.type} is not allowed. Allowed types are {AllowedTypes.values()}"
154+
)
155+
mcp = MetadataChangeProposalWrapper(
156+
entityUrn=structuredproperty.urn,
157+
aspect=StructuredPropertyDefinitionClass(
158+
qualifiedName=structuredproperty.fqn,
159+
valueType=Urn.make_data_type_urn(structuredproperty.type),
160+
displayName=structuredproperty.display_name,
161+
description=structuredproperty.description,
162+
entityTypes=[
163+
Urn.make_entity_type_urn(entity_type)
164+
for entity_type in structuredproperty.entity_types or []
165+
],
166+
cardinality=structuredproperty.cardinality,
167+
immutable=structuredproperty.immutable,
168+
allowedValues=(
169+
[
170+
PropertyValueClass(
171+
value=v.value, description=v.description
172+
)
173+
for v in structuredproperty.allowed_values
174+
]
175+
if structuredproperty.allowed_values
176+
else None
177+
),
178+
typeQualifier=(
179+
{
180+
"allowedTypes": structuredproperty.type_qualifier.allowed_types
181+
}
182+
if structuredproperty.type_qualifier
183+
else None
184+
),
145185
),
146-
),
147-
)
148-
emitter.emit_mcp(mcp)
186+
)
187+
emitter.emit_mcp(mcp)
149188

150-
logger.info(f"Created structured property {structuredproperty.urn}")
189+
logger.info(f"Created structured property {structuredproperty.urn}")
151190

152191
@classmethod
153192
def from_datahub(cls, graph: DataHubGraph, urn: str) -> "StructuredProperties":
154193

155-
structured_property: Optional[
156-
StructuredPropertyDefinitionClass
157-
] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
158-
if structured_property is None:
159-
raise Exception(
160-
"StructuredPropertyDefinition aspect is None. Unable to create structured property."
194+
with StructuredPropertiesConfig.use_graph(graph):
195+
structured_property: Optional[
196+
StructuredPropertyDefinitionClass
197+
] = graph.get_aspect(urn, StructuredPropertyDefinitionClass)
198+
if structured_property is None:
199+
raise Exception(
200+
"StructuredPropertyDefinition aspect is None. Unable to create structured property."
201+
)
202+
return StructuredProperties(
203+
urn=urn,
204+
qualified_name=structured_property.qualifiedName,
205+
display_name=structured_property.displayName,
206+
type=structured_property.valueType,
207+
description=structured_property.description,
208+
entity_types=structured_property.entityTypes,
209+
cardinality=structured_property.cardinality,
210+
allowed_values=(
211+
[
212+
AllowedValue(
213+
value=av.value,
214+
description=av.description,
215+
)
216+
for av in structured_property.allowedValues or []
217+
]
218+
if structured_property.allowedValues is not None
219+
else None
220+
),
221+
type_qualifier=(
222+
{
223+
"allowed_types": structured_property.typeQualifier.get(
224+
"allowedTypes"
225+
)
226+
}
227+
if structured_property.typeQualifier
228+
else None
229+
),
161230
)
162-
return StructuredProperties(
163-
urn=urn,
164-
qualified_name=structured_property.qualifiedName,
165-
display_name=structured_property.displayName,
166-
type=structured_property.valueType,
167-
description=structured_property.description,
168-
entity_types=structured_property.entityTypes,
169-
cardinality=structured_property.cardinality,
170-
allowed_values=(
171-
[
172-
AllowedValue(
173-
value=av.value,
174-
description=av.description,
175-
)
176-
for av in structured_property.allowedValues or []
177-
]
178-
if structured_property.allowedValues is not None
179-
else None
180-
),
181-
type_qualifier=(
182-
{"allowed_types": structured_property.typeQualifier.get("allowedTypes")}
183-
if structured_property.typeQualifier
184-
else None
185-
),
186-
)
187231

188232
def to_yaml(
189233
self,

smoke-test/tests/structured_properties/bad_entity_type.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
- id: clusterTypeBad
2+
type: STRING
3+
display_name: Cluster's type
4+
description: 'Test Cluster Type Property'
5+
entity_types:
6+
- urn:li:entityType:dataset # should fail because this is not a valid entity type

smoke-test/tests/structured_properties/test_structured_properties.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,24 @@ def test_structured_property_schema_field(ingest_cleanup_data, graph_client):
371371
raise e
372372

373373

374+
def test_structured_properties_yaml_load_with_bad_entity_type(
375+
ingest_cleanup_data, graph_client
376+
):
377+
try:
378+
StructuredProperties.create(
379+
"tests/structured_properties/bad_entity_type.yaml",
380+
graph=graph_client,
381+
)
382+
raise AssertionError(
383+
"Should not be able to create structured properties with bad entity type"
384+
)
385+
except Exception as e:
386+
if "urn:li:entityType:dataset is not a valid entity type urn" in str(e):
387+
pass
388+
else:
389+
raise e
390+
391+
374392
def test_dataset_yaml_loader(ingest_cleanup_data, graph_client):
375393
StructuredProperties.create(
376394
"tests/structured_properties/test_structured_properties.yaml",

smoke-test/tests/structured_properties/test_structured_properties.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
11
- id: clusterType
22
type: STRING
33
display_name: Cluster's type
4-
description: "Test Cluster Type Property"
4+
description: 'Test Cluster Type Property'
55
entity_types:
66
- dataset
77
- id: clusterName
88
type: STRING
99
display_name: Cluster's name
10-
description: "Test Cluster Name Property"
10+
description: 'Test Cluster Name Property'
1111
entity_types:
1212
- dataset
1313
- id: projectNames
1414
type: STRING
1515
cardinality: MULTIPLE
1616
display_name: Project Name
1717
entity_types:
18-
- dataset # or urn:li:logicalEntity:metamodel.datahub.dataset
19-
- dataflow
20-
description: "Test property for project name"
18+
- dataset # or urn:li:entityType:datahub.dataset
19+
- dataFlow
20+
description: 'Test property for project name'
2121
allowed_values:
2222
- value: Tracking
2323
description: test value 1 for project

0 commit comments

Comments
 (0)