Skip to content

Commit 78dd856

Browse files
feat: Search Service Highlighting (#1856)
* feat: Search Service Highlighting Signed-off-by: Allison Suarez Miranda <[email protected]> * fixed unit test Signed-off-by: Allison Suarez Miranda <[email protected]> * put attrdict access in try in case of keyerror Signed-off-by: Allison Suarez Miranda <[email protected]> * stupid Signed-off-by: Allison Suarez Miranda <[email protected]> * ??? Signed-off-by: Allison Suarez Miranda <[email protected]> * mock mathc Signed-off-by: Allison Suarez Miranda <[email protected]> * debug Signed-off-by: Allison Suarez Miranda <[email protected]> * all unit tests pass Signed-off-by: Allison Suarez Miranda <[email protected]> * lint Signed-off-by: Allison Suarez Miranda <[email protected]> * cinf fix Signed-off-by: Allison Suarez Miranda <[email protected]> * check that request has highlights Signed-off-by: Allison Suarez Miranda <[email protected]> * implemented feedback Signed-off-by: Allison Suarez Miranda <[email protected]> * empty checks Signed-off-by: Allison Suarez Miranda <[email protected]> * updated format response tests anhd put them in utils test file Signed-off-by: Allison Suarez Miranda <[email protected]> * fixed all tests Signed-off-by: Allison Suarez Miranda <[email protected]> * lint Signed-off-by: Allison Suarez Miranda <[email protected]> * mypy Signed-off-by: Allison Suarez Miranda <[email protected]> * isort Signed-off-by: Allison Suarez Miranda <[email protected]> * added another unit test Signed-off-by: Allison Suarez Miranda <[email protected]> * isort Signed-off-by: Allison Suarez Miranda <[email protected]> * major oopsie Signed-off-by: Allison Suarez Miranda <[email protected]> * some changes Signed-off-by: Allison Suarez Miranda <[email protected]> * updated test Signed-off-by: Allison Suarez Miranda <[email protected]> * latets pending marshmallow3-annotations bump Signed-off-by: Allison Suarez Miranda <[email protected]> * remove debugging line Signed-off-by: Allison Suarez Miranda <[email protected]> * isort Signed-off-by: Allison Suarez Miranda <[email protected]> * bumped marshmallow3-annotation to 1.1.0 Signed-off-by: Allison Suarez Miranda <[email protected]> * typing extensions issue attempt to fix Signed-off-by: Allison Suarez Miranda <[email protected]> * had to convert sttrdict and lsit in the end Signed-off-by: Allison Suarez Miranda <[email protected]> * sort imports Signed-off-by: Allison Suarez Miranda <[email protected]>
1 parent 08bea6c commit 78dd856

File tree

16 files changed

+831
-1033
lines changed

16 files changed

+831
-1033
lines changed

common/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
'Flask>=1.0.2',
4242
'attrs>=19.0.0',
4343
'marshmallow>=3.0',
44-
'marshmallow3-annotations>=1.0.0'
44+
'marshmallow3-annotations>=1.1.0'
4545
],
4646
extras_require={
4747
'all': requirements_dev

databuilder/setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
'Flask==1.0.2',
6161
'gremlinpython==3.4.3',
6262
'requests-aws4auth==1.1.0',
63-
'typing-extensions==3.7.4',
63+
'typing-extensions==4.0.0',
6464
'overrides==2.5',
6565
'boto3==1.17.23'
6666
]

requirements-common.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ itsdangerous<=2.0.1
1717
Jinja2>=2.10.1,<3.1
1818
jsonschema>=3.0.1,<4.0
1919
marshmallow>=3.0,<=3.6
20-
marshmallow3-annotations>=1.0.0
20+
marshmallow3-annotations>=1.1.0
2121
pytz==2021.1
2222
requests>=2.25.0
2323
requests-aws4auth==1.1.0

search/search_service/api/search.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
# Copyright Contributors to the Amundsen project.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import json
54
from http import HTTPStatus
65
from typing import ( # noqa: F401
7-
Any, Iterable, List,
6+
Any, Dict, Iterable, List,
87
)
98

10-
from amundsen_common.models.search import SearchRequestSchema, SearchResponseSchema
9+
from amundsen_common.models.search import (
10+
HighlightOptions, SearchRequestSchema, SearchResponseSchema,
11+
)
1112
from flasgger import swag_from
1213
from flask_restful import Resource, request
1314

@@ -29,14 +30,17 @@ def post(self) -> Iterable[Any]:
2930
Fetch search results
3031
:return: json payload of schema
3132
"""
32-
33-
request_data = SearchRequestSchema().loads(json.dumps(request.get_json()))
33+
request_data = SearchRequestSchema().load(request.json, partial=False)
3434

3535
resources: List[AmundsenResource] = []
36-
for r in request.get_json().get('resource_types'):
36+
highlight_options: Dict[AmundsenResource, HighlightOptions] = {}
37+
38+
for r in request_data.resource_types:
3739
resource = RESOURCE_STR_MAPPING.get(r)
3840
if resource:
3941
resources.append(resource)
42+
if request_data.highlight_options.get(r):
43+
highlight_options[resource] = request_data.highlight_options.get(r)
4044
else:
4145
err_msg = f'Search for invalid resource "{r}" requested'
4246
return {'message': err_msg}, HTTPStatus.BAD_REQUEST
@@ -46,7 +50,8 @@ def post(self) -> Iterable[Any]:
4650
page_index=request_data.page_index,
4751
results_per_page=request_data.results_per_page,
4852
resource_types=resources,
49-
filters=request_data.filters)
53+
filters=request_data.filters,
54+
highlight_options=highlight_options)
5055
return SearchResponseSchema().dump(search_results), HTTPStatus.OK
5156

5257
except RuntimeError as e:

search/search_service/config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,4 @@ class AwsSearchConfig(LocalConfig):
125125
connection_class=RequestsHttpConnection
126126
)
127127

128-
PROXY_CLIENT_KEY = client
128+
ELASTICSEARCH_CLIENT = client

search/search_service/proxy/base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
)
88

99
from amundsen_common.models.api.health_check import HealthCheck
10-
from amundsen_common.models.search import Filter, SearchResponse
10+
from amundsen_common.models.search import (
11+
Filter, HighlightOptions, SearchResponse,
12+
)
1113

1214
from search_service.models.dashboard import SearchDashboardResult
1315
from search_service.models.feature import SearchFeatureResult
@@ -36,7 +38,8 @@ def search(self, *,
3638
page_index: int,
3739
results_per_page: int,
3840
resource_types: List[Resource],
39-
filters: List[Filter]) -> SearchResponse:
41+
filters: List[Filter],
42+
highlight_options: Dict[Resource, HighlightOptions]) -> SearchResponse:
4043
pass
4144

4245
@abstractmethod

search/search_service/proxy/elasticsearch.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
from amundsen_common.models.index_map import (
1212
FEATURE_INDEX_MAP, TABLE_INDEX_MAP, USER_INDEX_MAP,
1313
)
14-
from amundsen_common.models.search import Filter, SearchResponse
14+
from amundsen_common.models.search import (
15+
Filter, HighlightOptions, SearchResponse,
16+
)
1517
from elasticsearch import Elasticsearch
1618
from elasticsearch.exceptions import ConnectionError as ElasticConnectionError, NotFoundError
1719
from elasticsearch_dsl import Search, query
@@ -812,7 +814,8 @@ def search(self, *,
812814
page_index: int,
813815
results_per_page: int,
814816
resource_types: List[Resource],
815-
filters: List[Filter]) -> SearchResponse:
817+
filters: List[Filter],
818+
highlight_options: Dict[Resource, HighlightOptions]) -> SearchResponse:
816819
LOGGING.warn(DEPRECATION_MSG)
817820
return SearchResponse(msg=DEPRECATION_MSG,
818821
page_index=0,
Lines changed: 97 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,16 @@
11
# Copyright Contributors to the Amundsen project.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
import logging
45
from enum import Enum
6+
from typing import (
7+
Any, Dict, List,
8+
)
9+
10+
from amundsen_common.models.search import SearchResponse
11+
from elasticsearch_dsl.response import Response
12+
from elasticsearch_dsl.response.hit import Hit
13+
from elasticsearch_dsl.utils import AttrDict, AttrList
514

615

716
class Resource(Enum):
@@ -12,13 +21,97 @@ class Resource(Enum):
1221

1322

1423
RESOURCE_STR_MAPPING = {
15-
'table': Resource.TABLE,
16-
'dashboard': Resource.DASHBOARD,
17-
'feature': Resource.FEATURE,
18-
'user': Resource.USER,
24+
"table": Resource.TABLE,
25+
"dashboard": Resource.DASHBOARD,
26+
"feature": Resource.FEATURE,
27+
"user": Resource.USER,
1928
}
2029

2130

2231
def get_index_for_resource(resource_type: Resource) -> str:
2332
resource_str = resource_type.name.lower()
2433
return f"{resource_str}_search_index"
34+
35+
36+
class SearchHit():
37+
# custom wrapper for elasticsearch_dsl Hit
38+
def __init__(self, hit: Hit, fields_mapping: Dict):
39+
self.hit = hit
40+
self.fields_mapping = fields_mapping
41+
42+
def _convert_attr_value_to_native(self, attr_value: Any) -> Any:
43+
if type(attr_value) is AttrDict:
44+
return attr_value.to_dict()
45+
elif type(attr_value) is AttrList:
46+
return list(attr_value)
47+
return attr_value
48+
49+
def to_search_result(self) -> Dict:
50+
result = {}
51+
for field, mapped_field in self.fields_mapping.items():
52+
field_value = None
53+
# get field name instead of subfield
54+
mapped_field = mapped_field.split('.')[0]
55+
56+
if field != mapped_field and hasattr(self.hit, mapped_field):
57+
# if the field name doesn't already match get mapped one
58+
field_value = getattr(self.hit, mapped_field)
59+
elif hasattr(self.hit, field):
60+
field_value = getattr(self.hit, field)
61+
62+
result[field] = self._convert_attr_value_to_native(field_value)
63+
result["search_score"] = self.hit.meta.score
64+
return result
65+
66+
def get_highlights(self) -> Dict:
67+
highlights = {}
68+
try:
69+
for highlighted_field, highlighted_value in self.hit.meta.highlight.to_dict().items():
70+
parent_field_name = highlighted_field.split('.')[0]
71+
highlights[parent_field_name] = highlighted_value
72+
return highlights
73+
74+
except AttributeError:
75+
# response doesn't have highlights
76+
return highlights
77+
78+
79+
def format_resource_response(response: Response, fields_mapping: Dict) -> Dict:
80+
results = []
81+
for hit in response.hits:
82+
search_hit = SearchHit(hit=hit, fields_mapping=fields_mapping)
83+
results.append({
84+
**search_hit.to_search_result(),
85+
'highlight': search_hit.get_highlights()
86+
})
87+
return {
88+
"results": results,
89+
"total_results": response.hits.total.value,
90+
}
91+
92+
93+
def create_search_response(page_index: int, # noqa: C901
94+
results_per_page: int,
95+
responses: List[Response],
96+
resource_types: List[Resource],
97+
resource_to_field_mapping: Dict) -> SearchResponse:
98+
results_per_resource = {}
99+
# responses are returned in the order in which the searches appear in msearch request
100+
for resource, response in zip(resource_types, responses):
101+
msg = ''
102+
status_code = 200
103+
if response.success():
104+
msg = 'Success'
105+
results_per_resource[resource.name.lower()] = \
106+
format_resource_response(response=response,
107+
fields_mapping=resource_to_field_mapping[resource])
108+
else:
109+
msg = f'Query response for {resource} returned an error: {response.to_dict()}'
110+
status_code = 500
111+
logging.error(msg)
112+
113+
return SearchResponse(msg=msg,
114+
page_index=page_index,
115+
results_per_page=results_per_page,
116+
results=results_per_resource,
117+
status_code=status_code)

search/search_service/proxy/es_proxy_v2.py

Lines changed: 13 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,20 @@
88
)
99

1010
from amundsen_common.models.api import health_check
11-
from amundsen_common.models.search import Filter, SearchResponse
11+
from amundsen_common.models.search import (
12+
Filter, HighlightOptions, SearchResponse,
13+
)
1214
from elasticsearch import Elasticsearch
1315
from elasticsearch.exceptions import ConnectionError as ElasticConnectionError, ElasticsearchException
1416
from elasticsearch_dsl import (
1517
MultiSearch, Q, Search,
1618
)
1719
from elasticsearch_dsl.query import MultiMatch
1820
from elasticsearch_dsl.response import Response
19-
from elasticsearch_dsl.utils import AttrDict, AttrList
21+
from elasticsearch_dsl.utils import AttrList
2022
from werkzeug.exceptions import InternalServerError
2123

22-
from search_service.proxy.es_proxy_utils import Resource
24+
from search_service.proxy.es_proxy_utils import Resource, create_search_response
2325

2426
LOGGER = logging.getLogger(__name__)
2527

@@ -237,56 +239,6 @@ def _build_elasticsearch_query(self, *,
237239

238240
return es_query
239241

240-
def _format_response(self, page_index: int,
241-
results_per_page: int,
242-
responses: List[Response],
243-
resource_types: List[Resource]) -> SearchResponse:
244-
resource_types_str = [r.name.lower() for r in resource_types]
245-
no_results_for_resource = {
246-
"results": [],
247-
"total_results": 0
248-
}
249-
results_per_resource = {resource: no_results_for_resource for resource in resource_types_str}
250-
251-
for r in responses:
252-
if r.success():
253-
if len(r.hits.hits) > 0:
254-
resource_type = r.hits.hits[0]._source['resource_type']
255-
results = []
256-
for search_result in r.hits.hits:
257-
# mapping gives all the fields in the response
258-
result = {}
259-
fields = self.RESOUCE_TO_MAPPING[Resource[resource_type.upper()]]
260-
for f in fields.keys():
261-
# remove "keyword" from mapping value
262-
field = fields[f].split('.')[0]
263-
try:
264-
result_for_field = search_result._source[field]
265-
# AttrList and AttrDict are not json serializable
266-
if type(result_for_field) is AttrList:
267-
result_for_field = list(result_for_field)
268-
elif type(result_for_field) is AttrDict:
269-
result_for_field = result_for_field.to_dict()
270-
result[f] = result_for_field
271-
except KeyError:
272-
logging.debug(f'Field: {field} missing in search response.')
273-
pass
274-
result["search_score"] = search_result._score
275-
results.append(result)
276-
# replace empty results with actual results
277-
results_per_resource[resource_type] = {
278-
"results": results,
279-
"total_results": r.hits.total.value
280-
}
281-
else:
282-
raise InternalServerError(f"Request to Elasticsearch failed: {r.failures}")
283-
284-
return SearchResponse(msg="Success",
285-
page_index=page_index,
286-
results_per_page=results_per_page,
287-
results=results_per_resource,
288-
status_code=200)
289-
290242
def execute_queries(self, queries: Dict[Resource, Q],
291243
page_index: int,
292244
results_per_page: int) -> List[Response]:
@@ -316,8 +268,9 @@ def search(self, *,
316268
page_index: int,
317269
results_per_page: int,
318270
resource_types: List[Resource],
319-
filters: List[Filter]) -> SearchResponse:
320-
if resource_types == []:
271+
filters: List[Filter],
272+
highlight_options: Dict[Resource, HighlightOptions]) -> SearchResponse:
273+
if not resource_types:
321274
# if resource types are not defined then search all resources
322275
resource_types = self.PRIMARY_ENTITIES
323276

@@ -332,10 +285,11 @@ def search(self, *,
332285
page_index=page_index,
333286
results_per_page=results_per_page)
334287

335-
formatted_response = self._format_response(page_index=page_index,
336-
results_per_page=results_per_page,
337-
responses=responses,
338-
resource_types=resource_types)
288+
formatted_response = create_search_response(page_index=page_index,
289+
results_per_page=results_per_page,
290+
responses=responses,
291+
resource_types=resource_types,
292+
resource_to_field_mapping=self.RESOUCE_TO_MAPPING)
339293

340294
return formatted_response
341295

0 commit comments

Comments
 (0)