diff --git a/databuilder/databuilder/task/search/document_mappings.py b/databuilder/databuilder/task/search/document_mappings.py index 336eab0f29..909a4b6127 100644 --- a/databuilder/databuilder/task/search/document_mappings.py +++ b/databuilder/databuilder/task/search/document_mappings.py @@ -57,31 +57,88 @@ class Analyzer: Filter.english_stop, Filter.english_stemmer]) + +class Subfield: + # combinations of field types and analyzers for additional index time analysis + + keyword = Keyword() + + alphanumeric = Text(analyzer=Analyzer.alphanum_analyzer, + term_vector=POSITIONS_OFFSETS) + + alphanumeric_multi = Text(multi=True, + analyzer=Analyzer.alphanum_analyzer, + term_vector=POSITIONS_OFFSETS) + + general = Text(analyzer=Analyzer.general_analyzer, + term_vector=POSITIONS_OFFSETS) + + general_multi = Text(multi=True, + analyzer=Analyzer.general_analyzer, + term_vector=POSITIONS_OFFSETS) + + @staticmethod + def get_ngram_subfield(field_name: str, + multi: bool = False, + min_shingle_size: int = 2, + max_shingle_size: int = 5, + token_separator: str = ' ') -> Text: + + # using shingle token filter for word level ngrams + # https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-shingle-tokenfilter.html + shingle_filter = token_filter(f"shingle_filter_{field_name}", + type="shingle", + output_unigrams=True, + min_shingle_size=min_shingle_size, + max_shingle_size=max_shingle_size, + token_separator=token_separator) + + ngram_analyzer = analysis.analyzer(f"ngram_analyzer_{field_name}", + tokenizer=Tokenizer.alphanum_tokenizer, + filter=[shingle_filter]) + + return Text(multi=multi, + analyzer=ngram_analyzer, + term_vector=POSITIONS_OFFSETS) + + # Resource Mappings +# Note: the current analyzers don't support tokenizing on camelcase text, if this is a requirement extend +# these classes and write a custom analyzer +# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html + class SearchableResource(Document): # For 
better understanding of field type rationale read "Mapping unstructured content" # https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type key = Text(required=True, - fields={"keyword": Keyword()}, + fields={"keyword": Subfield.keyword}, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) name = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general, + "ngram": Subfield.get_ngram_subfield( + field_name="resource_name", + max_shingle_size=8 + ) + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) description = Text(analyzer=Analyzer.english_analyzer, - fields={"alphanumeric": Text(analyzer=Analyzer.alphanum_analyzer, - term_vector=POSITIONS_OFFSETS) - }, + fields={ + "alphanumeric": Subfield.alphanumeric, + "general": Subfield.general + }, term_vector=POSITIONS_OFFSETS) badges = Text(multi=True, - fields={"keyword": Keyword()}, + fields={"keyword": Subfield.keyword}, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) tags = Text(multi=True, - fields={"keyword": Keyword()}, + fields={"keyword": Subfield.keyword}, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) usage = RankFeatures() @@ -93,36 +150,57 @@ class Meta: class Table(SearchableResource): + # overwrite table name because it requires a different ngram subfield + name = Text(required=True, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general, + "ngram": Subfield.get_ngram_subfield( + field_name="table_name", + max_shingle_size=8, + token_separator="_" + ) + }, + analyzer=Analyzer.stemming_analyzer, + term_vector=POSITIONS_OFFSETS) columns = Text(multi=True, fields={ - "keyword": Keyword(), - "general": Text(multi=True, - analyzer=Analyzer.general_analyzer, - term_vector=POSITIONS_OFFSETS) + "keyword": Subfield.keyword, + "general": Subfield.general_multi, + "ngram": Subfield.get_ngram_subfield( + 
field_name="table_columns", + multi=True, + token_separator="_") }, term_vector=POSITIONS_OFFSETS, analyzer=Analyzer.stemming_analyzer) display_name = Text(required=True, - fields={"keyword": Keyword()}, + fields={"keyword": Subfield.keyword}, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) database = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword + }, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) cluster = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword + }, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) schema = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) column_descriptions = Text(multi=True, fields={ - "alphanumeric": Text(multi=True, - analyzer=Analyzer.alphanum_analyzer, - term_vector=POSITIONS_OFFSETS) + "alphanumeric": Subfield.alphanumeric_multi, + "general": Subfield.general_multi }, analyzer=Analyzer.english_analyzer, term_vector=POSITIONS_OFFSETS) @@ -130,30 +208,50 @@ class Table(SearchableResource): class Dashboard(SearchableResource): group_name = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) group_description = Text(analyzer=Analyzer.english_analyzer, term_vector=POSITIONS_OFFSETS) query_names = Text(multi=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general_multi, + "ngram": Subfield.get_ngram_subfield( + field_name="dashboard_query", + multi=True + ) + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) chart_names = Text(multi=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": 
Subfield.general_multi, + "ngram": Subfield.get_ngram_subfield( + field_name="dashboard_chart", + multi=True + ) + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) class Feature(SearchableResource): feature_group = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) version = Keyword(required=True) status = Keyword() entity = Text(multi=True, - fields={"keyword": Keyword()}, + fields={"keyword": Subfield.keyword}, analyzer=Analyzer.general_analyzer, term_vector=POSITIONS_OFFSETS) availability = Keyword() @@ -164,11 +262,17 @@ class User(SearchableResource): # name is full name, no separate first and last name # total read, total own, total follow goes under usage metrics first_name = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) last_name = Text(required=True, - fields={"keyword": Keyword()}, + fields={ + "keyword": Subfield.keyword, + "general": Subfield.general + }, analyzer=Analyzer.stemming_analyzer, term_vector=POSITIONS_OFFSETS) diff --git a/databuilder/databuilder/task/search/search_metadata_to_elasticsearch_task.py b/databuilder/databuilder/task/search/search_metadata_to_elasticsearch_task.py index 0c60a88c90..ed7e32d97b 100644 --- a/databuilder/databuilder/task/search/search_metadata_to_elasticsearch_task.py +++ b/databuilder/databuilder/task/search/search_metadata_to_elasticsearch_task.py @@ -153,6 +153,10 @@ def run(self) -> None: LOGGER.info(f"Creating ES index {self.elasticsearch_new_index}") index = Index(name=self.elasticsearch_new_index, using=self.elasticsearch_client) index.document(self.document_mapping) + + # allow for longer ngram length + index.settings(max_shingle_diff=10) + index.create() # publish search metadata to ES diff --git
a/databuilder/setup.py b/databuilder/setup.py index f62cf78084..687b4532ec 100644 --- a/databuilder/setup.py +++ b/databuilder/setup.py @@ -5,7 +5,7 @@ from setuptools import find_packages, setup -__version__ = '6.9.0' +__version__ = '6.10.0' requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')