Skip to content

feat: added ngram subfield with no stemming on ES mappings #1895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged — 10 commits merged into the base branch on Jul 5, 2022
154 changes: 129 additions & 25 deletions databuilder/databuilder/task/search/document_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,31 +57,88 @@ class Analyzer:
Filter.english_stop,
Filter.english_stemmer])


class Subfield:
    """Reusable field-type/analyzer combinations applied as subfields at index time."""

    # exact-match subfield (no analysis applied)
    keyword = Keyword()

    # alphanumeric tokenization, single- and multi-valued variants
    alphanumeric = Text(analyzer=Analyzer.alphanum_analyzer,
                        term_vector=POSITIONS_OFFSETS)
    alphanumeric_multi = Text(multi=True,
                              analyzer=Analyzer.alphanum_analyzer,
                              term_vector=POSITIONS_OFFSETS)

    # general-purpose tokenization, single- and multi-valued variants
    general = Text(analyzer=Analyzer.general_analyzer,
                   term_vector=POSITIONS_OFFSETS)
    general_multi = Text(multi=True,
                         analyzer=Analyzer.general_analyzer,
                         term_vector=POSITIONS_OFFSETS)

    @staticmethod
    def get_ngram_subfield(field_name: str,
                           multi: bool = False,
                           min_shingle_size: int = 2,
                           max_shingle_size: int = 5,
                           token_separator: str = ' ') -> Text:
        """Build a Text subfield that emits word-level ngrams for *field_name*.

        Word-level ngrams are produced with the shingle token filter:
        https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-shingle-tokenfilter.html

        The filter and analyzer are named after *field_name* so each field
        gets its own uniquely-named analysis chain in the index settings.
        """
        word_ngram_filter = token_filter(
            f"shingle_filter_{field_name}",
            type="shingle",
            output_unigrams=True,
            min_shingle_size=min_shingle_size,
            max_shingle_size=max_shingle_size,
            token_separator=token_separator,
        )

        field_analyzer = analysis.analyzer(
            f"ngram_analyzer_{field_name}",
            tokenizer=Tokenizer.alphanum_tokenizer,
            filter=[word_ngram_filter],
        )

        return Text(multi=multi,
                    analyzer=field_analyzer,
                    term_vector=POSITIONS_OFFSETS)


# Resource Mappings

# Note: the current analyzers don't support tokenizing on camelcase text, if this is a requirement extend
# these classes and write a custom analyzer
# https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-pattern-analyzer.html


class SearchableResource(Document):
# For better understanding of field type rationale read "Mapping unstructured content"
# https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type
key = Text(required=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general,
"ngram": Subfield.get_ngram_subfield(
field_name="resource_name",
max_shingle_size=8
)
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
description = Text(analyzer=Analyzer.english_analyzer,
fields={"alphanumeric": Text(analyzer=Analyzer.alphanum_analyzer,
term_vector=POSITIONS_OFFSETS)
},
fields={
"alphanumeric": Subfield.alphanumeric,
"general": Subfield.general
},
term_vector=POSITIONS_OFFSETS)
badges = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
tags = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
usage = RankFeatures()
Expand All @@ -93,67 +150,108 @@ class Meta:


class Table(SearchableResource):
# overwrite table name because it requires a different ngram subfield
name = Text(required=True,
fields={
"keyword": Subfield.keyword,
"general": Subfield.general,
"ngram": Subfield.get_ngram_subfield(
field_name="table_name",
max_shingle_size=8,
token_separator="_"
)
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
columns = Text(multi=True,
fields={
"keyword": Keyword(),
"general": Text(multi=True,
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
"keyword": Subfield.keyword,
"general": Subfield.general_multi,
"ngram": Subfield.get_ngram_subfield(
field_name="table_columns",
multi=True,
token_separator="_")
},
term_vector=POSITIONS_OFFSETS,
analyzer=Analyzer.stemming_analyzer)
display_name = Text(required=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
database = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword
},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
cluster = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword
},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
schema = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
column_descriptions = Text(multi=True,
fields={
"alphanumeric": Text(multi=True,
analyzer=Analyzer.alphanum_analyzer,
term_vector=POSITIONS_OFFSETS)
"alphanumeric": Subfield.alphanumeric_multi,
"general": Subfield.general_multi
},
analyzer=Analyzer.english_analyzer,
term_vector=POSITIONS_OFFSETS)


class Dashboard(SearchableResource):
group_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
group_description = Text(analyzer=Analyzer.english_analyzer,
term_vector=POSITIONS_OFFSETS)
query_names = Text(multi=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi,
"ngram": Subfield.get_ngram_subfield(
field_name="dashboard_query",
multi=True
)
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
chart_names = Text(multi=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi,
"ngram": Subfield.get_ngram_subfield(
field_name="dashboard_chart",
multi=True
)
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)


class Feature(SearchableResource):
feature_group = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
version = Keyword(required=True)
status = Keyword()
entity = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
availability = Keyword()
Expand All @@ -164,11 +262,17 @@ class User(SearchableResource):
# name is full name, no separate first and last name
# total read, total own, total follow goes under usage metrics
first_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Keyword(),
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
last_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Keyword(),
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,10 @@ def run(self) -> None:
LOGGER.info(f"Creating ES index {self.elasticsearch_new_index}")
index = Index(name=self.elasticsearch_new_index, using=self.elasticsearch_client)
index.document(self.document_mapping)

# allow for longer ngram length
index.settings(max_shingle_diff=10)

index.create()

# publish search metadata to ES
Expand Down
2 changes: 1 addition & 1 deletion databuilder/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from setuptools import find_packages, setup

__version__ = '6.9.0'
__version__ = '6.10.0'

requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
'requirements.txt')
Expand Down