Skip to content

feat: added ngram subfield with no stemming on ES mappings #1895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 5, 2022
96 changes: 71 additions & 25 deletions databuilder/databuilder/task/search/document_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,31 +57,55 @@ class Analyzer:
Filter.english_stop,
Filter.english_stemmer])


class Subfield:
    """Reusable subfield definitions for the search document mappings.

    Each attribute is a combination of a field type and an analyzer, meant to
    be plugged into a field's ``fields={...}`` mapping for additional
    index-time analysis. The ``*_multi`` variants are the same definitions
    created with ``multi=True``.
    """

    keyword = Keyword()

    alphanumeric = Text(
        analyzer=Analyzer.alphanum_analyzer,
        term_vector=POSITIONS_OFFSETS,
    )

    general = Text(
        analyzer=Analyzer.general_analyzer,
        term_vector=POSITIONS_OFFSETS,
    )

    general_multi = Text(
        multi=True,
        analyzer=Analyzer.general_analyzer,
        term_vector=POSITIONS_OFFSETS,
    )

    alphanumeric_multi = Text(
        multi=True,
        analyzer=Analyzer.alphanum_analyzer,
        term_vector=POSITIONS_OFFSETS,
    )

# Resource Mappings


class SearchableResource(Document):
# For better understanding of field type rationale read "Mapping unstructured content"
# https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type
key = Text(required=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
description = Text(analyzer=Analyzer.english_analyzer,
fields={"alphanumeric": Text(analyzer=Analyzer.alphanum_analyzer,
term_vector=POSITIONS_OFFSETS)
},
fields={
"alphanumeric": Subfield.alphanumeric,
"general": Subfield.general
},
term_vector=POSITIONS_OFFSETS)
badges = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
tags = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
usage = RankFeatures()
Expand All @@ -95,65 +119,81 @@ class Meta:
class Table(SearchableResource):
columns = Text(multi=True,
fields={
"keyword": Keyword(),
"general": Text(multi=True,
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
"keyword": Subfield.keyword,
"general": Subfield.general_multi
},
term_vector=POSITIONS_OFFSETS,
analyzer=Analyzer.stemming_analyzer)
display_name = Text(required=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
database = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword
},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
cluster = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword
},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
schema = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
column_descriptions = Text(multi=True,
fields={
"alphanumeric": Text(multi=True,
analyzer=Analyzer.alphanum_analyzer,
term_vector=POSITIONS_OFFSETS)
"alphanumeric": Subfield.alphanumeric_multi,
"general": Subfield.general_multi
},
analyzer=Analyzer.english_analyzer,
term_vector=POSITIONS_OFFSETS)


class Dashboard(SearchableResource):
group_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
group_description = Text(analyzer=Analyzer.english_analyzer,
term_vector=POSITIONS_OFFSETS)
query_names = Text(multi=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
chart_names = Text(multi=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)


class Feature(SearchableResource):
feature_group = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Subfield.keyword,
"general": Subfield.general_multi
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
version = Keyword(required=True)
status = Keyword()
entity = Text(multi=True,
fields={"keyword": Keyword()},
fields={"keyword": Subfield.keyword},
analyzer=Analyzer.general_analyzer,
term_vector=POSITIONS_OFFSETS)
availability = Keyword()
Expand All @@ -164,11 +204,17 @@ class User(SearchableResource):
# name is full name, no separate first and last name
# total read, total own, total follow goes under usage metrics
first_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Keyword(),
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)
last_name = Text(required=True,
fields={"keyword": Keyword()},
fields={
"keyword": Keyword(),
"general": Subfield.general
},
analyzer=Analyzer.stemming_analyzer,
term_vector=POSITIONS_OFFSETS)

Expand Down
2 changes: 1 addition & 1 deletion databuilder/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from setuptools import find_packages, setup

__version__ = '6.8.0'
__version__ = '6.9.0'

requirements_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'requirements.txt')
with open(requirements_path) as requirements_file:
Expand Down