Skip to content

Commit

Permalink
port facet grouping
Browse files Browse the repository at this point in the history
  • Loading branch information
willronchetti committed Jan 23, 2025
1 parent b51a8e1 commit b6898d8
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 16 deletions.
9 changes: 8 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ snovault
Change Log
----------

11.23.0.1b0
===========

* Ports ``group_by_field`` faceting feature from Fourfront



11.23.0
=======
* 2024-11-02/dmichaels
Expand All @@ -18,7 +25,7 @@ Change Log
- Fix in snovault/tests/elasticsearch_fixture.py (use only for local/dev deploy) for
strange (new as of 2024-09-02) behavior where it was hanging on startup during
ElasticSearch index mapping creation, related to ElasticSearch logging output,
and the way we were using subprocess.Popen and reading the subprocess output;
and the way we were using subprocess.Popen and reading the subprocess output;
more correct way is to inherit stdout/stderr of the parent.


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicsnovault"
version = "11.23.0"
version = "11.23.0.1b0"
description = "Storage support for 4DN Data Portals."
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down
71 changes: 63 additions & 8 deletions snovault/search/lucene_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,27 @@ def handle_should_query(field_name, options):
should_query = {BOOL: {SHOULD: {TERMS: {field_name: options}}}}
return should_query

@staticmethod
def create_field_filters(field_filters):
    """ Ported (essentially) from Fourfront to implement the group by facet
        terms aggregation.

        Converts the field_filters mapping into a single bool filter holding one
        'terms' clause per field that has a non-empty list of terms.

        :param field_filters: Dict keyed by query_field whose values are dicts
                              containing 'must_terms' and 'must_not_terms' lists
        :return: Dict of the form {'bool': {'must': [...], 'must_not': [...]}}
    """
    must_filters, must_not_filters = [], []
    for query_field, filters in field_filters.items():
        # The clauses must be collected per field, inside the loop - appending
        # after the loop would keep only the terms of the last field iterated.
        if filters['must_terms']:
            must_filters.append({'terms': {query_field: filters['must_terms']}})
        if filters['must_not_terms']:
            must_not_filters.append({'terms': {query_field: filters['must_not_terms']}})

    # Built once at the end so an empty field_filters still yields a valid (empty) bool filter
    return {'bool': {'must': must_filters, 'must_not': must_not_filters}}


@classmethod
def build_sub_queries(cls, field_filters, es_mapping):
"""
Expand Down Expand Up @@ -381,8 +402,8 @@ def handle_range_filters(cls, request, result, field_filters, doc_types):

return range_filters

@staticmethod
def initialize_field_filters(request, principals, doc_types):
@classmethod
def initialize_field_filters(cls, request, principals, doc_types):
""" Helper function for build_filters
Initializes field filters with filters that exist on all searches, does some basic updates
"""
Expand Down Expand Up @@ -416,7 +437,10 @@ def initialize_field_filters(request, principals, doc_types):
if 'OntologyTerm' not in doc_types:
field_filters['[email protected]']['must_not_terms'].append('OntologyTerm')

return field_filters
# base filters only includes principals, doc_type and excludes some status and item types
# it is essentially useful for the group by facet terms aggregation
base_field_filters = cls.create_field_filters(deepcopy(field_filters))
return field_filters, base_field_filters

@staticmethod
def build_nested_query(nested_path, query):
Expand Down Expand Up @@ -569,7 +593,7 @@ def build_filters(cls, request, query, result, principals, doc_types, es_mapping

# these next two dictionaries should each have keys equal to query_field
# and values: must_terms: [<list of terms>], must_not_terms: [<list of terms>], add_no_value: True/False/None
field_filters = cls.initialize_field_filters(request, principals, doc_types)
field_filters, base_field_filters = cls.initialize_field_filters(request, principals, doc_types,)
range_filters = cls.handle_range_filters(request, result, field_filters, doc_types)

# construct queries
Expand All @@ -588,7 +612,7 @@ def build_filters(cls, request, query, result, principals, doc_types, es_mapping

# at this point, final_filters is valid lucene and can be dropped into the query directly
query[QUERY][BOOL][FILTER] = final_filters
return query, final_filters
return query, final_filters, base_field_filters

@staticmethod
def _check_and_remove(compare_field, facet_filters, active_filter, query_field, filter_type):
Expand Down Expand Up @@ -969,7 +993,7 @@ def _build_terms_aggregation(query_field, facet, requested_values=None, nested=F

@classmethod
def _add_terms_aggregation(cls, facet, query_field, search_filters, string_query, nested_path, aggs, agg_name,
requested_values):
requested_values, base_field_filters):
""" Builds a standard terms aggregation, setting a nested identifier to be repaired later
by elasticsearch_dsl, adding it to the given aggs.
Expand All @@ -981,6 +1005,7 @@ def _add_terms_aggregation(cls, facet, query_field, search_filters, string_query
:param aggs: the aggregation object we are building
:param agg_name: name of the aggregation we are building
:param requested_values: values for this terms agg we requested (to be explicitly included)
:param base_field_filters: Dict of filters on base fields for use with group by terms facet
"""
is_nested = nested_path is not None
if is_nested:
Expand Down Expand Up @@ -1032,9 +1057,38 @@ def _add_terms_aggregation(cls, facet, query_field, search_filters, string_query
FILTER: facet_filters,
}

# This comment is taken from Fourfront and represents the last remnant of divergence in behavior
# in search across the portals. Note that such terms query will *not* work on nested fields,
# but this is not likely to be a big issue as it's an uncommon scenario -Will 23 Jan 2024
# add extra ES sub-query to fetch facet terms and their grouping terms to build
# parent - child hierarchy (we always build full map, since the implementation in
# https://github.com/4dn-dcic/fourfront/blob/dc47659487aec88fb0c19145e48ebbd20588eba3/src/encoded/search.py
# fails when there are selected terms in filters but not listed in facets)
# Note: This aggregation is used in group_facet_terms func.
if 'group_by_field' in facet and base_field_filters:
aggs[facet['aggregation_type'] + ":" + agg_name + ":group_by"] = {
'aggs': {
"primary_agg": {
"terms": {
'size': 100,
'field': "embedded." + facet['group_by_field'] + ".raw",
'missing': facet.get("missing_value_replacement", "No value"),
'aggs': {
"sub_terms": {
"terms": {
"field": query_field,
}
}
}
}
}
},
'filter': {'bool': deepcopy(base_field_filters['bool'])},
}

@classmethod
def build_facets(cls, query, facets, search_filters, string_query, request, doc_types,
custom_aggregations=None, size=25, from_=0, es_mapping=None):
custom_aggregations=None, size=25, from_=0, es_mapping=None, base_field_filters=None):
"""
Sets facets in the query as ElasticSearch aggregations, with each aggregation to be
filtered by search_filters minus filter affecting facet field in order to get counts
Expand All @@ -1044,6 +1098,7 @@ def build_facets(cls, query, facets, search_filters, string_query, request, doc_
:type facets: List of tuples.
:param search_filters: Dict of filters which are set for the ES query in build_filters
:param string_query: Dict holding the query_string used in the search
:param base_field_filters: Dict of filters for use with the group by term facet
"""
if from_ != 0:
return query
Expand All @@ -1066,7 +1121,7 @@ def build_facets(cls, query, facets, search_filters, string_query, request, doc_
aggs, agg_name)
else: # assume terms
cls._add_terms_aggregation(facet, query_field, search_filters, string_query, nested_path,
aggs, agg_name, requested_values)
aggs, agg_name, requested_values, base_field_filters)

# Update facet with title, description from field_schema, if missing.
if facet.get('title') is None and field_schema and 'title' in field_schema:
Expand Down
12 changes: 6 additions & 6 deletions snovault/search/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ def initialize_facets(self):
else:
facets = [
# adds default 'type' facet with hide_from_view=True
# Note that the 'hide_from_view=True' facet is included in context.facets whereas the 'default_hidden=True' is ignored
# Note that the 'hide_from_view=True' facet is included in context.facets whereas the 'default_hidden=True' is ignored
('type', {'title': 'Data Type', 'hide_from_view': True})
]

Expand Down Expand Up @@ -748,16 +748,16 @@ def build_search_query(self):
self.set_sort_order()

# Transform into filtered search
self.query, query_filters = LuceneBuilder.build_filters(self.request, self.query, self.response,
self.principals, self.doc_types,
self.item_type_es_mapping)
self.query, query_filters, base_field_filters = LuceneBuilder.build_filters(self.request, self.query, self.response,
self.principals, self.doc_types,
self.item_type_es_mapping)
# Prepare facets in intermediary structure
self.facets = self.initialize_facets()

# Transform filter search into filter + faceted search
self.query = LuceneBuilder.build_facets(self.query, self.facets, query_filters, self.string_query,
self.request, self.doc_types, self.custom_aggregations, self.size,
self.from_, self.item_type_es_mapping)
self.from_, self.item_type_es_mapping, base_field_filters)

# Add preference from session, if available
# This just sets the value on the class - it is passed to execute_search later
Expand Down Expand Up @@ -1208,7 +1208,7 @@ def get_total(self, es_results):
'''
# default value returned by ES
total = es_results['hits']['total']['value']

# After ES7 upgrade, 'total' does not return the exact count if it is >10000. To get a more precise result, it
# loops through the facet terms. (currently, type=Item's doc_count is calculated correctly)
if total == ES_MAX_HIT_TOTAL and 'facets' in self.response:
Expand Down

0 comments on commit b6898d8

Please sign in to comment.