diff --git a/api/src/adapters/search/opensearch_query_builder.py b/api/src/adapters/search/opensearch_query_builder.py index 4aa4e07e5..a7e900c4d 100644 --- a/api/src/adapters/search/opensearch_query_builder.py +++ b/api/src/adapters/search/opensearch_query_builder.py @@ -1,3 +1,4 @@ +import datetime import typing from src.pagination.pagination_models import SortDirection @@ -17,6 +18,7 @@ class SearchQueryBuilder: * Sorted by relevancy score descending * Scored on titles containing "king" * Where the author is one of Brandon Sanderson or J R.R. Tolkien + * With a page count between 300 and 1000 * Returning aggregate counts of books by those authors in the full results This query could either be built manually and look like: @@ -52,6 +54,12 @@ class SearchQueryBuilder: "Brandon Sanderson", "J R.R. Tolkien" ] + }, + "range": { + "publication_date": { + "gte": 300, + "lte": 1000 + } } } ] @@ -75,6 +83,7 @@ class SearchQueryBuilder: .sort_by([("relevancy", SortDirection.DESCENDING)]) .simple_query("king", fields=["title.keyword"]) .filter_terms("author.keyword", terms=["Brandon Sanderson", "J R.R. Tolkien"]) + .filter_int_range("page_count", 300, 1000) .aggregation_terms(aggregation_name="author", field_name="author.keyword", minimum_count=0) .build() """ @@ -150,6 +159,54 @@ def filter_terms(self, field: str, terms: list) -> typing.Self: self.filters.append({"terms": {field: terms}}) return self + def filter_int_range( + self, field: str, min_value: int | None, max_value: int | None + ) -> typing.Self: + """ + For a given field, filter results to a range of integer values. + + If min or max is not provided, the range is unbounded and only + affects the minimum or maximum possible value. At least one min or max value must be specified. + + These filters do not affect the relevancy score, they are purely + a binary filter on the overall results. + """ + if min_value is None and max_value is None: + raise ValueError("Cannot use int range filter if both min and max are None") + + range_filter = {} + if min_value is not None: + range_filter["gte"] = min_value + if max_value is not None: + range_filter["lte"] = max_value + + self.filters.append({"range": {field: range_filter}}) + return self + + def filter_date_range( + self, field: str, start_date: datetime.date | None, end_date: datetime.date | None + ) -> typing.Self: + """ + For a given field, filter results to a range of dates. + + If start or end is not provided, the range is unbounded and only + affects the start or end date. At least one start or end date must be specified. + + These filters do not affect the relevancy score, they are purely + a binary filter on the overall results. + """ + if start_date is None and end_date is None: + raise ValueError("Cannot use date range filter if both start and end are None") + + range_filter = {} + if start_date is not None: + range_filter["gte"] = start_date.isoformat() + if end_date is not None: + range_filter["lte"] = end_date.isoformat() + + self.filters.append({"range": {field: range_filter}}) + return self + def aggregation_terms( self, aggregation_name: str, field_name: str, size: int = 25, minimum_count: int = 1 ) -> typing.Self: diff --git a/api/tests/src/adapters/search/test_opensearch_query_builder.py b/api/tests/src/adapters/search/test_opensearch_query_builder.py index 731c0e6ca..33f9b2b19 100644 --- a/api/tests/src/adapters/search/test_opensearch_query_builder.py +++ b/api/tests/src/adapters/search/test_opensearch_query_builder.py @@ -1,4 +1,5 @@ import uuid +from datetime import date import pytest @@ -12,6 +13,7 @@ "author": "Brandon Sanderson", "in_stock": True, "page_count": 1007, + "publication_date": "2010-08-31", } WORDS_OF_RADIANCE = { "id": 2, @@ -19,6 +21,7 @@ "author": "Brandon Sanderson", "in_stock": False, "page_count": 1087, + "publication_date": "2014-03-04", } OATHBRINGER = { "id": 3, @@ -26,6 +29,7 @@ "author": "Brandon Sanderson", "in_stock": True, "page_count": 1248, + "publication_date": "2017-11-14", } RHYTHM_OF_WAR = { "id": 4, @@ -33,6 +37,7 @@ "author": "Brandon Sanderson", "in_stock": False, "page_count": 1232, + "publication_date": "2020-11-17", } GAME_OF_THRONES = { "id": 5, @@ -40,6 +45,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 694, + "publication_date": "1996-08-01", } CLASH_OF_KINGS = { "id": 6, @@ -47,6 +53,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 768, + "publication_date": "1998-11-16", } STORM_OF_SWORDS = { "id": 7, @@ -54,6 +61,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 973, + "publication_date": "2000-08-08", } FEAST_FOR_CROWS = { "id": 8, @@ -61,6 +69,7 @@ "author": "George R.R. Martin", "in_stock": True, "page_count": 753, + "publication_date": "2005-10-17", } DANCE_WITH_DRAGONS = { "id": 9, @@ -68,6 +77,7 @@ "author": "George R.R. Martin", "in_stock": False, "page_count": 1056, + "publication_date": "2011-07-12", } FELLOWSHIP_OF_THE_RING = { "id": 10, @@ -75,6 +85,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 423, + "publication_date": "1954-07-29", } TWO_TOWERS = { "id": 11, @@ -82,6 +93,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 352, + "publication_date": "1954-11-11", } RETURN_OF_THE_KING = { "id": 12, @@ -89,6 +101,7 @@ "author": "J R.R. Tolkien", "in_stock": True, "page_count": 416, + "publication_date": "1955-10-20", } FULL_DATA = [ @@ -120,7 +133,9 @@ def validate_valid_request( f"Request generated was invalid and caused an error in search client: {json_value}" ) - assert resp.records == expected_results + assert ( + resp.records == expected_results + ), f"{[record['title'] for record in resp.records]} != {[expected['title'] for expected in expected_results]}" if expected_aggregations is not None: assert resp.aggregations == expected_aggregations @@ -364,6 +379,131 @@ def test_query_builder_filter_terms( validate_valid_request(search_client, search_index, builder, expected_results) + @pytest.mark.parametrize( + "start_date,end_date,expected_results", + [ + # Date range that will include all results + (date(1900, 1, 1), date(2050, 1, 1), FULL_DATA), + # Start only date range that will get all results + (date(1950, 1, 1), None, FULL_DATA), + # End only date range that will get all results + (None, date(2025, 1, 1), FULL_DATA), + # Range that filters to just oldest + ( + date(1950, 1, 1), + date(1960, 1, 1), + [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING], + ), + # Unbounded range for oldest few + (None, date(1990, 1, 1), [FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded range for newest few + (date(2011, 8, 1), None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR]), + # Selecting a few in the middle + ( + date(2005, 1, 1), + date(2014, 1, 1), + [WAY_OF_KINGS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS], + ), + # Exact date + (date(1954, 7, 29), date(1954, 7, 29), [FELLOWSHIP_OF_THE_RING]), + # None fetched in range + (date(1981, 1, 1), date(1989, 1, 1), []), + ], + ) + def test_query_builder_filter_date_range( + self, search_client, search_index, start_date, end_date, expected_results + ): + builder = ( + SearchQueryBuilder() + .sort_by([]) + .filter_date_range("publication_date", start_date, end_date) + ) + + expected_ranges = {} + if start_date is not None: + expected_ranges["gte"] = start_date.isoformat() + if end_date is not None: + expected_ranges["lte"] = end_date.isoformat() + + expected_query = { + "size": 25, + "from": 0, + "track_scores": True, + "query": {"bool": {"filter": [{"range": {"publication_date": expected_ranges}}]}}, + } + + assert builder.build() == expected_query + + validate_valid_request(search_client, search_index, builder, expected_results) + + @pytest.mark.parametrize( + "min_value,max_value,expected_results", + [ + # All fetched + (1, 2000, FULL_DATA), + # None fetched + (2000, 3000, []), + # "Short" books + (300, 700, [GAME_OF_THRONES, FELLOWSHIP_OF_THE_RING, TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded short + (None, 416, [TWO_TOWERS, RETURN_OF_THE_KING]), + # Unbounded long + (1050, None, [WORDS_OF_RADIANCE, OATHBRINGER, RHYTHM_OF_WAR, DANCE_WITH_DRAGONS]), + # Middle length + ( + 500, + 1010, + [WAY_OF_KINGS, GAME_OF_THRONES, CLASH_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS], + ), + ], + ) + def test_query_builder_filter_int_range( + self, search_client, search_index, min_value, max_value, expected_results + ): + builder = ( + SearchQueryBuilder().sort_by([]).filter_int_range("page_count", min_value, max_value) + ) + + expected_ranges = {} + if min_value is not None: + expected_ranges["gte"] = min_value + if max_value is not None: + expected_ranges["lte"] = max_value + + expected_query = { + "size": 25, + "from": 0, + "track_scores": True, + "query": {"bool": {"filter": [{"range": {"page_count": expected_ranges}}]}}, + } + + assert builder.build() == expected_query + + validate_valid_request(search_client, search_index, builder, expected_results) + + def test_multiple_ranges(self, search_client, search_index): + # Sanity test that we can specify multiple ranges (in this case, a date + int range) + # in the same query + builder = ( + SearchQueryBuilder() + .sort_by([]) + .filter_int_range("page_count", 600, 1100) + .filter_date_range("publication_date", date(2000, 1, 1), date(2013, 1, 1)) + ) + + expected_results = [WAY_OF_KINGS, STORM_OF_SWORDS, FEAST_FOR_CROWS, DANCE_WITH_DRAGONS] + validate_valid_request( + search_client, search_index, builder, expected_results=expected_results + ) + + def test_filter_int_range_both_none(self): + with pytest.raises(ValueError, match="Cannot use int range filter"): + SearchQueryBuilder().filter_int_range("test_field", None, None) + + def test_filter_date_range_both_none(self): + with pytest.raises(ValueError, match="Cannot use date range filter"): + SearchQueryBuilder().filter_date_range("test_field", None, None) + @pytest.mark.parametrize( "query,fields,expected_results,expected_aggregations", [