Skip to content

Commit 42e56d1

Browse files
authored
[releases/2.19] Fix lexical search required term issue (#1221)
Fix a bug in lexical query generation for required terms
1 parent acc9b48 commit 42e56d1

File tree

6 files changed

+204
-16
lines changed

6 files changed

+204
-16
lines changed

src/marqo/core/structured_vespa_index/structured_vespa_index.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -908,17 +908,17 @@ def _get_lexical_search_term(self, marqo_query: MarqoLexicalQuery, is_facets_ter
908908

909909
return f'{or_terms}{and_terms}'
910910

911-
def _get_lexical_contains_term(self, phrase, query: MarqoQuery) -> str:
911+
def _get_lexical_contains_term(self, phrase:str, query: MarqoQuery) -> str:
912912
if isinstance(query, MarqoHybridQuery):
913913
searchable_attributes = query.hybrid_parameters.searchableAttributesLexical
914914
else:
915915
searchable_attributes = query.searchable_attributes
916916

917917
if searchable_attributes is not None:
918-
return ' OR '.join([
918+
return "("+' OR '.join([
919919
f'{self._marqo_index.field_map[field].lexical_field_name} contains "{phrase}"'
920920
for field in searchable_attributes
921-
])
921+
])+")"
922922
else:
923923
return f'default contains "{phrase}"'
924924

tests/integ_tests/tensor_search/integ_tests/test_search_combined.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,11 +1021,6 @@ def test_get_lexical_search_term(self):
10211021

10221022
for index, method_name in indexes_to_test:
10231023
with self.subTest(index_type=type(index).__name__):
1024-
# Mock the _get_lexical_contains_term method for StructuredVespaIndex
1025-
if isinstance(index, StructuredVespaIndex):
1026-
index._get_lexical_contains_term = mock.MagicMock(
1027-
side_effect=lambda phrase, query: f'contains("{phrase}")')
1028-
10291024
# Test cases
10301025
test_cases = [
10311026
# Test with score modifiers (should use OR)
@@ -1037,7 +1032,7 @@ def test_get_lexical_search_term(self):
10371032
and_phrases=[],
10381033
score_modifiers=[ScoreModifier(field="field1", weight=1.0, type=ScoreModifierType.Multiply)]
10391034
),
1040-
'contains("term1") OR contains("term2")' if isinstance(index, StructuredVespaIndex)
1035+
'default contains "term1" OR default contains "term2"' if isinstance(index, StructuredVespaIndex)
10411036
else '(default contains "term1" OR default contains "term2")'
10421037
),
10431038
# Test without score modifiers (should use weakAnd)
@@ -1048,7 +1043,7 @@ def test_get_lexical_search_term(self):
10481043
or_phrases=["term1", "term2"],
10491044
and_phrases=[]
10501045
),
1051-
'weakAnd(contains("term1"), contains("term2"))' if isinstance(index, StructuredVespaIndex)
1046+
'weakAnd(default contains "term1", default contains "term2")' if isinstance(index, StructuredVespaIndex)
10521047
else '(weakAnd(default contains "term1", default contains "term2"))'
10531048
),
10541049
# Test with both OR and AND phrases
@@ -1059,7 +1054,8 @@ def test_get_lexical_search_term(self):
10591054
or_phrases=["term1", "term2"],
10601055
and_phrases=["term3", "term4"]
10611056
),
1062-
'(weakAnd(contains("term1"), contains("term2"))) AND (contains("term3") AND contains("term4"))'
1057+
'(weakAnd(default contains "term1", default contains "term2")) AND '
1058+
'(default contains "term3" AND default contains "term4")'
10631059
if isinstance(index, StructuredVespaIndex)
10641060
else '((weakAnd(default contains "term1", default contains "term2")) '
10651061
'AND (default contains "term3" AND default contains "term4"))'

tests/integ_tests/tensor_search/integ_tests/test_search_structured.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,3 +1274,81 @@ def test_special_characters_in_map_score_modifiers(self):
12741274
# Assert that no characters fail
12751275
self.assertEqual(failed_characters, [],
12761276
f"Expected no characters to fail, but got: {failed_characters}")
1277+
1278+
def test_lexical_search_with_required_terms_results(self):
1279+
docs = [
1280+
{
1281+
"_id": "1",
1282+
"text_field_1": "term1 term2 term3 word1 word2",
1283+
"text_field_2": "term3 term4 term5 word3 word4",
1284+
"text_field_3": "term5 term6 term7 word5 word6"
1285+
},
1286+
{
1287+
"_id": "2",
1288+
"text_field_4": "term1 term2",
1289+
"text_field_5": "term3 term4",
1290+
},
1291+
{
1292+
"_id": "3",
1293+
"text_field_1": "term5 term6",
1294+
"text_field_2": "word5",
1295+
},
1296+
{
1297+
"_id": "4",
1298+
"text_field_1": "term7 term8",
1299+
"text_field_2": "word7 word8",
1300+
}
1301+
]
1302+
1303+
test_cases = [
1304+
# ─── Phrase Matching ─────────────────────────────────────────────
1305+
('"term1 term2"', None, ["1", "2"]), # match phrase across fields
1306+
('"term1 term2"', ["text_field_4"], ["2"]), # phrase match restricted to field
1307+
('"term7 term8"', ["text_field_1"], ["4"]), # phrase match in single field
1308+
('"term4 term3"', None, []), # phrase order matters (should not match)
1309+
1310+
# ─── Token Matching ──────────────────────────────────────────────
1311+
('"term1" "term2"', None, ["1", "2"]), # terms in multiple fields
1312+
('"term1" "term2"', ["text_field_1"], ["1"]), # both in same field
1313+
('"term1" "term4"', None, ["1", "2"]), # terms spread across fields
1314+
('"term5" "word5"', None, ["1", "3"]), # match across or within fields
1315+
('"word1" "word6"', None, ["1"]), # span across different fields in one doc
1316+
1317+
# ─── Field Filtering ─────────────────────────────────────────────
1318+
('"term5"', ["text_field_1", "text_field_2"], ["1", "3"]),
1319+
('"term5"', ["text_field_4"], []), # valid term excluded due to field filter
1320+
('"term7"', ["text_field_2"], []), # term exists but outside filtered field
1321+
('"term7"', ["text_field_1"], ["4"]), # valid in allowed field
1322+
1323+
# ─── Matching Logic ──────────────────────────────────────────────
1324+
('"term5 term6 term7 word5"', None, ["1"]), # all terms appear in one doc
1325+
('"term5 term6"', ["text_field_1"], ["3"]), # single field match
1326+
('"term1 TERM2"', None, ["1", "2"]), # mixed case — case-insensitive
1327+
('"term5" "term5"', None, ["1", "3"]), # duplicate terms
1328+
('"term999"', None, []), # nonexistent term
1329+
('"the of and"', None, []), # stopwords
1330+
('', None, []), # empty query
1331+
1332+
# ─── Control Cases ───────────────────────────────────────────────
1333+
('"term1" "term2" word5', None, ["1"]), # combo of phrase + token
1334+
('"word5 term8"', None, []), # required terms in diff docs
1335+
]
1336+
1337+
self.add_documents(
1338+
config=self.config,
1339+
add_docs_params=AddDocsParams(
1340+
index_name=self.default_text_index,
1341+
docs=docs,
1342+
)
1343+
)
1344+
1345+
for query, searchable_attributes, expected_docs_ids in test_cases:
1346+
with self.subTest(f"{query}, {searchable_attributes}, {expected_docs_ids}"):
1347+
res = tensor_search.search(
1348+
config=self.config,
1349+
index_name=self.default_text_index,
1350+
text=query,
1351+
searchable_attributes=searchable_attributes,
1352+
search_method=SearchMethod.LEXICAL
1353+
)
1354+
self.assertEqual(set(expected_docs_ids), {hit["_id"] for hit in res["hits"]})

tests/unit_tests/marqo/core/search/test_search.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,12 +287,10 @@ def test_hybrid_search_with_filter_and_score_modifiers(self):
287287

288288
call_args = self.vespa_client_mock.query.call_args[1]
289289
self.assertEqual(
290-
call_args['marqo__yql.lexical'],
291-
'select * from unstructured_test_schema where (None contains "test" OR None contains "test") AND (((marqo__short_string_fields contains sameElement(key contains "text_field_1", value contains "hadhsd"))))'
290+
'select * from unstructured_test_schema where ((None contains "test" OR None contains "test")) AND (((marqo__short_string_fields contains sameElement(key contains "text_field_1", value contains "hadhsd"))))',
291+
call_args['marqo__yql.lexical']
292292
)
293293

294-
295-
296294
def test_rerank_depth_higher_than_default_ef_search_overrides_it(self):
297295
tensor_search.search(self.config, "index_name", "query", search_method="tensor", rerank_depth=3000)
298296
self.vespa_client_mock.query.assert_called_once()

tests/unit_tests/marqo/core/structured_vespa_index/test_structured_add_documents_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from unit_tests.marqo_test import MarqoTestCase
1111

1212

13-
class TestUnstructuredAddDocumentsHandler(MarqoTestCase):
13+
class TestStructuredAddDocumentsHandler(MarqoTestCase):
1414
IMAGE_URL = 'https://sample.com/abcd.png'
1515
AUDIO_URL = 'https://sample.com/abcd.wav'
1616
VIDEO_URL = 'https://sample.com/abcd.mp4'
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
from magic import Magic
2+
from unittest.mock import MagicMock
3+
4+
from marqo.core.exceptions import AddDocumentsError
5+
from marqo.core.inference.api import Inference, Modality
6+
from marqo.core.inference.tensor_fields_container import TensorField
7+
from marqo.core.models.add_docs_params import AddDocsParams
8+
from marqo.core.models.marqo_index import FieldType
9+
from marqo.core.structured_vespa_index.structured_add_document_handler import StructuredAddDocumentsHandler
10+
from marqo.vespa.vespa_client import VespaClient
11+
from unit_tests.marqo_test import MarqoTestCase
12+
from marqo.core.models.marqo_index import *
13+
from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex
14+
from marqo.core.models.marqo_query import MarqoLexicalQuery
15+
16+
17+
18+
class TestStructuredIndexBuildLexicalSearchQuery(MarqoTestCase):
19+
"""
20+
Test the build_lexical_search_query method of StructuredVespaIndex.
21+
Note that this method is also used in semi-structured Vespa index.
22+
"""
23+
@classmethod
24+
def setUpClass(cls) -> None:
25+
super().setUpClass()
26+
cls.maxDiff = None
27+
cls.structured_vespa_index = StructuredVespaIndex(
28+
marqo_index=cls.structured_marqo_index(
29+
name='index1',
30+
schema_name='index1',
31+
fields=[
32+
Field(name='lexical_field_1', type=FieldType.Text, features=[FieldFeature.LexicalSearch],
33+
lexical_field_name='marqo__lexical_lexical_field_1'),
34+
Field(name='lexical_field_2', type=FieldType.Text, features=[FieldFeature.LexicalSearch],
35+
lexical_field_name='marqo__lexical_lexical_field_2'),
36+
Field(name='lexical_field_3', type=FieldType.Text, features=[FieldFeature.LexicalSearch],
37+
lexical_field_name='marqo__lexical_lexical_field_3'),
38+
Field(name='lexical_field_4', type=FieldType.Text, features=[FieldFeature.LexicalSearch],
39+
lexical_field_name='marqo__lexical_lexical_field_4')
40+
],
41+
tensor_fields=[]
42+
)
43+
)
44+
45+
def _help_create_test_lexical_query_object(self, or_phrases: List[str], and_phrases: List[str],
46+
searchable_attributes: List[str] = None) -> MarqoLexicalQuery:
47+
"""
48+
Helper function to create a MarqoLexicalQuery object with the given parameters.
49+
"""
50+
return MarqoLexicalQuery(
51+
index_name='index1',
52+
limit=10,
53+
offset=0,
54+
searchable_attributes=searchable_attributes,
55+
or_phrases=or_phrases,
56+
and_phrases=and_phrases
57+
)
58+
59+
def test_generate_and_terms_for_lexical_search(self):
60+
"""Test to ensure that the AND terms are correctly built when searchable attributes are specified.
61+
"""
62+
test_cases = [
63+
(
64+
["required_1"], ['lexical_field_1'],
65+
'(marqo__lexical_lexical_field_1 contains "required_1")',
66+
"Simple 1 required term, 1 field"
67+
),
68+
(
69+
["required_1", "required_2"], ['lexical_field_1'],
70+
'(marqo__lexical_lexical_field_1 contains "required_1") AND '
71+
'(marqo__lexical_lexical_field_1 contains "required_2")',
72+
"2 required terms, 1 field"
73+
),
74+
(
75+
["required_1", "required_2"], ['lexical_field_1', 'lexical_field_2'],
76+
'(marqo__lexical_lexical_field_1 contains "required_1" '
77+
'OR marqo__lexical_lexical_field_2 contains "required_1") AND '
78+
'(marqo__lexical_lexical_field_1 contains "required_2" '
79+
'OR marqo__lexical_lexical_field_2 contains "required_2")',
80+
"2 required terms, 2 fields"
81+
),
82+
(
83+
["long required phrase", "short required phrase"], ['lexical_field_1', 'lexical_field_4'],
84+
'(marqo__lexical_lexical_field_1 contains "long required phrase" '
85+
'OR marqo__lexical_lexical_field_4 contains "long required phrase") AND '
86+
'(marqo__lexical_lexical_field_1 contains "short required phrase" OR '
87+
'marqo__lexical_lexical_field_4 contains "short required phrase")',
88+
"2 multiple words required phrases, 2 fields"
89+
),
90+
(
91+
["term1", "term2"], None,
92+
'default contains "term1" AND default contains "term2"',
93+
"2 required terms, no fields"
94+
),
95+
(
96+
["term1", "term2", "term3", "term4"], ['lexical_field_1', 'lexical_field_2', 'lexical_field_3'],
97+
'(marqo__lexical_lexical_field_1 contains "term1" OR marqo__lexical_lexical_field_2 contains "term1" '
98+
'OR marqo__lexical_lexical_field_3 contains "term1") AND (marqo__lexical_lexical_field_1 contains "term2" '
99+
'OR marqo__lexical_lexical_field_2 contains "term2" OR marqo__lexical_lexical_field_3 contains "term2") AND '
100+
'(marqo__lexical_lexical_field_1 contains "term3" OR marqo__lexical_lexical_field_2 contains "term3" OR '
101+
'marqo__lexical_lexical_field_3 contains "term3") AND (marqo__lexical_lexical_field_1 contains "term4" '
102+
'OR marqo__lexical_lexical_field_2 contains "term4" OR marqo__lexical_lexical_field_3 contains "term4")',
103+
"4 required terms, 3 fields"
104+
)
105+
]
106+
for required_phrases, searchable_attributes, expected_and_terms, msg in test_cases:
107+
with self.subTest(f"{msg}"):
108+
test_marqo_query = self._help_create_test_lexical_query_object(
109+
or_phrases=[],
110+
and_phrases=required_phrases,
111+
searchable_attributes=searchable_attributes
112+
)
113+
generated_and_terms = self.structured_vespa_index._get_lexical_search_term(
114+
test_marqo_query
115+
)
116+
self.assertEqual(generated_and_terms, expected_and_terms)

0 commit comments

Comments
 (0)