Skip to content

Commit 2f7417b

Browse files
authored
Implement configurable stemming (#1273)
Implement configurable text field stemming. This implementation is similar to language. Stemming can be configured per text field (and only for text fields). It's configured in the Vespa schema. Stemming cannot be changed once the field has been created. Unlike language, search API does not support changing stemming. It's picked by Vespa based on the fields being searched. Note only full text search is stemmed. Filtering isn't affected.
1 parent 35fb501 commit 2f7417b

File tree

19 files changed

+1518
-193
lines changed

19 files changed

+1518
-193
lines changed

scripts/vespa_local/vespa_local.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ def get_services_xml_content(self) -> str:
157157
<search/>
158158
<nodes>
159159
<node hostalias="node1"/>
160+
<jvm options="-Xms32M -Xmx256M -agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005" />
160161
</nodes>
161162
</container>
162163
<content id="content_default" version="1.0">

src/marqo/core/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
MARQO_RERANK_DEPTH_MINIMUM_VERSION = semver.VersionInfo.parse('2.15.0')
2727
MARQO_SORT_BY_MINIMUM_VERSION = semver.VersionInfo.parse('2.22.0')
2828
MARQO_LANGUAGE_MINIMUM_VERSION = semver.VersionInfo.parse('2.16.0')
29+
MARQO_STEMMING_MINIMUM_VERSION = semver.VersionInfo.parse('2.16.0')
2930
MARQO_PARTIAL_UPDATE_MINIMUM_VERSION = semver.VersionInfo.parse('2.16.0')
3031

3132
# For score modifiers
@@ -37,4 +38,4 @@
3738
QUERY_INPUT_SCORE_MODIFIERS_ADD_WEIGHTS_TENSOR = 'marqo__add_weights_tensor'
3839
QUERY_INPUT_SCORE_MODIFIERS_MULT_WEIGHTS_GLOBAL = 'marqo__mult_weights_global'
3940
QUERY_INPUT_SCORE_MODIFIERS_ADD_WEIGHTS_GLOBAL = 'marqo__add_weights_global'
40-
MARQO_GLOBAL_SCORE_MODIFIERS = 'global'
41+
MARQO_GLOBAL_SCORE_MODIFIERS = 'global'

src/marqo/core/index_management/index_management.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -219,24 +219,25 @@ def update_index(self, marqo_index: SemiStructuredMarqoIndex) -> None:
219219
OperationConflictError: If another index creation/deletion operation is
220220
in progress and the lock cannot be acquired
221221
"""
222-
with self._vespa_deployment_lock():
223-
existing_index = self.get_index(marqo_index.name)
224-
if not isinstance(existing_index, SemiStructuredMarqoIndex):
225-
# This is just a sanity check, it should not happen since we do not expose this method to end user.
226-
raise InternalError(f'Index {marqo_index.name} created by Marqo version {marqo_index.marqo_version} '
227-
f'can not be updated.')
228-
229-
def is_subset(dict_a, dict_b):
230-
# check if dict_a is a subset of dict_b
231-
return all(k in dict_b and dict_b[k] == v for k, v in dict_a.items())
232-
233-
if (is_subset(marqo_index.tensor_field_map, existing_index.tensor_field_map) and
234-
is_subset(marqo_index.field_map, existing_index.field_map) and
235-
is_subset(marqo_index.name_to_string_array_field_map, existing_index.name_to_string_array_field_map)):
236-
logger.debug(f'Another thread has updated the index {marqo_index.name} already.')
237-
return
222+
existing_index = self.get_index(marqo_index.name)
223+
if not isinstance(existing_index, SemiStructuredMarqoIndex):
224+
# This is just a sanity check, it should not happen since we do not expose this method to end user.
225+
raise InternalError(f'Index {marqo_index.name} created by Marqo version {marqo_index.marqo_version} '
226+
f'can not be updated.')
227+
228+
def is_subset(dict_a, dict_b):
229+
# check if dict_a is a subset of dict_b
230+
return all(k in dict_b and dict_b[k] == v for k, v in dict_a.items())
231+
232+
if (is_subset(marqo_index.tensor_field_map, existing_index.tensor_field_map) and
233+
is_subset(marqo_index.field_map, existing_index.field_map) and
234+
is_subset(marqo_index.name_to_string_array_field_map, existing_index.name_to_string_array_field_map)):
235+
logger.debug(f'Another thread has updated the index {marqo_index.name} already.')
236+
return
238237

238+
with self._vespa_deployment_lock():
239239
schema = SemiStructuredVespaSchema.generate_vespa_schema(marqo_index)
240+
logger.debug(f'Updating index {marqo_index.name} with schema:\n{schema}')
240241
self._get_vespa_application().update_index_setting_and_schema(marqo_index, schema)
241242

242243
def _get_existing_indexes(self) -> List[MarqoIndex]:

src/marqo/core/models/marqo_index.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,13 @@ class FieldFeature(Enum):
7373
Filter = 'filter'
7474

7575

76+
class Stemming(str, Enum):
77+
None_ = 'none'
78+
Best = 'best'
79+
Shortest = 'shortest'
80+
Multiple = 'multiple'
81+
82+
7683
class DistanceMetric(Enum):
7784
Euclidean = 'euclidean'
7885
Angular = 'angular'
@@ -105,6 +112,7 @@ class Field(ImmutableStrictBaseModel):
105112
filter_field_name: Optional[str]
106113
dependent_fields: Optional[Dict[str, float]]
107114
language: Optional[str] = None
115+
stemming: Optional[Stemming] = None
108116

109117
@root_validator
110118
def check_all_fields(cls, values):
@@ -660,6 +668,15 @@ def index_supports_language(self) -> bool:
660668
'index_supports_language',
661669
lambda: self.parsed_marqo_version() >= constants.MARQO_LANGUAGE_MINIMUM_VERSION)
662670

671+
@property
672+
def index_supports_stemming(self) -> bool:
673+
"""
674+
Check if the index supports stemming.
675+
"""
676+
return self._cache_or_get(
677+
'index_supports_stemming',
678+
lambda: self.parsed_marqo_version() >= constants.MARQO_STEMMING_MINIMUM_VERSION)
679+
663680
@property
664681
def index_supports_sorty_by(self) -> bool:
665682
"""
@@ -733,6 +750,7 @@ def validate_structured_field(values, marqo_index: bool) -> None:
733750
type: FieldType = values['type']
734751
features: List[FieldFeature] = values['features']
735752
language: str = values.get('language')
753+
stemming: str = values.get('stemming')
736754
dependent_fields: Optional[Dict[str, float]] = values['dependent_fields']
737755

738756
validate_field_name(name)
@@ -762,6 +780,12 @@ def validate_structured_field(values, marqo_index: bool) -> None:
762780
f'feature is present'
763781
)
764782

783+
if stemming is not None and FieldFeature.LexicalSearch not in features:
784+
raise ValueError(
785+
f'{name}: stemming can only be populated when {FieldFeature.LexicalSearch.value} '
786+
f'feature is present'
787+
)
788+
765789
if FieldFeature.ScoreModifier in features and type not in [FieldType.Float, FieldType.Int,
766790
FieldType.Double, FieldType.MapFloat,
767791
FieldType.MapInt, FieldType.MapDouble,

src/marqo/core/semi_structured_vespa_index/semi_structured_add_document_handler.py

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Dict, Any
1+
from typing import Dict, Any, Optional
22

33
import pydantic.v1 as pydantic
44

@@ -98,7 +98,8 @@ def _handle_field(self, marqo_doc, field_name, field_content):
9898
# Add lexical field if content is a string
9999
if isinstance(marqo_doc[field_name], str):
100100
language = self._get_field_language(field_name)
101-
self._add_lexical_field_to_index(field_name, language)
101+
stemming = self._get_field_stemming(field_name)
102+
self._add_lexical_field_to_index(field_name, language, stemming)
102103

103104
# Add string array field if content is list of strings and index version supports it
104105
is_string_array = (
@@ -145,27 +146,53 @@ def _get_field_language(self, field_name):
145146
return None
146147

147148
if field_mapping.get('type') == 'text_field':
148-
if not self.marqo_index.index_supports_language:
149+
language = field_mapping.get('language')
150+
if language is not None and not self.marqo_index.index_supports_language:
149151
raise AddDocumentsError(
150152
f'Language is only supported for indexes created with Marqo version '
151153
f'{constants.MARQO_LANGUAGE_MINIMUM_VERSION} or later. This index was created with '
152154
f'Marqo {self.marqo_index.marqo_version}.'
153155
)
154-
return field_mapping.get('language')
156+
return language
155157

156158
return None
157159

160+
def _get_field_stemming(self, field_name) -> Optional[str]:
161+
"""Extract stemming specification for a field from mappings and validate."""
162+
if not self.add_docs_params.mappings:
163+
return None
158164

159-
def _add_lexical_field_to_index(self, field_name, language=None):
165+
field_mapping = self.add_docs_params.mappings.get(field_name)
166+
if not field_mapping:
167+
return None
168+
169+
if field_mapping.get('type') == 'text_field':
170+
stemming = field_mapping.get('stemming')
171+
if stemming is not None and not self.marqo_index.index_supports_stemming:
172+
raise AddDocumentsError(
173+
f'Stemming is only supported for indexes created with Marqo version '
174+
f'{constants.MARQO_STEMMING_MINIMUM_VERSION} or later. This index was created with '
175+
f'Marqo {self.marqo_index.marqo_version}.'
176+
)
177+
return stemming
178+
179+
return None
180+
181+
def _add_lexical_field_to_index(self, field_name, language=None, stemming=None):
160182
if field_name in self.marqo_index.field_map:
161-
if language is not None:
162-
existing_field = self.marqo_index.field_map[field_name]
163-
if existing_field.language != language:
164-
raise AddDocumentsError(
165-
f"Field '{field_name}' already exists with a different language configuration. "
166-
f"Cannot change language from '{existing_field.language}' to '{language}' "
167-
f"for existing field."
168-
)
183+
existing_field = self.marqo_index.field_map[field_name]
184+
if language is not None and existing_field.language != language:
185+
raise AddDocumentsError(
186+
f"Field '{field_name}' already exists with a different language configuration. "
187+
f"Cannot change language from '{existing_field.language}' to '{language}' "
188+
f"for existing field."
189+
)
190+
if stemming is not None and existing_field.stemming != stemming:
191+
raise AddDocumentsError(
192+
f"Field '{field_name}' already exists with a different stemming configuration. "
193+
f"Cannot change stemming from '{existing_field.stemming}' to '{stemming}' "
194+
f"for existing field."
195+
)
169196
return
170197

171198
max_lexical_field_count = self.field_count_config.max_lexical_field_count
@@ -176,14 +203,18 @@ def _add_lexical_field_to_index(self, field_name, language=None):
176203
f'limit in MARQO_MAX_LEXICAL_FIELD_COUNT_UNSTRUCTURED environment variable.')
177204

178205
# Add missing lexical fields to marqo index
179-
logger.debug(f'Adding lexical field {field_name} to index {self.marqo_index.name}' +
180-
(f' with language {language}' if language else ''))
206+
debug_parts = [f'Adding lexical field {field_name} to index {self.marqo_index.name}']
207+
if language:
208+
debug_parts.append(f'with language {language}')
209+
if stemming:
210+
debug_parts.append(f'with stemming {stemming}')
211+
logger.debug(' '.join(debug_parts))
181212

182213
self.marqo_index.lexical_fields.append(
183214
Field(name=field_name, type=FieldType.Text,
184215
features=[FieldFeature.LexicalSearch],
185216
lexical_field_name=f'{SemiStructuredVespaSchema.FIELD_INDEX_PREFIX}{field_name}',
186-
language=language)
217+
language=language, stemming=stemming)
187218
)
188219
self.marqo_index.clear_cache()
189220
self.should_update_index = True

src/marqo/core/semi_structured_vespa_index/semi_structured_vespa_schema_template_2_16.sd.jinja2

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ schema {{ index.schema_name }} {
9090
indexing: index | summary
9191
{%- endif %}
9292
index: enable-bm25
93+
{%- if lexical_field.stemming %}
94+
stemming: {{ lexical_field.stemming.value }}
95+
{%- endif %}
9396
}
9497
{% endfor -%}
9598

src/marqo/tensor_search/models/mappings_object.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,16 @@
7070
"language": {
7171
"type": "string",
7272
"minLength": 1
73+
},
74+
"stemming": {
75+
"type": "string",
76+
"enum": ["none", "best", "shortest", "multiple"]
7377
}
7478
},
75-
"required": ["type", "language"],
79+
"required": ["type"],
80+
"anyOf": [
81+
{"required": ["language"]},
82+
{"required": ["stemming"]}
83+
],
7684
"additionalProperties": False
7785
}

tests/api_tests/v1/tests/api_tests/test_language.py

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,22 @@ def populate_index(self):
3333
"_id": "1",
3434
"title1": "Vestido Mole Perfeito", # Portuguese
3535
"title2": "Collections de livres", # French
36-
"title3": "White dog" # English
36+
"title3": "White dog", # English,
37+
"size": "M"
3738
},
3839
{
3940
"_id": "2",
4041
"title1": "Vestido Mole Confortável",
4142
"title2": "collections art francais",
42-
"title3": "black cat"
43+
"title3": "black cat",
44+
"size": "M"
4345
},
4446
{
4547
"_id": "3",
4648
"title1": "Vestido Leve e Elegante",
4749
"title2": "mes collections Preferees",
48-
"title3": "blue sky"
50+
"title3": "blue sky",
51+
"size": "S"
4952
}
5053
]
5154

@@ -175,6 +178,43 @@ def test_field_language_override(self):
175178
self.assertEqual(["1"], hits_pt, "Should find only the Portuguese doc")
176179
self.assertEqual(["2"], hits_en, "Should find only the English doc")
177180

181+
def test_facets_and_relevance_cutoff(self):
182+
self.populate_index()
183+
184+
cases = [
185+
("pt", True),
186+
("en", False)
187+
]
188+
189+
for language, matches in cases:
190+
with self.subTest(f"Testing facets and relevance cutoff for language: {language}"):
191+
res = self.client.index(self.multilingual_index_name).search(
192+
q="mole",
193+
search_method="HYBRID",
194+
language=language,
195+
hybrid_parameters={
196+
'searchableAttributesLexical': ['title1'],
197+
'retrievalMethod': 'lexical',
198+
'rankingMethod': 'lexical',
199+
}, # Use lexical retrieval so that facets don't get tensor hits
200+
facets={
201+
"fields": {
202+
"size": {"type": "string"}
203+
}
204+
}, relevance_cutoff={"method": "mean_std_dev", "parameters": {"stdDevFactor": 1.2}},
205+
)
206+
207+
if matches:
208+
self.assertGreater(len(res["hits"]), 0, "Should find matches for 'mole'")
209+
self.assertGreater(len(res['facets']['size']), 0, "Should have facets for 'size'")
210+
self.assertGreater(res["_relevantCandidates"], 0,
211+
"Should have relevant candidates count greater than 0")
212+
else:
213+
self.assertEqual(len(res["hits"]), 0, "Should find no matches for 'mole' in English")
214+
self.assertEqual(len(res['facets']), 0, "Should have no facets for 'size' in English")
215+
self.assertEqual(res["_relevantCandidates"], 0,
216+
"Should have no relevant candidates count in English")
217+
178218
def test_tensor_search_with_language_error(self):
179219
"""Test that specifying language for tensor search raises an error."""
180220
self.populate_index()

0 commit comments

Comments
 (0)