Add more unit tests

farshidz · farshidz · commit 1238cf0e530e · 2025-06-02T17:23:18.000+10:00
diff --git a/.cursor/rules/run-tests.mdc b/.cursor/rules/run-tests.mdc
@@ -9,4 +9,4 @@ alwaysApply: true
 - If running integ or API tests, make sure Vespa is running vis docker ps. If not running, use 
 python scripts/vespa_local/vespa_local.py full_start to run Vespa first.
 - To run API tests, first run Marqo API in one process by running src/marqo/tensor_search/api.py using PYTHONPATH=./src MARQO_ENABLE_BATCH_APIS=true MARQO_MODE=COMBINED MARQO_MODELS_TO_PRELOAD="[]". While the API is running, run API tests via pytest using PYTHONPATH=./tests/api_tests/v1/tests/api_tests . If Marqo API fails to run, stop. Terminate Marqo API when done.
-- Unit tests most follow the same package hierarchy as the code they test.
+- Unit tests must follow the same package hierarchy as the code they test.
diff --git a/tests/unit_tests/marqo/core/structured_vespa_index/test_structured_vespa_index_to_vespa_query.py b/tests/unit_tests/marqo/core/structured_vespa_index/test_structured_vespa_index_to_vespa_query.py
@@ -0,0 +1,156 @@
+import time
+import unittest
+from typing import List, Optional
+
+from marqo.core.models.marqo_query import MarqoTensorQuery, MarqoHybridQuery
+from marqo.core.models.marqo_index import (
+    StructuredMarqoIndex, Model, TextPreProcessing, TextSplitMethod,
+    ImagePreProcessing, HnswConfig, DistanceMetric, Field, FieldType,
+    FieldFeature, TensorField
+)
+from marqo.core.models.hybrid_parameters import (
+    HybridParameters, RankingMethod, RetrievalMethod
+)
+from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex
+
+
+class TestStructuredVespaIndexToVespaQuery(unittest.TestCase):
+
+    def setUp(self):
+        """Set up test fixtures with a structured index that supports both tensor and lexical search."""
+        # Create a structured index with both tensor and lexical fields
+        marqo_index = self._create_structured_marqo_index(
+            name='test_index',
+            text_field_names=['title', 'description'], 
+            tensor_field_names=['title', 'description']
+        )
+        self.vespa_index = StructuredVespaIndex(marqo_index)
+
+    def _create_structured_marqo_index(
+        self, 
+        name: str,
+        text_field_names: List[str] = [],
+        tensor_field_names: List[str] = []
+    ) -> StructuredMarqoIndex:
+        """Helper method to create a structured Marqo index for testing."""
+        fields = []
+        
+        # Add text fields with lexical search and filter capabilities
+        for field_name in text_field_names:
+            fields.append(
+                Field(
+                    name=field_name,
+                    type=FieldType.Text,
+                    features=[FieldFeature.LexicalSearch, FieldFeature.Filter],
+                    lexical_field_name=f'{field_name}_lexical',
+                    filter_field_name=f'{field_name}_filter'
+                )
+            )
+
+        # Add tensor fields
+        tensor_fields = []
+        for field_name in tensor_field_names:
+            tensor_fields.append(
+                TensorField(
+                    name=field_name,
+                    embeddings_field_name=f'{field_name}_embeddings',
+                    chunk_field_name=f'{field_name}_chunks'
+                )
+            )
+
+        return StructuredMarqoIndex(
+            name=name,
+            schema_name=name,
+            model=Model(name='hf/all_datasets_v4_MiniLM-L6'),
+            normalize_embeddings=True,
+            distance_metric=DistanceMetric.Angular,
+            vector_numeric_type='float',
+            hnsw_config=HnswConfig(ef_construction=100, m=16),
+            marqo_version='2.12.0',  # Version that supports hybrid search
+            created_at=time.time(),
+            updated_at=time.time(),
+            fields=fields,
+            tensor_fields=tensor_fields,
+            text_preprocessing=TextPreProcessing(
+                split_length=2,
+                split_overlap=0,
+                split_method=TextSplitMethod.Sentence
+            ),
+            image_preprocessing=ImagePreProcessing(
+                patch_method=None
+            )
+        )
+
+    def test_to_vespa_query_tensor_mode_approximate_threshold(self):
+        """Test that to_vespa_query correctly sets approximate threshold for tensor queries."""
+        threshold_values = [0.75, 0.85, 0.95, None]
+        
+        for threshold in threshold_values:
+            with self.subTest(approximate_threshold=threshold):
+                marqo_query = MarqoTensorQuery(
+                    index_name='test_index',
+                    limit=10,
+                    offset=0,
+                    vector_query=[0.1, 0.2, 0.3, 0.4],
+                    approximate_threshold=threshold,
+                    approximate=True
+                )
+
+                vespa_query = self.vespa_index.to_vespa_query(marqo_query)
+
+                if threshold is not None:
+                    # Verify approximate threshold is set correctly
+                    self.assertEqual(vespa_query['ranking.matching.approximateThreshold'], threshold)
+                else:
+                    # When threshold is None, it should not be included in the query
+                    self.assertNotIn('ranking.matching.approximateThreshold', vespa_query)
+                
+                # Verify other key fields are present
+                self.assertIn('yql', vespa_query)
+                self.assertIn('ranking', vespa_query)
+                self.assertEqual(vespa_query['hits'], 10)
+
+    def test_to_vespa_query_hybrid_mode_approximate_threshold(self):
+        """Test that to_vespa_query correctly sets approximate threshold for hybrid queries."""
+        threshold_values = [0.70, 0.80, 0.90, None]
+        
+        for threshold in threshold_values:
+            with self.subTest(approximate_threshold=threshold):
+                hybrid_parameters = HybridParameters(
+                    retrievalMethod=RetrievalMethod.Disjunction,
+                    rankingMethod=RankingMethod.RRF,
+                    alpha=0.7,
+                    rrfK=100
+                )
+                
+                marqo_query = MarqoHybridQuery(
+                    index_name='test_index',
+                    limit=15,
+                    offset=0,
+                    vector_query=[0.2, 0.3, 0.4, 0.5],
+                    or_phrases=['search', 'query'],
+                    and_phrases=['required'],
+                    hybrid_parameters=hybrid_parameters,
+                    approximate_threshold=threshold,
+                    approximate=True
+                )
+
+                vespa_query = self.vespa_index.to_vespa_query(marqo_query)
+
+                if threshold is not None:
+                    # Verify approximate threshold is set correctly
+                    self.assertEqual(vespa_query['ranking.matching.approximateThreshold'], threshold)
+                else:
+                    # When threshold is None, it should not be included in the query
+                    self.assertNotIn('ranking.matching.approximateThreshold', vespa_query)
+                
+                # Verify hybrid-specific fields are present
+                self.assertEqual(vespa_query['hits'], 15)
+                self.assertIn('searchChain', vespa_query)
+                self.assertEqual(vespa_query['searchChain'], 'marqo')
+                self.assertIn('marqo__hybrid.retrievalMethod', vespa_query)
+                self.assertIn('marqo__hybrid.rankingMethod', vespa_query)
+
+
+if __name__ == '__main__':
+    unittest.main() 
diff --git a/tests/unit_tests/marqo/tensor_search/test_api_models.py b/tests/unit_tests/marqo/tensor_search/test_api_models.py
@@ -0,0 +1,208 @@
+import unittest
+from pydantic.v1 import ValidationError
+
+from marqo.tensor_search.models.api_models import SearchQuery, CustomVectorQuery
+from marqo.tensor_search.enums import SearchMethod
+from marqo.core.models.hybrid_parameters import HybridParameters, RankingMethod, RetrievalMethod
+from marqo.core.models.facets_parameters import FacetsParameters, FieldFacetsConfiguration
+from marqo.tensor_search.models.search import SearchContext, SearchContextTensor
+
+
+class TestSearchQuery(unittest.TestCase):
+
+    def test_search_query_with_all_parameters(self):
+        """Test SearchQuery creation with all parameters set to valid values."""
+        custom_vector_query = CustomVectorQuery(
+            customVector=CustomVectorQuery.CustomVector(
+                content="test content",
+                vector=[0.1, 0.2, 0.3, 0.4]
+            )
+        )
+        
+        hybrid_parameters = HybridParameters(
+            retrievalMethod=RetrievalMethod.Disjunction,
+            rankingMethod=RankingMethod.RRF,
+            alpha=0.7,
+            rrfK=100
+        )
+        
+        facets = FacetsParameters(
+            fields={
+                "category": FieldFacetsConfiguration(type="string", maxResults=10)
+            }
+        )
+        
+        context = SearchContext(
+            tensor=[SearchContextTensor(vector=[0.1, 0.2], weight=1.0)]
+        )
+        
+        search_query = SearchQuery(
+            q=custom_vector_query,
+            searchableAttributes=["title", "description"],
+            searchMethod=SearchMethod.HYBRID,
+            limit=20,
+            offset=5,
+            rerankDepth=100,
+            efSearch=200,
+            approximate=True,
+            approximateThreshold=0.85,
+            showHighlights=False,
+            reRanker="test_reranker",
+            filter="category:electronics",
+            attributesToRetrieve=["title", "price"],
+            boost={"title": 1.5},
+            mediaDownloadHeaders={"Authorization": "Bearer token"},
+            context=context,
+            textQueryPrefix="search:",
+            hybridParameters=hybrid_parameters,
+            facets=facets,
+            trackTotalHits=True
+        )
+        
+        # Verify key attributes
+        self.assertEqual(search_query.searchMethod, SearchMethod.HYBRID)
+        self.assertEqual(search_query.limit, 20)
+        self.assertEqual(search_query.approximateThreshold, 0.85)
+        self.assertIsNotNone(search_query.hybridParameters)
+        self.assertIsNotNone(search_query.facets)
+
+    def test_search_query_required_parameters_only(self):
+        """Test SearchQuery with only required parameters."""
+        # For tensor search, either q or context is required
+        search_query = SearchQuery(
+            q="test query",
+            searchMethod=SearchMethod.TENSOR
+        )
+        
+        # Verify defaults
+        self.assertEqual(search_query.searchMethod, SearchMethod.TENSOR)
+        self.assertEqual(search_query.limit, 10)
+        self.assertEqual(search_query.offset, 0)
+        self.assertTrue(search_query.showHighlights)
+        self.assertIsNone(search_query.hybridParameters)
+
+    def test_hybrid_parameters_validation(self):
+        """Test that hybrid parameters are only allowed for hybrid search."""
+        hybrid_parameters = HybridParameters(
+            retrievalMethod=RetrievalMethod.Disjunction,
+            rankingMethod=RankingMethod.RRF
+        )
+        
+        # Should fail for tensor search
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                hybridParameters=hybrid_parameters
+            )
+        self.assertIn("Hybrid parameters can only be provided for 'HYBRID' search", str(cm.exception))
+
+    def test_facets_validation(self):
+        """Test that facets are only allowed for hybrid search."""
+        facets = FacetsParameters(
+            fields={"category": FieldFacetsConfiguration(type="string")}
+        )
+        
+        # Should fail for tensor search
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                facets=facets
+            )
+        self.assertIn("Facets can only be provided for 'HYBRID' search", str(cm.exception))
+
+    def test_track_total_hits_validation(self):
+        """Test that trackTotalHits is only allowed for hybrid search."""
+        # Should fail for tensor search
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                trackTotalHits=True
+            )
+        self.assertIn("trackTotalHits can only be provided for 'HYBRID' search", str(cm.exception))
+
+    def test_approximate_threshold_validation(self):
+        """Test approximate threshold validation."""
+        # Should fail for lexical search
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.LEXICAL,
+                approximateThreshold=0.5
+            )
+        self.assertIn("'approximateThreshold' is only valid for 'HYBRID' and 'TENSOR' search methods", str(cm.exception))
+        
+        # Should fail when approximate=False
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                approximate=False,
+                approximateThreshold=0.5
+            )
+        self.assertIn("'approximateThreshold' cannot be set when 'approximate' is False", str(cm.exception))
+        
+        # Should fail for invalid range
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                approximateThreshold=1.5
+            )
+        self.assertIn("'approximateThreshold' must be between 0 and 1", str(cm.exception))
+
+    def test_query_and_context_validation(self):
+        """Test validation of query and context requirements."""
+        # Lexical search requires query
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(searchMethod=SearchMethod.LEXICAL)
+        self.assertIn("Query(q) is required for lexical search", str(cm.exception))
+        
+        # Tensor search requires either query or context
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(searchMethod=SearchMethod.TENSOR)
+        self.assertIn("One of Query(q) or context is required for TENSOR search", str(cm.exception))
+
+    def test_rerank_depth_validation(self):
+        """Test rerank depth validation."""
+        # Should fail for lexical search
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.LEXICAL,
+                rerankDepth=10
+            )
+        self.assertIn("'rerankDepth' is currently not supported for 'LEXICAL' search method", str(cm.exception))
+        
+        # Should fail for negative values
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                searchMethod=SearchMethod.TENSOR,
+                rerankDepth=-1
+            )
+        self.assertIn("rerankDepth cannot be negative", str(cm.exception))
+
+    def test_image_download_headers_validation(self):
+        """Test validation of image download headers."""
+        # Should fail when both headers are set
+        with self.assertRaises(ValidationError) as cm:
+            SearchQuery(
+                q="test",
+                image_download_headers={"header1": "value1"},
+                mediaDownloadHeaders={"header2": "value2"}
+            )
+        self.assertIn("Cannot set both imageDownloadHeaders", str(cm.exception))
+        
+        # Should work when imageDownloadHeaders is set and mediaDownloadHeaders is copied
+        search_query = SearchQuery(
+            q="test",
+            image_download_headers={"Authorization": "Bearer token"}
+        )
+        self.assertEqual(search_query.mediaDownloadHeaders, {"Authorization": "Bearer token"})
+
+
+if __name__ == '__main__':
+    unittest.main()