marqo-ai · farshidz · Jun 4, 2025 · May 23, 2025 · May 23, 2025 · May 23, 2025
diff --git a/.cursor/rules/run-tests.mdc b/.cursor/rules/run-tests.mdc
@@ -0,0 +1,12 @@
+---
+description: 
+globs: 
+alwaysApply: true
+---
+- Unit tests are in ./tests/unit_tests, integ tests are in ./tests/integ_tests, API tests are in ./tests/api_tests/v1/tests/api_tests
+- If you add new tests, make sure to run them and verify they pass.
+- To run unit and integ tests, make sure working directory is repo root and set PYTHONPATH=./src.
+- If running integ or API tests, first check that Vespa is running using `docker ps`. If it is not running, start it with
+`python scripts/vespa_local/vespa_local.py full_start` before running the tests.
+- To run API tests, first run Marqo API in one process by running src/marqo/tensor_search/api.py using PYTHONPATH=./src MARQO_ENABLE_BATCH_APIS=true MARQO_MODE=COMBINED MARQO_MODELS_TO_PRELOAD="[]". While the API is running, run API tests via pytest using PYTHONPATH=./tests/api_tests/v1/tests/api_tests . If Marqo API fails to run, stop. Terminate Marqo API when done.
+- Unit tests must follow the same package hierarchy as the code they test.
diff --git a/.github/workflows/run_required_checks_coverage.yml b/.github/workflows/run_required_checks_coverage.yml
@@ -92,7 +92,7 @@ jobs:
           coverage html --directory=../integ_coverage_data/htmlcov
           coverage report -m > ../integ_coverage_data/coverage.txt
 
-      - name: Check diff unit test coverage (95% threshold)
+      - name: Check diff unit test coverage (80% threshold)
         id: unit_diff_coverage
         continue-on-error: true
         run: |
@@ -106,7 +106,7 @@ jobs:
 
             diff-cover ../unit_coverage_data/coverage.xml --html-report ../unit_coverage_data/diff_cov.html \
               --markdown-report ../unit_coverage_data/diff_cov.md \
-              --compare-branch $BASE_BRANCH --fail-under=95
+              --compare-branch $BASE_BRANCH --fail-under=80
           else
             echo "Skipping diff-cover on push events"
             echo "Skipped diff-cover on push event" > ../unit_coverage_data/diff_cov.md
@@ -133,6 +133,26 @@ jobs:
             touch ../integ_coverage_data/diff_cov.html
           fi
 
+      - name: Check diff combined test coverage (95% threshold)
+        id: combined_diff_coverage
+        continue-on-error: true
+        run: |
+          cd marqo
+          if [[ "${GITHUB_EVENT_NAME}" == "pull_request" ]]; then
+            export PYTHONPATH="."
+            export BASE_BRANCH="${{ github.event.pull_request.base.ref }}"
+            echo "Running diff-cover against branch $BASE_BRANCH"
+            git fetch origin $BASE_BRANCH:$BASE_BRANCH
+
+            diff-cover ../coverage_data/coverage.xml --html-report ../coverage_data/diff_cov.html \
+              --markdown-report ../coverage_data/diff_cov.md \
+              --compare-branch $BASE_BRANCH --fail-under=95
+          else
+            echo "Skipping diff-cover on push events"
+            echo "Skipped diff-cover on push event" > ../coverage_data/diff_cov.md
+            touch ../coverage_data/diff_cov.html
+          fi
+
       - name: Check overall integration test coverage (78% threshold)
         id: integ_overall_coverage
         continue-on-error: true
@@ -196,38 +216,43 @@ jobs:
             echo "- \`combined-test-coverage\`, \`integration-test-coverage\`, \`unit-test-coverage\`: Coverage and diff coverage reports."
             echo "- \`marqo-integ-tests-report-*\`, \`marqo-largemodel-tests-report-*\`, \`marqo-unit-tests-report\`: pytest output and reports."
           } >> $GITHUB_STEP_SUMMARY
-
-          echo "# Combined coverage summary" >> $GITHUB_STEP_SUMMARY
+          echo "# Unit coverage summary" >> $GITHUB_STEP_SUMMARY
           echo '```text' >> $GITHUB_STEP_SUMMARY
-          cat coverage_data/coverage.txt >> $GITHUB_STEP_SUMMARY
+          cat unit_coverage_data/coverage.txt >> $GITHUB_STEP_SUMMARY
           echo '```' >> $GITHUB_STEP_SUMMARY
 
           echo "# Integ coverage summary" >> $GITHUB_STEP_SUMMARY
           echo '```text' >> $GITHUB_STEP_SUMMARY
           cat integ_coverage_data/coverage.txt >> $GITHUB_STEP_SUMMARY
           echo '```' >> $GITHUB_STEP_SUMMARY
 
-          echo "# Unit coverage summary" >> $GITHUB_STEP_SUMMARY
+          echo "# Combined coverage summary" >> $GITHUB_STEP_SUMMARY
           echo '```text' >> $GITHUB_STEP_SUMMARY
-          cat unit_coverage_data/coverage.txt >> $GITHUB_STEP_SUMMARY
+          cat coverage_data/coverage.txt >> $GITHUB_STEP_SUMMARY
           echo '```' >> $GITHUB_STEP_SUMMARY
 
+          echo "# Unit diff coverage summary" >> $GITHUB_STEP_SUMMARY
+          cat unit_coverage_data/diff_cov.md >> $GITHUB_STEP_SUMMARY
+
           echo "# Integ diff coverage summary" >> $GITHUB_STEP_SUMMARY
           cat integ_coverage_data/diff_cov.md >> $GITHUB_STEP_SUMMARY
 
-          echo "# Unit diff coverage summary" >> $GITHUB_STEP_SUMMARY
-          cat unit_coverage_data/diff_cov.md >> $GITHUB_STEP_SUMMARY
+          echo "# Combined diff coverage summary" >> $GITHUB_STEP_SUMMARY
+          cat coverage_data/diff_cov.md >> $GITHUB_STEP_SUMMARY
 
       - name: Fail if coverage thresholds not met
-        if: steps.unit_diff_coverage.outcome == 'failure' || steps.integ_diff_coverage.outcome == 'failure' || steps.integ_overall_coverage.outcome == 'failure' || steps.combined_overall_coverage.outcome == 'failure'
+        if: steps.unit_diff_coverage.outcome == 'failure' || steps.integ_diff_coverage.outcome == 'failure' || steps.combined_diff_coverage.outcome == 'failure' || steps.integ_overall_coverage.outcome == 'failure' || steps.combined_overall_coverage.outcome == 'failure'
         run: |
           echo "Coverage threshold check failed"
           if [ "${{ steps.unit_diff_coverage.outcome }}" == "failure" ]; then
-            echo "❌ Unit test diff coverage: ${{ steps.unit_diff_coverage.outputs.percentage }}% (required: 95%)"
+            echo "❌ Unit test diff coverage: ${{ steps.unit_diff_coverage.outputs.percentage }}% (required: 80%)"
           fi
           if [ "${{ steps.integ_diff_coverage.outcome }}" == "failure" ]; then
             echo "❌ Integration test diff coverage: ${{ steps.integ_diff_coverage.outputs.percentage }}% (required: 80%)"
           fi
+          if [ "${{ steps.combined_diff_coverage.outcome }}" == "failure" ]; then
+            echo "❌ Combined diff coverage: ${{ steps.combined_diff_coverage.outputs.percentage }}% (required: 95%)"
+          fi
           if [ "${{ steps.integ_overall_coverage.outcome }}" == "failure" ]; then
             echo "❌ Integration test coverage: ${{ steps.integ_overall_coverage.outputs.percentage }}% (required: 70%)"
           fi

diff --git a/src/marqo/core/models/marqo_query.py b/src/marqo/core/models/marqo_query.py
@@ -44,6 +44,7 @@ class MarqoTensorQuery(MarqoQuery):
     vector_query: List[float]
     ef_search: Optional[int] = None
     approximate: bool = True
+    approximate_threshold: Optional[float] = None
     rerank_depth_tensor: Optional[int] = None
 
     # TODO - validate that ef_search >= offset+limit if provided

diff --git a/src/marqo/core/search/hybrid_search.py b/src/marqo/core/search/hybrid_search.py
@@ -34,6 +34,7 @@ def search(
             self, config: Config, marqo_index: MarqoIndex, query: Optional[Union[None, str, CustomVectorQuery]],
             result_count: int = 5, offset: int = 0, rerank_depth: Optional[int] = None,
             ef_search: Optional[int] = None, approximate: bool = True,
+            approximate_threshold: Optional[float] = None,
             searchable_attributes: Iterable[str] = None, filter_string: str = None, device: str = None,
             attributes_to_retrieve: Optional[List[str]] = None, boost: Optional[Dict] = None,
             media_download_headers: Optional[Dict] = None, context: Optional[SearchContext] = None,
@@ -239,6 +240,7 @@ def search(
             limit=result_count,
             ef_search=ef_search,
             approximate=approximate,
+            approximate_threshold=approximate_threshold,
             offset=offset,
             global_rerank_depth=rerank_depth,
             or_phrases=optional_terms,

diff --git a/src/marqo/core/structured_vespa_index/structured_vespa_index.py b/src/marqo/core/structured_vespa_index/structured_vespa_index.py
@@ -439,7 +439,8 @@ def _to_vespa_tensor_query(self, marqo_query: MarqoTensorQuery) -> Dict[str, Any
             'offset': marqo_query.offset,
             'query_features': query_inputs,
             'presentation.summary': summary,
-            'ranking': ranking
+            'ranking': ranking,
+            'ranking.matching.approximateThreshold': marqo_query.approximate_threshold
         }
         query = {k: v for k, v in query.items() if v is not None}
 
@@ -614,6 +615,7 @@ def _to_vespa_hybrid_query(self, marqo_query: MarqoHybridQuery) -> Dict[str, Any
             'model_restrict': self._marqo_index.schema_name,
             'hits': marqo_query.limit,
             'offset': marqo_query.offset,
+            'ranking.matching.approximateThreshold': marqo_query.approximate_threshold,
             'query_features': query_inputs,
             'presentation.summary': summary,
 

diff --git a/src/marqo/tensor_search/api.py b/src/marqo/tensor_search/api.py
@@ -301,6 +301,8 @@ def marqo_internal_exception_handler(request, exc: api_exceptions.MarqoError):
 # manually converts it to an v1 model. It catches the v1.Validation error and converts it to FastAPI's
 # RequestValidationError to keep the behaviour consistent with the auto-injecting mechanism
 T = TypeVar('T')
+
+
 def parse_request_object(obj_type: Type[T], obj: Any) -> T:
     try:
         return parse_obj_as(obj_type, obj)
@@ -395,7 +397,6 @@ def get_index_stats(index_name: str, marqo_config: config.Config = Depends(get_c
     }
 
 
-
 @app.post("/indexes/{index_name}/search")
 @throttle(RequestType.SEARCH)
 def search(index_name: str, search_query_dict: dict, device: str = Depends(api_validation.validate_device),
@@ -417,10 +418,11 @@ def search(index_name: str, search_query_dict: dict, device: str = Depends(api_v
             result_count=search_query.limit, offset=search_query.offset,
             rerank_depth=search_query.rerankDepth,
             ef_search=search_query.efSearch, approximate=search_query.approximate,
+            approximate_threshold=search_query.approximateThreshold,
             reranker=search_query.reRanker,
             filter=search_query.filter, device=device,
             attributes_to_retrieve=search_query.attributesToRetrieve, boost=search_query.boost,
-            media_download_headers = search_query.mediaDownloadHeaders,
+            media_download_headers=search_query.mediaDownloadHeaders,
             context=search_query.context,
             score_modifiers=search_query.scoreModifiers,
             model_auth=search_query.modelAuth,

diff --git a/src/marqo/tensor_search/models/api_models.py b/src/marqo/tensor_search/models/api_models.py
@@ -44,6 +44,7 @@ class SearchQuery(BaseMarqoModel):
     rerankDepth: Optional[int] = None
     efSearch: Optional[int] = None
     approximate: Optional[bool] = None
+    approximateThreshold: Optional[float] = None
     showHighlights: bool = True
     reRanker: str = None
     filter: str = None
@@ -280,6 +281,23 @@ def validate_get_total_hits_only_for_hybrid_search(cls, values):
                              f"Search method is {search_method}.")
         return values
 
+    @root_validator(pre=False)
+    def validate_approximate_threshold(cls, values):
+        """Reject approximateThreshold unless it is in [0, 1] on an approximate HYBRID/TENSOR search; return values."""
+        approximate_threshold = values.get('approximateThreshold')
+        search_method = values.get('searchMethod')
+        if approximate_threshold is None:
+            return values
+        # searchMethod may be missing/None in values; skip this check then instead of crashing on None.upper()
+        if search_method is not None and search_method.upper() not in (SearchMethod.HYBRID, SearchMethod.TENSOR):
+            raise ValueError("'approximateThreshold' is only valid for 'HYBRID' and 'TENSOR' search methods")
+        if values.get('approximate') is False:
+            raise ValueError("'approximateThreshold' cannot be set when 'approximate' is False")
+        # Chained comparison so that NaN (for which every comparison is False) is rejected as out of range
+        if not 0 <= approximate_threshold <= 1:
+            raise ValueError(f"'approximateThreshold' must be between 0 and 1, got {approximate_threshold}.")
+        return values
+
     def get_context_tensor(self) -> Optional[List[SearchContextTensor]]:
         """Extract the tensor from the context, if provided"""
         return self.context.tensor if self.context is not None else None