
Commit 650ac4d

Merge releases/2.18 to mainline (#1207)
2 parents 5545644 + 13a8228 commit 650ac4d


11 files changed, +254 −143 lines changed


examples/ClothingCLI/simple_marqo_demo.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def load_index(index_name: str, number_data: int) -> None:
 
     settings = {
         "treatUrlsAndPointersAsImages": True, # allows us to find an image file and index it
-        "model": "ViT-B/16"
+        "model": "open_clip/ViT-B-16/openai"
     }
 
     mq.create_index(index_name, settings_dict=settings)

examples/ClothingStreamlit/streamlit_marqo_demo.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def load_index(number_data):
 
     settings = {
         "treatUrlsAndPointersAsImages":True, # allows us to find an image file and index it
-        "model":"ViT-B/16"
+        "model": "open_clip/ViT-B-16/openai"
    }
 
     mq.create_index("demo-search-index", settings_dict=settings)

src/marqo/core/inference/api/exceptions.py

Lines changed: 5 additions & 0 deletions
@@ -29,3 +29,8 @@ class UnsupportedModalityError(InferenceError):
 class MediaExceedsMaxSizeError(InferenceError):
     """Raised when the media exceeds the maximum size limit"""
     pass
+
+
+class MediaMismatchError(InferenceError):
+    """Raised when the media does not match the expected type"""
+    pass
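Because MediaMismatchError subclasses InferenceError, existing handlers that catch the base class keep working while new code can target the mismatch case specifically. A small sketch of that; the check_media helper is illustrative and not part of the codebase:

from marqo.core.inference.api.exceptions import InferenceError, MediaMismatchError

def check_media(expected: str, detected: str) -> None:
    """Illustrative helper: raise the new error when the probed modality disagrees with the expected one."""
    if expected != detected:
        raise MediaMismatchError(f"expected {expected}, but the detected modality is {detected}")

try:
    check_media("video", "image")
except MediaMismatchError as err:
    print(f"modality mismatch: {err}")        # the new, more specific case
except InferenceError as err:
    print(f"other inference failure: {err}")  # base class still catches everything else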

src/marqo/core/structured_vespa_index/structured_add_document_handler.py

Lines changed: 19 additions & 28 deletions
@@ -3,31 +3,20 @@
 from marqo.api import exceptions as api_errors
 from marqo.core import constants
 from marqo.core.constants import MARQO_DOC_ID
-from marqo.core.inference.api import Modality, MediaDownloadError, Inference
-from marqo.core.inference.modality_utils import infer_modality
-from marqo.core.vespa_index.add_documents_handler import AddDocumentsHandler, AddDocumentsError
-from marqo.core.models.add_docs_params import AddDocsParams
+from marqo.core.inference.api import Modality, Inference
 from marqo.core.inference.tensor_fields_container import TensorFieldsContainer, TensorField
+from marqo.core.models.add_docs_params import AddDocsParams
 from marqo.core.models.marqo_index import FieldType, StructuredMarqoIndex
 from marqo.core.structured_vespa_index.structured_vespa_index import StructuredVespaIndex
+from marqo.core.vespa_index.add_documents_handler import AddDocumentsHandler, AddDocumentsError
 from marqo.exceptions import InvalidArgumentError
-
-from marqo.vespa.models import VespaDocument
-from marqo.vespa.models.get_document_response import Document
-
 # TODO deps to tensor_search needs to be removed
 from marqo.tensor_search import validation
+from marqo.vespa.models import VespaDocument
+from marqo.vespa.models.get_document_response import Document
 from marqo.vespa.vespa_client import VespaClient
 
 
-MODALITY_FIELD_TYPE_MAP = {
-    Modality.TEXT: FieldType.Text,
-    Modality.IMAGE: FieldType.ImagePointer,
-    Modality.VIDEO: FieldType.VideoPointer,
-    Modality.AUDIO: FieldType.AudioPointer,
-}
-
-
 class StructuredAddDocumentsHandler(AddDocumentsHandler):
     def __init__(self, marqo_index: StructuredMarqoIndex, add_docs_params: AddDocsParams, vespa_client: VespaClient,
                  inference: Inference):
@@ -74,20 +63,22 @@ def _handle_field(self, marqo_doc, field_name, field_content):
         marqo_doc[field_name] = content
 
     def _infer_modality(self, tensor_field: TensorField) -> Modality:
+        """
+        Infer modality based on tensor field type specified in the definition of structured index. Please note we
+        do not infer the modality from the content of the field here, any modality mismatch is detected later when
+        we download and preprocess the media content.
+        """
         if tensor_field.field_type == FieldType.Text:
             return Modality.TEXT
-
-        url = tensor_field.field_content
-        try:
-            modality = infer_modality(url, self.add_docs_params.media_download_headers)
-        except MediaDownloadError as err:
-            raise AddDocumentsError(f"Error processing {tensor_field.field_name}: {err.message}") from err
-
-        if MODALITY_FIELD_TYPE_MAP[modality] != tensor_field.field_type:
-            raise AddDocumentsError(f"Error processing {tensor_field.field_name}, detected as {modality.value}, "
-                                    f"but expected field type is {tensor_field.field_type}")
-
-        return modality
+        elif tensor_field.field_type == FieldType.ImagePointer:
+            return Modality.IMAGE
+        elif tensor_field.field_type == FieldType.VideoPointer:
+            return Modality.VIDEO
+        elif tensor_field.field_type == FieldType.AudioPointer:
+            return Modality.AUDIO
+        else:
+            raise AddDocumentsError(f"Error processing {tensor_field.field_name}, tensor field type "
+                                    f"{tensor_field.field_type} is not supported")
 
     def _validate_field(self, field_name: str, field_content: Any) -> None:
         try:
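The commit replaces the download-time infer_modality call with a direct lookup on the field type declared in the structured index schema; any content/type disagreement is now caught later, during media download and preprocessing (see the StreamingMediaProcessor changes below). A self-contained sketch of the same mapping, using stand-in enums (member values illustrative) so it runs outside Marqo:

from enum import Enum

class FieldType(Enum):   # stand-in for marqo.core.models.marqo_index.FieldType
    Text = "text"
    ImagePointer = "image_pointer"
    VideoPointer = "video_pointer"
    AudioPointer = "audio_pointer"

class Modality(Enum):    # stand-in for marqo.core.inference.api.Modality
    TEXT = "text"
    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"

_FIELD_TYPE_TO_MODALITY = {
    FieldType.Text: Modality.TEXT,
    FieldType.ImagePointer: Modality.IMAGE,
    FieldType.VideoPointer: Modality.VIDEO,
    FieldType.AudioPointer: Modality.AUDIO,
}

def infer_modality_from_field_type(field_type: FieldType) -> Modality:
    """Dictionary form of the new if/elif chain: the schema field type alone decides the modality."""
    try:
        return _FIELD_TYPE_TO_MODALITY[field_type]
    except KeyError:
        raise ValueError(f"tensor field type {field_type} is not supported")

assert infer_modality_from_field_type(FieldType.ImagePointer) is Modality.IMAGE

This is essentially the removed MODALITY_FIELD_TYPE_MAP inverted: the field type is now the key, and it no longer has to be cross-checked against a downloaded file at this stage.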

src/marqo/core/vespa_index/add_documents_handler.py

Lines changed: 7 additions & 3 deletions
@@ -168,7 +168,8 @@ def add_documents(self) -> MarqoAddDocumentsResponse:
         self._populate_existing_tensors(existing_vespa_docs)
 
         # vectorise tensor fields
-        self._vectorise_tensor_fields()
+        with RequestMetricsStore.for_request().time("add_documents.inference.all"):
+            self._vectorise_tensor_fields()
 
         with RequestMetricsStore.for_request().time("add_documents.vespa.to_vespa_docs"):
             vespa_docs = self._convert_to_vespa_docs()
@@ -287,7 +288,8 @@ def _vectorise_tensor_fields(self) -> None:
         3. The result will be then populated to the tensor field. Individual errors happened during preprocessing
         and vectorisation will also be returned and collected by the `add_docs_response_collector`
         """
-        modalities = self._infer_modalities()
+        with RequestMetricsStore.for_request().time("add_documents.inference.infer_modality"):
+            modalities = self._infer_modalities()
 
         for modality in modalities:
             self._vectorise_fields(modality, for_top_level_field=True)
@@ -342,7 +344,9 @@ def subfield_predicate(f: TensorField) -> bool:
 
         # This method could raise InferenceError, we'll allow it propagate to the API layer and convert to proper
         # error response to return to users
-        inference_result = self.inference.vectorise(request)
+        with RequestMetricsStore.for_request().time(f"add_documents.inference.{modality}."
+                                                    f"is_subfield_{not for_top_level_field}.size_{len(tensor_fields)}"):
+            inference_result = self.inference.vectorise(request)
 
         if len(tensor_fields) != len(inference_result.result):
             raise InternalError(f'Inference result contains chunks and embeddings for {len(inference_result.result)} '
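The new timers wrap modality inference, each vectorise call, and the whole inference phase, with the per-call metric name encoding the modality, whether the fields are subfields, and the batch size. A minimal stand-in for the timing context manager, just to show the pattern (this is not Marqo's RequestMetricsStore implementation):

import time
from contextlib import contextmanager

@contextmanager
def timed(metrics: dict, key: str):
    """Record the elapsed time (in ms) of the wrapped block under `key`."""
    start = time.perf_counter()
    try:
        yield
    finally:
        metrics[key] = (time.perf_counter() - start) * 1000.0

metrics = {}
with timed(metrics, "add_documents.inference.all"):
    time.sleep(0.05)  # placeholder for the vectorisation work
print(metrics)        # e.g. {'add_documents.inference.all': ~50.0}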

src/marqo/inference/media_download_and_preprocess/streaming_media_processor.py

Lines changed: 39 additions & 4 deletions
@@ -44,7 +44,14 @@ def __init__(
         self.modality = preprocessing_config.modality
 
         self.media_download_header = self._convert_headers_to_cli_format(preprocessing_config.download_header)
-        self.total_size, self.duration = self._fetch_file_metadata()
+        self.total_size, self.duration, self.probed_modality = self._fetch_file_metadata()
+
+        if self.modality != self.probed_modality:
+            raise MediaMismatchError(
+                f"Error processing media file {self.url}. The provided modality {self.modality} does not match the "
+                f"detected modality {self.probed_modality}. Please check your media file and try again. If you are using "
+                f"a structured index, check if your media file matches the field type"
+            )
 
         if self.total_size > preprocessing_config.max_media_size_bytes:
             raise MediaExceedsMaxSizeError(
@@ -81,11 +88,36 @@ def _convert_headers_to_cli_format(self, raw_media_download_headers: Optional[Di
             raise InternalError("media_download_headers should be a dictionary")
         return "\r\n".join([f"{key}: {value}" for key, value in raw_media_download_headers.items()])
 
-    def _fetch_file_metadata(self) -> Tuple[float, float]:
+    def _infer_modality_from_probe(self, modality_list: list[str], format_name: Optional[str]) -> Optional[Modality]:
+        """
+        Infer the modality from the probed media file. This is used to determine whether the media is audio or video.
+        """
+        if Modality.VIDEO in modality_list:
+            # Images are also considered as video in ffmpeg, so we need to check the format name to
+            # differentiate between video and image
+            if "image" in format_name or "_pipe" in format_name:
+                return Modality.IMAGE
+            else:
+                return Modality.VIDEO
+        elif Modality.AUDIO in modality_list:
+            return Modality.AUDIO
+        else:
+            return None
+
+    def _fetch_file_metadata(self) -> Tuple[float, float, Optional[Modality]]:
+        """
+        Fetch the metadata of the media file using ffmpeg. This includes the size, duration, and modality of the
+        media file.
+
+        Returns:
+            Tuple[float, float, str]: A tuple containing the size (in bytes), duration (in seconds), and modality of the
+            media file.
+
+        """
         try:
             probe_options = {
                 'v': 'error',
-                'show_entries': 'format=size,duration',
+                'show_entries': 'stream=codec_type,format=size,duration,format_name',
                 'of': 'json',
                 'probesize': '256K', # Probe only the first 256KB
             }
@@ -97,8 +129,11 @@ def _fetch_file_metadata(self) -> Tuple[float, float]:
 
             size = int(probe['format'].get('size', 0))
             duration = float(probe['format'].get('duration', 0))
+            format_name = probe['format'].get('format_name', "")
+            modality_list = [codec_type.get('codec_type', "") for codec_type in probe['streams']]
+            modality = self._infer_modality_from_probe(modality_list, format_name)
 
-            return size, duration
+            return size, duration, modality
 
         except ffmpeg.Error as e:
             raise MediaDownloadError(f"Error fetching metadata: {e.stderr.decode()}") from e
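For a standalone look at what this probe-based classification does, here is a sketch using the ffmpeg-python package directly; the URL is a placeholder, and the plain string return values stand in for the Modality enum used by the real method:

import ffmpeg  # pip install ffmpeg-python; requires the ffprobe binary on PATH

def probe_modality(url: str, probesize: str = "256K") -> str:
    """Classify a media URL as 'image', 'video', or 'audio' using the same heuristics as the method above."""
    probe = ffmpeg.probe(
        url,
        v="error",
        show_entries="stream=codec_type,format=size,duration,format_name",
        of="json",
        probesize=probesize,  # probe only the first 256KB by default
    )
    codec_types = [stream.get("codec_type", "") for stream in probe.get("streams", [])]
    format_name = probe.get("format", {}).get("format_name", "")

    if "video" in codec_types:
        # ffmpeg reports still images as a single video stream; the format name tells them apart
        return "image" if ("image" in format_name or "_pipe" in format_name) else "video"
    if "audio" in codec_types:
        return "audio"
    return "unknown"

# print(probe_modality("https://example.com/clip.mp4"))  # hypothetical URL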

tests/integ_tests/inference/native_inference/media_download_and_preprocess/test_streaming_media_preprcessor.py

Lines changed: 66 additions & 5 deletions
@@ -6,7 +6,7 @@
 import torch
 from pytest import mark
 
-from integ_tests.marqo_test import TestVideoUrls, TestAudioUrls
+from integ_tests.marqo_test import TestVideoUrls, TestAudioUrls, TestImageUrls
 from marqo.core.inference.api import *
 from marqo.inference.media_download_and_preprocess.streaming_media_processor import StreamingMediaProcessor
 from marqo.inference.native_inference.embedding_models.languagebind_model import LanguagebindPreprocessor
@@ -163,7 +163,7 @@ def test_metadata_fetching_success(self):
             url=valid_url, preprocessors=self.test_preprocessor,
             preprocessing_config=self.test_video_preprocessing_config
         )
-        size, duration = streaming_media_processor_object._fetch_file_metadata()
+        size, duration, _ = streaming_media_processor_object._fetch_file_metadata()
 
         self.assertEqual(2971504, size)  # Hardcoded value
         self.assertEqual(10.01, duration)  # Hardcoded value
@@ -215,11 +215,72 @@ def test_header_conversion_with_valid_headers(self):
         with patch("marqo.inference.media_download_and_preprocess"
                    ".streaming_media_processor.StreamingMediaProcessor._fetch_file_metadata") \
                 as mock_fetch_file_metadata:
-            mock_fetch_file_metadata.return_value = (2971504, 10.01)
+            mock_fetch_file_metadata.return_value = (2971504, 10.01, Modality.VIDEO)
             streaming_media_processor_object = StreamingMediaProcessor(
-                url=TestAudioUrls.AUDIO1.value, preprocessors=self.test_preprocessor,
+                url=TestVideoUrls.VIDEO1.value, preprocessors=self.test_preprocessor,
                 preprocessing_config=test_video_preprocessing_config
             )
 
             expected = "Authorization: Bearer token\r\nUser-Agent: Test"
-            self.assertEqual(streaming_media_processor_object.media_download_header, expected)
+            self.assertEqual(streaming_media_processor_object.media_download_header, expected)
+
+    def test_prob_modality_correct_video(self):
+        for url in [
+            TestVideoUrls.VIDEO1.value, TestVideoUrls.VIDEO2.value, TestVideoUrls.VIDEO3.value,
+            TestVideoUrls.MKV_VIDEO1.value, TestVideoUrls.WEBM_VIDEO1.value, TestVideoUrls.AVI_VIDEO1.value
+        ]:
+            with self.subTest(url=url):
+                streaming_media_processor_object = StreamingMediaProcessor(
+                    url=url, preprocessors=self.test_preprocessor,
+                    preprocessing_config=self.test_video_preprocessing_config
+                )
+                self.assertEqual(Modality.VIDEO, streaming_media_processor_object.probed_modality)
+
+    def test_prob_modality_correct_audio(self):
+        for url in [
+            TestAudioUrls.AUDIO1.value, TestAudioUrls.AUDIO2.value, TestAudioUrls.AUDIO3.value,
+            TestAudioUrls.MP3_AUDIO1.value, TestAudioUrls.MP3_AUDIO1.value, TestAudioUrls.ACC_AUDIO1.value,
+            TestAudioUrls.OGG_AUDIO1.value, TestAudioUrls.FLAC_AUDIO1.value
+        ]:
+            with self.subTest(url=url):
+                streaming_media_processor_object = StreamingMediaProcessor(
+                    url=url, preprocessors=self.test_preprocessor,
+                    preprocessing_config=self.test_audio_preprocessing_config
+                )
+                self.assertEqual(Modality.AUDIO, streaming_media_processor_object.probed_modality)
+
+    def test_prob_modality_correct_image(self):
+        """Ensure that the probed modality is correct for various image formats. Note that
+        an error is raised as StreamingMediaProcessor is not designed to handle images."""
+        for url in [
+            TestImageUrls.IMAGE1.value, TestImageUrls.IMAGE2.value, TestImageUrls.IMAGE3.value,
+            TestImageUrls.COCO.value
+        ]:
+            with self.subTest(url=url):
+                with self.assertRaises(MediaMismatchError) as e:
+                    _ = StreamingMediaProcessor(
+                        url=url, preprocessors=self.test_preprocessor,
+                        preprocessing_config=self.test_video_preprocessing_config
+                    )
+                self.assertIn('the detected modality image', str(e.exception))
+
+
+    def test_incorrect_modality_between_audio_and_video_will_raise_an_error(self):
+        test_cases = [
+            (TestVideoUrls.VIDEO1.value, self.test_audio_preprocessing_config,
+             "The url is video, but the preprocessing config is audio"),
+            (TestAudioUrls.AUDIO1.value, self.test_video_preprocessing_config,
+             "The url is audio, but the preprocessing config is video")
+        ]
+        for url, processing_config, msg in test_cases:
+            with self.subTest(msg):
+                with self.assertRaises(MediaMismatchError) as e:
+                    _ = StreamingMediaProcessor(
+                        url=url, preprocessors=self.test_preprocessor,
+                        preprocessing_config=processing_config
+                    )
+                self.assertIn("Please check your media file and try again", str(e.exception))
tests/integ_tests/tensor_search/integ_tests/test_add_documents_structured.py

Lines changed: 3 additions & 4 deletions
@@ -125,7 +125,7 @@ def setUpClass(cls) -> None:
                 )
             ],
             tensor_fields=['image_field', 'image_field_2'],
-            model=Model(name='ViT-B/16')
+            model=Model(name='open_clip/ViT-B-16/openai')
         )
         index_request_img_chunking = cls.structured_marqo_index_request(
             fields=[
@@ -141,7 +141,7 @@ def setUpClass(cls) -> None:
                 )
             ],
             tensor_fields=['image_field'],
-            model=Model(name='ViT-B/16'),
+            model=Model(name='open_clip/ViT-B-16/openai'),
            normalize_embeddings=True,
            image_preprocessing=ImagePreProcessing(patch_method=PatchMethod.Frcnn)
         )
@@ -938,5 +938,4 @@ def test_add_documents_nonImageContentForAnImageField(self):
         for item in r.items:
             self.assertEqual(400, item.status)
             # modality mismatch
-            self.assertIn("Error processing image_field, detected as language, "
-                          "but expected field type is image_pointer", item.message)
+            self.assertIn("is not a local file or a valid url", item.message)
