@@ -0,0 +1,9 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# This compose file should be run from the root of the repo
services:
language-detection:
build:
dockerfile: comps/language_detection/src/Dockerfile
image: ${REGISTRY:-opea}/language-detection:${TAG:-latest}
2 changes: 2 additions & 0 deletions comps/__init__.py
@@ -40,6 +40,8 @@
PIIResponseDoc,
Audio2text,
DocSumDoc,
PromptTemplateInput,
TranslationInput,
)

# Constants
1 change: 1 addition & 0 deletions comps/cores/mega/constants.py
@@ -38,6 +38,7 @@ class ServiceType(Enum):
TEXT2CYPHER = 21
TEXT2KG = 22
STRUCT2GRAPH = 23
LANGUAGE_DETECTION = 24


class MegaServiceEndpoint(Enum):
18 changes: 18 additions & 0 deletions comps/cores/proto/docarray.py
@@ -339,3 +339,21 @@ class ImagesPath(BaseDoc):

class VideoPath(BaseDoc):
video_path: str


class PrevQuestionDetails(BaseDoc):
question: str
answer: str


class PromptTemplateInput(BaseDoc):
data: Dict[str, Any]
conversation_history: Optional[List[PrevQuestionDetails]] = None
conversation_history_parse_type: str = "naive"
system_prompt_template: Optional[str] = None
user_prompt_template: Optional[str] = None


class TranslationInput(BaseDoc):
text: str
target_language: str
19 changes: 19 additions & 0 deletions comps/language_detection/deployment/docker_compose/compose.yaml
@@ -0,0 +1,19 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

services:
language-detection:
image: ${REGISTRY:-opea}/language-detection:${TAG:-latest}
container_name: language-detection
ports:
- "8069:8069"
environment:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
ipc: host
restart: always

networks:
default:
driver: bridge
32 changes: 32 additions & 0 deletions comps/language_detection/src/Dockerfile
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

# Set environment variables
ENV LANG=en_US.UTF-8

RUN apt-get update -y && \
apt-get install build-essential -y && \
apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev \
vim && \
apt-get clean && rm -rf /var/lib/apt/lists/*

COPY comps /home/comps

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/comps/language_detection/src/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home

USER user

WORKDIR /home/comps/language_detection/src

ENTRYPOINT ["python", "opea_language_detection_microservice.py"]
128 changes: 128 additions & 0 deletions comps/language_detection/src/README.md
@@ -0,0 +1,128 @@
# Language Detection microservice

The Language Detection microservice can be run in two modes:

1. Pipeline: This mode adds multilingual support to ChatQnA pipelines. The microservice detects the language of both the user's query and the LLM-generated response in order to set up a prompt for translation.

2. Standalone: This mode supports standalone translation. The microservice detects the language of the provided text. It then sets up a prompt for translating the provided text from the source language (detected language) to the provided target language.

## Configuration Options

The configuration of the Language Detection microservice can be adjusted by exporting environment variables.

| Environment Variable | Description |
| ------------------------ | -------------------------------------- |
| `LANG_DETECT_STANDALONE` | Set this to `True` for Standalone mode |
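
Set the variable before starting the service, e.g.:

```shell
# Enable standalone (translation) mode; leave unset for pipeline mode
export LANG_DETECT_STANDALONE=True
```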

## Getting started

### 🚀1. Start Language Detection Microservice with Python (Option 1)

To start the Language Detection microservice, first install the required Python packages.

#### 1.1. Install Requirements

```bash
pip install -r requirements.txt
```

#### 1.2. Start Microservice

```bash
python opea_language_detection_microservice.py
```

### 🚀2. Start Language Detection Microservice with Docker (Option 2)

#### 2.1. Build the Docker Image:

Use the docker build command below to create the image:

```bash
cd ../../../
docker build -t opea/language-detection:latest -f comps/language_detection/src/Dockerfile .
```

Please note that the building process may take a while to complete.

#### 2.2. Run the Docker Container:

```bash
docker run -d --name="language-detection-microservice" \
  -p 8069:8069 \
  --ipc=host \
  opea/language-detection:latest
```

### 3. Verify the Language Detection Microservice

#### 3.1. Check Status

```bash
curl http://localhost:8069/v1/health_check \
-X GET \
-H 'Content-Type: application/json'
```

#### 3.2. Sending a Request

##### 3.2.1 Pipeline Mode

The input request consists of the answer that has to be translated and a prompt containing the user's query.

**Example Input**

```bash
curl -X POST -H "Content-Type: application/json" -d @- http://localhost:8069/v1/language_detection <<JSON_DATA
{
  "text": "Hi. I am doing fine.",
  "prompt": "### You are a helpful, respectful, and honest assistant to help the user with questions. Please refer to the search results obtained from the local knowledge base. But be careful to not incorporate information that you think is not relevant to the question. If you don't know the answer to a question, please don't share false information.\n### Search results:\n### Question: 你好。你好吗?\n### Answer:"
}
JSON_DATA
```

**Example Output**

The output contains the answer, prompt template, source language and target language.

```json
{
"id": "1b16e065a1fcbdb4d999fd3d09a619cb",
"data": { "text": "Hi. I am doing fine.", "source_lang": "English", "target_lang": "Chinese" },
"prompt_template": "\n Translate this from {source_lang} to {target_lang}:\n {source_lang}:\n {text}\n\n {target_lang}: \n "
}
```
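
A downstream translation step can fill the returned `prompt_template` with the `data` fields via `str.format` — a minimal sketch using the example response above (template whitespace abbreviated):

```python
# Fields taken from the example pipeline-mode response above
data = {"text": "Hi. I am doing fine.", "source_lang": "English", "target_lang": "Chinese"}
prompt_template = (
    "\n    Translate this from {source_lang} to {target_lang}:\n"
    "    {source_lang}:\n    {text}\n\n    {target_lang}: \n    "
)

# Fill the placeholders to produce the final translation prompt
prompt = prompt_template.format(**data)
print(prompt)
```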

##### 3.2.2 Standalone Mode

The input request consists of the text that has to be translated and the target language.

**Example Input**

```bash
curl -X POST -H "Content-Type: application/json" -d @- http://localhost:8069/v1/language_detection <<JSON_DATA
{
"text": "Hi. I am doing fine.",
"target_language": "Chinese"
}
JSON_DATA
```

**Example Output**

The output contains the original text, prompt template, source language and target language.

```json
{
"id": "1b16e065a1fcbdb4d999fd3d09a619cb",
"data": { "text": "Hi. I am doing fine.", "source_lang": "English", "target_lang": "Chinese" },
"prompt_template": "\n Translate this from {source_lang} to {target_lang}:\n {source_lang}:\n {text}\n\n {target_lang}: \n "
}
```
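
The same request can be issued from Python. The helper below only builds the standalone-mode payload; the endpoint URL matches the curl examples above, and the `requests.post` call is left commented out so the sketch runs without a live service:

```python
import json

def build_standalone_payload(text: str, target_language: str) -> dict:
    """Build the request body expected by /v1/language_detection in standalone mode."""
    if not text.strip():
        raise ValueError("Text to be translated cannot be empty.")
    if not target_language.strip():
        raise ValueError("Target language cannot be empty.")
    return {"text": text, "target_language": target_language}

payload = build_standalone_payload("Hi. I am doing fine.", "Chinese")
body = json.dumps(payload)
# import requests
# resp = requests.post("http://localhost:8069/v1/language_detection", data=body,
#                      headers={"Content-Type": "application/json"})
print(body)
```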
2 changes: 2 additions & 0 deletions comps/language_detection/src/integrations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
121 changes: 121 additions & 0 deletions comps/language_detection/src/integrations/native.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import re
from typing import Union

from comps import (
CustomLogger,
GeneratedDoc,
OpeaComponent,
OpeaComponentRegistry,
PromptTemplateInput,
ServiceType,
TranslationInput,
)
from comps.language_detection.src.integrations.utils.detector import detect_language
from comps.language_detection.src.integrations.utils.prompt import (
get_language_name,
get_prompt_template,
validate_language_name,
)

logger = CustomLogger("opea_language_detection")


@OpeaComponentRegistry.register("OPEA_LANGUAGE_DETECTION")
class OPEALanguageDetector(OpeaComponent):
def __init__(self, name: str, description: str, config: dict = {}):
super().__init__(name, ServiceType.LANGUAGE_DETECTION.name.lower(), description, config)
self._is_standalone = config.get("is_standalone", False)
logger.info("Language Detection microservice initialized.")
health_status = self.check_health()
if not health_status:
            logger.error("OPEALanguageDetector health check failed.")

async def invoke(self, input: Union[GeneratedDoc, TranslationInput]) -> PromptTemplateInput:
"""If is_standalone is False, detects the language of the query and sets up a translation prompt if needed, without modifying the query.
If is_standlaone is True, detects language of the provided text and sets up a translation prompt to translate text to target language.

Args:
input (Union[GeneratedDoc, TranslationInput]): The input document containing the initial query and answer or text and target_language.

Returns:
            PromptTemplateInput: The prompt template and placeholders for translation.
"""
if self._is_standalone:
if not input.text.strip():
logger.error("No text provided.")
                raise ValueError("Text to be translated cannot be empty.")

if not input.target_language.strip():
logger.error("Target language not provided.")
raise ValueError("Target language cannot be empty.")

# Detect the language of the query
src_lang_code = detect_language(input.text)
source_language = get_language_name(src_lang_code)

if not source_language:
logger.error(f"The detected language {src_lang_code} is not supported.")
raise ValueError("Original language of text is not supported.")

logger.info(f"Detected language of the text: {source_language}")

# Check if the provided target language is valid
target_language = input.target_language.strip()
if not validate_language_name(target_language):
logger.error(f"Target language {target_language} is not supported.")
raise ValueError("Provided target language is not supported.")
else:
if not input.prompt.strip():
logger.error("No initial query provided.")
raise ValueError("Initial query cannot be empty.")

if not input.text.strip():
logger.error("No answer provided from LLM.")
raise ValueError("Answer from LLM cannot be empty.")

# Extract question from prompt
match = re.search(r"### Question:\s*(.*?)\s*(?=### Answer:|$)", input.prompt, re.DOTALL)

if match:
extracted_question = match.group(1).strip() # Remove any leading/trailing whitespace
else:
logger.error("Question could not be found in the prompt.")
raise ValueError("Question not found in the prompt!")

# Detect the language of the query (target language)
tgt_lang_code = detect_language(extracted_question)
target_language = get_language_name(tgt_lang_code)

if not target_language:
logger.error(f"The detected query language {tgt_lang_code} is not supported.")
raise ValueError("Language of query is not supported.")

logger.info(f"Detected language of the query: {target_language}")

# Detect the language of the answer
src_lang_code = detect_language(input.text)
source_language = get_language_name(src_lang_code)

if not source_language:
logger.error(f"The detected answer language {src_lang_code} is not supported.")
raise ValueError("Language of answer is not supported.")

logger.info(f"Detected language of the answer: {source_language}")

# Prevents back-translation to English if RAG LLM generates answer in the same language
if source_language == target_language:
                source_language = "English"

# Return the prompt template input for translation
system_prompt_template, user_prompt_template = get_prompt_template()
return PromptTemplateInput(
data={"text": input.text, "source_lang": source_language, "target_lang": target_language},
system_prompt_template=system_prompt_template,
user_prompt_template=user_prompt_template,
)

def check_health(self) -> bool:
return True
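
The question-extraction regex used in pipeline mode can be exercised in isolation; this sketch reuses the exact pattern from `invoke` against a prompt shaped like the README example:

```python
import re

prompt = (
    "### You are a helpful assistant.\n"
    "### Search results:\n"
    "### Question: 你好。你好吗?\n"
    "### Answer:"
)

# Same pattern as in OPEALanguageDetector.invoke: capture everything between
# "### Question:" and "### Answer:" (or end of string), across newlines
match = re.search(r"### Question:\s*(.*?)\s*(?=### Answer:|$)", prompt, re.DOTALL)
extracted_question = match.group(1).strip() if match else None
print(extracted_question)
```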
16 changes: 16 additions & 0 deletions comps/language_detection/src/integrations/utils/detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from ftlangdetect import detect


def detect_language(input_text: str) -> str:
    """Detect the language of the input text.

    Newlines are removed before detection, and the language code predicted by the detector (e.g. "en") is returned.
    """
input_text = input_text.replace("\n", " ")
detection = detect(text=input_text, low_memory=True)
detected_lang = detection["lang"]

return detected_lang
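
The `utils/prompt.py` helpers (`get_language_name`, `validate_language_name`, `get_prompt_template`) are imported by `native.py` but not shown in this diff. A hypothetical minimal sketch of their shape, assuming a small code-to-name table, might look like:

```python
from typing import Optional, Tuple

# Hypothetical code-to-name table; the real module may support more languages
_LANGUAGES = {"en": "English", "zh": "Chinese", "de": "German", "fr": "French"}

def get_language_name(lang_code: str) -> Optional[str]:
    """Map a detector language code to a display name, or None if unsupported."""
    return _LANGUAGES.get(lang_code)

def validate_language_name(name: str) -> bool:
    """Check whether a display name is a supported target language."""
    return name in _LANGUAGES.values()

def get_prompt_template() -> Tuple[str, str]:
    """Return (system, user) templates with the placeholders used by native.py."""
    system = "You are a translator. Translate the text from {source_lang} to {target_lang}."
    user = "{source_lang}: {text}\n{target_lang}:"
    return system, user
```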