
Commit 0e7bbb4

[feat] Add real and mock endpoints for Query and Chat completion (#4543)
This commit contains the following:
1. Mock and real endpoints for query.
2. Allows query when no DB is present.
3. Allows query when DB is present.

Signed-off-by: Varsha Prasad Narsing <[email protected]>
Co-authored-by: Matias Schimuneck <[email protected]>
1 parent d52e2e7 commit 0e7bbb4
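
The new endpoint takes a JSON body whose required fields are content and llm_model_id, with vector_db_ids optional (see the QueryRequest struct below). A minimal client-side sketch of the two request shapes, assuming the BFF listens on localhost:8080 and that ApiPathPrefix resolves to /api/v1 (the constant's value is not shown in this diff); the route is wrapped in RequireAuthRoute, so a real call would also need the credentials that middleware expects, omitted here, and the model and vector DB IDs are placeholders:

package main

import (
    "bytes"
    "encoding/json"
    "fmt"
    "net/http"
)

func main() {
    // Assumed URL; ApiPathPrefix is defined in app.go and not visible in this diff.
    const queryURL = "http://localhost:8080/api/v1/query"

    // 1. No vector DB present: omitting vector_db_ids makes the handler skip
    //    the RAG lookup and answer with a plain chat completion.
    noDB := map[string]any{
        "content":      "What is retrieval-augmented generation?",
        "llm_model_id": "llama-3-8b-instruct", // placeholder model ID
    }

    // 2. Vector DB present: chunks are retrieved first and passed to the
    //    chat completion as context.
    withDB := map[string]any{
        "content":       "What is retrieval-augmented generation?",
        "llm_model_id":  "llama-3-8b-instruct",    // placeholder model ID
        "vector_db_ids": []string{"my-vector-db"}, // placeholder vector DB ID
    }

    for _, body := range []map[string]any{noDB, withDB} {
        payload, _ := json.Marshal(body)
        resp, err := http.Post(queryURL, "application/json", bytes.NewReader(payload))
        if err != nil {
            fmt.Println("request failed:", err)
            continue
        }
        fmt.Println("status:", resp.Status)
        resp.Body.Close()
    }
}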

6 files changed (+392, -2 lines)

frontend/packages/llama-stack-modular-ui/bff/internal/api/app.go

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,7 @@ const (
 
     // making it simpler than /tool-runtime/rag-tool/insert
     UploadPath = ApiPathPrefix + "/upload"
+    QueryPath = ApiPathPrefix + "/query"
 )
 
 type App struct {

@@ -103,6 +104,7 @@ func (app *App) Routes() http.Handler {
     // POST to register the vectorDB (/v1/vector-dbs)
     apiRouter.POST(VectorDBListPath, app.RequireAuthRoute(app.AttachRESTClient(app.RegisterVectorDBHandler)))
     apiRouter.POST(UploadPath, app.RequireAuthRoute(app.AttachRESTClient(app.UploadHandler)))
+    apiRouter.POST(QueryPath, app.RequireAuthRoute(app.AttachRESTClient(app.QueryHandler)))
 
     // App Router
     appMux := http.NewServeMux()

frontend/packages/llama-stack-modular-ui/bff/internal/api/middleware.go

Lines changed: 3 additions & 2 deletions
@@ -3,12 +3,13 @@ package api
 import (
     "context"
     "fmt"
-    "github.com/julienschmidt/httprouter"
-    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations"
     "log/slog"
     "net/http"
     "runtime/debug"
 
+    "github.com/julienschmidt/httprouter"
+    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations"
+
     "github.com/google/uuid"
     "github.com/opendatahub-io/llama-stack-modular-ui/internal/constants"
     helper "github.com/opendatahub-io/llama-stack-modular-ui/internal/helpers"
Lines changed: 167 additions & 0 deletions (new file, package api)

@@ -0,0 +1,167 @@

package api

import (
    "encoding/json"
    "errors"
    "fmt"
    "net/http"

    "github.com/julienschmidt/httprouter"
    "github.com/opendatahub-io/llama-stack-modular-ui/internal/constants"
    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations"
    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations/llamastack"
)

// QueryRequest represents the request body for querying documents
type QueryRequest struct {
    Content     string                      `json:"content"`
    VectorDBIDs []string                    `json:"vector_db_ids,omitempty"`
    QueryConfig llamastack.QueryConfigParam `json:"query_config,omitempty"`
    // Chat completion options (LLM model for generating responses)
    LLMModelID     string                     `json:"llm_model_id"`
    SamplingParams *llamastack.SamplingParams `json:"sampling_params,omitempty"`
}

func (app *App) QueryHandler(w http.ResponseWriter, r *http.Request, params httprouter.Params) {
    client, ok := r.Context().Value(constants.LlamaStackHttpClientKey).(integrations.HTTPClientInterface)

    if !ok {
        app.serverErrorResponse(w, r, errors.New("REST client not found"))
        return
    }

    // Parse the request body
    var queryRequest QueryRequest
    if err := json.NewDecoder(r.Body).Decode(&queryRequest); err != nil {
        app.badRequestResponse(w, r, err)
        return
    }

    // Validate required fields
    if queryRequest.Content == "" {
        app.badRequestResponse(w, r, errors.New("content is required"))
        return
    }
    if queryRequest.LLMModelID == "" {
        app.badRequestResponse(w, r, errors.New("llm_model_id is required"))
        return
    }

    // Check if we should perform RAG query
    hasVectorDBs := len(queryRequest.VectorDBIDs) > 0
    var response llamastack.QueryEmbeddingModelResponse
    var hasRAGContent bool

    if hasVectorDBs {
        // Create default query configuration if not provided
        queryConfig := queryRequest.QueryConfig
        if queryConfig.MaxChunks == 0 {
            queryConfig.MaxChunks = 5 // Default value
        }
        if queryConfig.MaxTokensInContext == 0 {
            queryConfig.MaxTokensInContext = 1000 // Default value
        }
        if queryConfig.ChunkTemplate == "" {
            queryConfig.ChunkTemplate = "Result {index}\nContent: {chunk.content}\nMetadata: {metadata}\n" // Default template
        }

        // Create the query embedding model request
        queryEmbeddingModelRequest := llamastack.QueryEmbeddingModelRequest{
            Content:     queryRequest.Content,
            VectorDBIDs: queryRequest.VectorDBIDs,
            QueryConfig: queryConfig,
        }

        // Query the embedding model
        ragResponse, err := app.repositories.LlamaStackClient.QueryEmbeddingModel(client, queryEmbeddingModelRequest)
        if err != nil {
            app.serverErrorResponse(w, r, err)
            return
        }
        response = ragResponse

        // Check if we have RAG content
        for _, contentItem := range response.Content {
            if contentItem.Type == "text" {
                hasRAGContent = true
                break
            }
        }
    }

    // Extract text content from the query response
    var contextText string
    for _, contentItem := range response.Content {
        if contentItem.Type == "text" {
            contextText += contentItem.Text + "\n"
        }
    }

    // Create messages for chat completion
    var messages []llamastack.ChatMessage

    if hasRAGContent {
        // Use RAG context if available
        messages = []llamastack.ChatMessage{
            {
                Role:    "system",
                Content: "You are a helpful assistant that explains concepts based on the provided context. Use the context to answer the user's question accurately and concisely.",
            },
            {
                Role:    "user",
                Content: fmt.Sprintf("Based on this context, answer the following question:\n\nContext:\n%s\n\nQuestion: %s", contextText, queryRequest.Content),
            },
        }
    } else {
        // No RAG content available, provide a general response
        messages = []llamastack.ChatMessage{
            {
                Role:    "system",
                Content: "You are a helpful assistant that explains concepts. If you don't have specific context about the topic, provide a general explanation based on your knowledge.",
            },
            {
                Role:    "user",
                Content: fmt.Sprintf("Please explain: %s", queryRequest.Content),
            },
        }
    }

    // Use provided sampling params or defaults
    samplingParams := llamastack.SamplingParams{
        Strategy: llamastack.SamplingStrategy{
            Type: "greedy",
        },
        MaxTokens: 500,
    }
    if queryRequest.SamplingParams != nil {
        samplingParams = *queryRequest.SamplingParams
    }

    // Create chat completion request
    chatCompletionRequest := llamastack.ChatCompletionRequest{
        ModelID:        queryRequest.LLMModelID,
        Messages:       messages,
        SamplingParams: samplingParams,
    }

    // Generate chat completion
    chatResponse, err := app.repositories.LlamaStackClient.ChatCompletion(client, chatCompletionRequest)
    if err != nil {
        app.serverErrorResponse(w, r, err)
        return
    }

    // Return consistent response format regardless of RAG usage
    combinedResponse := map[string]interface{}{
        "rag_response":      response,
        "chat_completion":   chatResponse,
        "has_rag_content":   hasRAGContent,
        "used_vector_dbs":   hasVectorDBs,
        "assistant_message": chatResponse.CompletionMessage.Content,
    }

    w.WriteHeader(http.StatusOK)
    if err := json.NewEncoder(w).Encode(combinedResponse); err != nil {
        app.logger.Error("Failed to encode response", "error", err)
    }
}

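For UI consumers, the handler above returns the same envelope whether or not RAG was used. A small decoding sketch; the struct is redeclared locally so the snippet stands alone, and the payload values are illustrative:

package main

import (
    "encoding/json"
    "fmt"
)

// queryResponse mirrors the combinedResponse map built in QueryHandler.
type queryResponse struct {
    AssistantMessage string          `json:"assistant_message"`
    HasRAGContent    bool            `json:"has_rag_content"`
    UsedVectorDBs    bool            `json:"used_vector_dbs"`
    RAGResponse      json.RawMessage `json:"rag_response"`    // llamastack.QueryEmbeddingModelResponse
    ChatCompletion   json.RawMessage `json:"chat_completion"` // llamastack.ChatCompletionResponse
}

func main() {
    // Illustrative payload shaped like the handler's output.
    raw := []byte(`{
        "assistant_message": "RAG combines retrieval with generation.",
        "has_rag_content": true,
        "used_vector_dbs": true,
        "rag_response": {"content": [], "metadata": {"document_ids": [], "chunks": [], "scores": []}},
        "chat_completion": {"metrics": [], "completion_message": {"role": "assistant", "content": "...", "stop_reason": "stop", "tool_calls": []}, "logprobs": null}
    }`)

    var qr queryResponse
    if err := json.Unmarshal(raw, &qr); err != nil {
        fmt.Println("decode failed:", err)
        return
    }
    fmt.Println("assistant:", qr.AssistantMessage)
    fmt.Println("used RAG context:", qr.HasRAGContent)
}
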
frontend/packages/llama-stack-modular-ui/bff/internal/integrations/llamastack/llamastack_datatypes.go

Lines changed: 90 additions & 0 deletions
@@ -50,3 +50,93 @@ type DocumentInsertRequest struct {
    VectorDBID        string `json:"vector_db_id"`
    ChunkSizeInTokens *int   `json:"chunk_size_in_tokens,omitempty"`
}

// QueryEmbeddingModelRequest represents the request body for querying an embedding model
type QueryEmbeddingModelRequest struct {
    // The query content (text)
    Content     string   `json:"content"`
    VectorDBIDs []string `json:"vector_db_ids"`
    // Configuration for the RAG query generation.
    QueryConfig QueryConfigParam `json:"query_config"`
}

type QueryConfigParam struct {
    // Template for formatting each retrieved chunk in the context. Available
    // placeholders: {index} (1-based chunk ordinal), {chunk.content} (chunk content
    // string), {metadata} (chunk metadata dict). Default: "Result {index}\nContent:
    // {chunk.content}\nMetadata: {metadata}\n"
    ChunkTemplate string `json:"chunk_template"`
    // Maximum number of chunks to retrieve.
    MaxChunks int64 `json:"max_chunks"`
    // Maximum number of tokens in the context.
    MaxTokensInContext int64 `json:"max_tokens_in_context"`
}

type QueryEmbeddingModelResponse struct {
    Content  []ContentItem `json:"content"`
    Metadata Metadata      `json:"metadata"`
}

type ContentItem struct {
    Type string `json:"type"`
    Text string `json:"text"`
}

type Metadata struct {
    DocumentIDs []string  `json:"document_ids"`
    Chunks      []string  `json:"chunks"`
    Scores      []float64 `json:"scores"`
}

// Chat completion types
type ChatCompletionRequest struct {
    ModelID        string         `json:"model_id"`
    Messages       []ChatMessage  `json:"messages"`
    SamplingParams SamplingParams `json:"sampling_params"`
}

type ChatMessage struct {
    Role    string `json:"role"`
    Content string `json:"content"`
}

type SamplingParams struct {
    Strategy  SamplingStrategy `json:"strategy"`
    MaxTokens int64            `json:"max_tokens"`
}

type SamplingStrategy struct {
    Type string `json:"type"`
}

type ChatCompletionResponse struct {
    Metrics           []Metric          `json:"metrics"`
    CompletionMessage CompletionMessage `json:"completion_message"`
    Logprobs          interface{}       `json:"logprobs"`
}

type Metric struct {
    Metric string      `json:"metric"`
    Value  interface{} `json:"value"`
    Unit   interface{} `json:"unit"`
}

type CompletionMessage struct {
    Role       string        `json:"role"`
    Content    string        `json:"content"`
    StopReason string        `json:"stop_reason"`
    ToolCalls  []interface{} `json:"tool_calls"`
}

// Legacy types for backward compatibility (used in mock)
type ChatChoice struct {
    Index        int         `json:"index"`
    Message      ChatMessage `json:"message"`
    FinishReason string      `json:"finish_reason"`
}

type Usage struct {
    PromptTokens     int `json:"prompt_tokens"`
    CompletionTokens int `json:"completion_tokens"`
    TotalTokens      int `json:"total_tokens"`
}

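To make the wire format of the new types concrete, here is a short sketch that marshals a ChatCompletionRequest using the same defaults QueryHandler falls back to (greedy strategy, 500 max tokens). It assumes the snippet lives inside the BFF module, since the llamastack package is internal; the model ID is a placeholder:

package main

import (
    "encoding/json"
    "fmt"

    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations/llamastack"
)

func main() {
    req := llamastack.ChatCompletionRequest{
        ModelID: "llama-3-8b-instruct", // placeholder model ID
        Messages: []llamastack.ChatMessage{
            {Role: "system", Content: "You are a helpful assistant."},
            {Role: "user", Content: "Please explain: vector databases"},
        },
        SamplingParams: llamastack.SamplingParams{
            Strategy:  llamastack.SamplingStrategy{Type: "greedy"},
            MaxTokens: 500,
        },
    }

    out, _ := json.MarshalIndent(req, "", "  ")
    // Emits JSON keyed by model_id, messages and sampling_params,
    // matching the struct tags declared above.
    fmt.Println(string(out))
}
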
frontend/packages/llama-stack-modular-ui/bff/internal/mocks/llamastack_client_mock.go

Lines changed: 88 additions & 0 deletions
@@ -160,3 +160,91 @@ func (l *LlamastackClientMock) InsertDocuments(_ integrations.HTTPClientInterfac

    return nil
}

func (l *LlamastackClientMock) QueryEmbeddingModel(_ integrations.HTTPClientInterface, request llamastack.QueryEmbeddingModelRequest) (llamastack.QueryEmbeddingModelResponse, error) {
    l.mutex.Lock()
    defer l.mutex.Unlock()

    // Validate request
    if request.Content == "" {
        return llamastack.QueryEmbeddingModelResponse{}, fmt.Errorf("content is required")
    }

    if len(request.VectorDBIDs) == 0 {
        return llamastack.QueryEmbeddingModelResponse{}, fmt.Errorf("at least one vector_db_id is required")
    }

    // Simulate successful query response
    response := llamastack.QueryEmbeddingModelResponse{
        Content: []llamastack.ContentItem{
            {
                Type: "text",
                Text: fmt.Sprintf("Mock response for query: %s", request.Content),
            },
            {
                Type: "text",
                Text: "Additional mock content from vector database",
            },
            {
                Type: "text",
                Text: "More relevant content based on the query",
            },
        },
        Metadata: llamastack.Metadata{
            DocumentIDs: []string{"mock-doc-001", "mock-doc-002"},
            Chunks:      []string{"Mock chunk 1", "Mock chunk 2"},
            Scores:      []float64{0.95, 0.87},
        },
    }

    fmt.Printf("Mock: Query executed successfully for content: %s\n", request.Content)
    fmt.Printf("Mock: Searched in vector databases: %v\n", request.VectorDBIDs)
    fmt.Printf("Mock: Returning %d content items\n", len(response.Content))

    return response, nil
}

func (l *LlamastackClientMock) ChatCompletion(_ integrations.HTTPClientInterface, request llamastack.ChatCompletionRequest) (llamastack.ChatCompletionResponse, error) {
    l.mutex.Lock()
    defer l.mutex.Unlock()

    // Validate request
    if request.ModelID == "" {
        return llamastack.ChatCompletionResponse{}, fmt.Errorf("model_id is required")
    }

    if len(request.Messages) == 0 {
        return llamastack.ChatCompletionResponse{}, fmt.Errorf("messages are required")
    }

    // Get the last user message for context
    var lastUserMessage string
    for i := len(request.Messages) - 1; i >= 0; i-- {
        if request.Messages[i].Role == "user" {
            lastUserMessage = request.Messages[i].Content
            break
        }
    }

    // Simulate successful chat completion response
    response := llamastack.ChatCompletionResponse{
        Metrics: []llamastack.Metric{
            {Metric: "prompt_tokens", Value: 50, Unit: nil},
            {Metric: "completion_tokens", Value: 25, Unit: nil},
            {Metric: "total_tokens", Value: 75, Unit: nil},
        },
        CompletionMessage: llamastack.CompletionMessage{
            Role:       "assistant",
            Content:    fmt.Sprintf("Mock response to: %s. This is a simulated chat completion response based on the provided context and messages.", lastUserMessage),
            StopReason: "stop",
            ToolCalls:  []interface{}{},
        },
        Logprobs: nil,
    }

    fmt.Printf("Mock: Chat completion executed successfully for model: %s\n", request.ModelID)
    fmt.Printf("Mock: Processed %d messages\n", len(request.Messages))
    fmt.Printf("Mock: Returning response with completion message\n")

    return response, nil
}

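The mocks above make it possible to exercise the query flow without a running llama-stack instance. A minimal test sketch, assuming the test file sits in the same mocks package and that the mock's zero value (including its mutex) is usable; if the package provides a constructor, prefer it:

package mocks

import (
    "strings"
    "testing"

    "github.com/opendatahub-io/llama-stack-modular-ui/internal/integrations/llamastack"
)

func TestQueryAndChatCompletionMocks(t *testing.T) {
    mock := LlamastackClientMock{}

    // RAG query against the mock: needs content and at least one vector DB ID.
    ragResp, err := mock.QueryEmbeddingModel(nil, llamastack.QueryEmbeddingModelRequest{
        Content:     "what is RAG?",
        VectorDBIDs: []string{"test-db"}, // placeholder vector DB ID
    })
    if err != nil {
        t.Fatalf("QueryEmbeddingModel: %v", err)
    }
    if len(ragResp.Content) == 0 {
        t.Fatal("expected mock content items")
    }

    // Chat completion against the mock: echoes the last user message back.
    chatResp, err := mock.ChatCompletion(nil, llamastack.ChatCompletionRequest{
        ModelID: "test-model", // placeholder model ID
        Messages: []llamastack.ChatMessage{
            {Role: "user", Content: "what is RAG?"},
        },
    })
    if err != nil {
        t.Fatalf("ChatCompletion: %v", err)
    }
    if !strings.Contains(chatResp.CompletionMessage.Content, "what is RAG?") {
        t.Errorf("unexpected completion: %q", chatResp.CompletionMessage.Content)
    }
}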