66 changes: 66 additions & 0 deletions examples/online_store/milvus_tutorial/README.md
@@ -0,0 +1,66 @@
# Milvus Tutorial with Feast

This tutorial demonstrates how to use Milvus as a vector database backend for Feast. You'll learn how to set up Milvus, create embeddings, store them in Feast, and perform similarity searches.

## Prerequisites

- Python 3.10+
- Docker (for running Milvus)
- Feast installed (`pip install 'feast[milvus]'`)

## Setup

1. Start Milvus containers with Docker Compose:

```bash
docker compose up -d
```

This will start three containers:
- `milvus-standalone`: The Milvus server
- `milvus-etcd`: For metadata storage
- `milvus-minio`: For object storage

2. Wait until all containers are healthy (this may take a minute or two):

```bash
docker ps
```
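Rather than re-running `docker ps` by hand, you can poll the Milvus port from Python before moving on. This is a minimal sketch, not part of the tutorial files; it assumes the default `localhost:19530` endpoint from `feature_store.yaml`, and the short timeout is for illustration (use a larger one, e.g. 120s, for a cold first start while images download).

```python
import socket
import time

def wait_for_port(host: str, port: int, timeout: float = 120.0) -> bool:
    """Poll a TCP port until it accepts connections or the timeout expires."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=2.0):
                return True
        except OSError:
            time.sleep(1.0)
    return False

# Milvus listens on 19530 by default (matching feature_store.yaml);
# a short timeout here just for demonstration
if wait_for_port("localhost", 19530, timeout=5.0):
    print("Milvus is accepting connections")
else:
    print("Timed out waiting for Milvus")
```

This only checks TCP reachability, not that Milvus has finished initializing, but it catches the common case of running the example before the containers are up.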

## Project Structure

```
milvus_tutorial/
├── README.md
├── feature_store.yaml # Feast configuration
├── docker-compose.yml # Docker Compose configuration for Milvus
├── data/ # Data directory
│ └── sample_data.parquet # Sample data with embeddings (generated by the script)
└── milvus_example.py # Example script
```

## Tutorial Steps

1. Configure Feast with Milvus
2. Generate sample data with embeddings
3. Define feature views
4. Register and apply feature definitions
5. Perform vector similarity search

Run the complete example:

```bash
python milvus_example.py
```

## How It Works

This tutorial demonstrates:

- Setting up Milvus as a vector database
- Configuring Feast to use Milvus as the online store
- Generating embeddings for text data
- Storing embeddings in Feast feature views
- Performing vector similarity searches using Feast's retrieval API

Milvus is an open-source vector database built for efficient similarity search, which makes it a strong fit for applications like semantic search and recommendation systems.
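The retrieval step ranks stored embeddings by L2 (Euclidean) distance, the `metric_type` set in `feature_store.yaml`. A minimal NumPy sketch of what that ranking computes, using toy 4-dimensional vectors with made-up values in place of the real 384-dimensional sentence-transformer embeddings:

```python
import numpy as np

# Toy 4-dim embeddings standing in for the 384-dim vectors (values are hypothetical)
stored = np.array([
    [0.10, 0.90, 0.00, 0.20],   # "Headphones"
    [0.80, 0.10, 0.30, 0.00],   # "Laptop"
    [0.25, 0.70, 0.10, 0.30],   # "Speaker"
])
names = ["Headphones", "Laptop", "Speaker"]

# Embedding of a query like "wireless audio device" (again hypothetical)
query = np.array([0.15, 0.85, 0.05, 0.15])

# L2 distance: sqrt(sum((a - b)^2)); smaller means more similar
distances = np.linalg.norm(stored - query, axis=1)
order = np.argsort(distances)

for rank, idx in enumerate(order, start=1):
    print(f"{rank}. {names[idx]} (distance={distances[idx]:.3f})")
```

With a `FLAT` index Milvus performs this comparison exhaustively against every stored vector, which is exact but scales linearly; approximate indexes (e.g. `HNSW`, `IVF_FLAT`) trade a little recall for speed on larger collections.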
31 changes: 31 additions & 0 deletions examples/online_store/milvus_tutorial/docker-compose.yml
@@ -0,0 +1,31 @@
version: "3.9"

services:
  etcd:
    image: quay.io/coreos/etcd:v3.5.18
    command: >
      etcd -advertise-client-urls=http://etcd:2379
      -listen-client-urls=http://0.0.0.0:2379
      -data-dir=/etcd
    volumes: ["./volumes/etcd:/etcd"]
    healthcheck:
      test: ["CMD", "etcdctl", "endpoint", "health"]
      interval: 30s

  minio:
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    environment:
      MINIO_ACCESS_KEY: minioadmin
      MINIO_SECRET_KEY: minioadmin
    command: server /data --console-address ":9001"
    volumes: ["./volumes/minio:/data"]
    ports: ["9000:9000", "9001:9001"]

  milvus:
    image: milvusdb/milvus:v2.5.10
    command: ["milvus", "run", "standalone"]
    environment:
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
    depends_on: [etcd, minio]
    volumes: ["./volumes/milvus:/var/lib/milvus"]
    ports: ["19530:19530", "9091:9091"]
16 changes: 16 additions & 0 deletions examples/online_store/milvus_tutorial/feature_store.yaml
@@ -0,0 +1,16 @@
project: milvus_tutorial
provider: local
registry: data/registry.db
online_store:
  type: milvus
  host: localhost
  port: 19530
  vector_enabled: true
  embedding_dim: 384
  index_type: "FLAT"
  metric_type: "L2"

offline_store:
  type: file

entity_key_serialization_version: 3
191 changes: 191 additions & 0 deletions examples/online_store/milvus_tutorial/milvus_example.py
@@ -0,0 +1,191 @@
# Milvus Tutorial with Feast
#
# This example demonstrates how to use Milvus
# as a vector database backend for Feast.

import os
import subprocess
from datetime import datetime, timedelta

import pandas as pd

# For generating embeddings
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    import sys

    print("Installing sentence_transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "sentence-transformers"])
    from sentence_transformers import SentenceTransformer

from feast import FeatureStore, Entity, FeatureView, Field, FileSource
from feast.data_format import ParquetFormat
from feast.types import Float32, Array, String
from feast.value_type import ValueType

# Create data directory if it doesn't exist
os.makedirs("data", exist_ok=True)


# Step 1: Generate sample data with embeddings
def generate_sample_data():
    print("Generating sample data with embeddings...")

    # Sample product data
    products = [
        {"id": 1, "name": "Smartphone",
         "description": "A high-end smartphone with advanced camera features and long battery life."},
        {"id": 2, "name": "Laptop",
         "description": "Powerful laptop with fast processor and high-resolution display for professional use."},
        {"id": 3, "name": "Headphones",
         "description": "Wireless noise-cancelling headphones with premium sound quality."},
        {"id": 4, "name": "Smartwatch",
         "description": "Fitness tracking smartwatch with heart rate monitoring and sleep analysis."},
        {"id": 5, "name": "Tablet",
         "description": "Lightweight tablet with vibrant display perfect for reading and browsing."},
        {"id": 6, "name": "Camera",
         "description": "Professional digital camera with high-resolution sensor and interchangeable lenses."},
        {"id": 7, "name": "Speaker",
         "description": "Bluetooth speaker with rich bass and long battery life for outdoor use."},
        {"id": 8, "name": "Gaming Console",
         "description": "Next-generation gaming console with 4K graphics and fast loading times."},
        {"id": 9, "name": "E-reader",
         "description": "E-ink display reader with backlight for comfortable reading in any lighting condition."},
        {"id": 10, "name": "Smart TV",
         "description": "4K smart television with built-in streaming apps and voice control."},
    ]

    # Create DataFrame
    df = pd.DataFrame(products)

    # Generate embeddings using sentence-transformers
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Small, fast model with 384-dim embeddings
    embeddings = model.encode(df['description'].tolist())

    # Add embeddings and timestamp columns to DataFrame
    df['embedding'] = embeddings.tolist()
    df['event_timestamp'] = datetime.now() - timedelta(days=1)
    df['created_timestamp'] = datetime.now() - timedelta(days=1)

    # Save to parquet file
    parquet_path = "data/sample_data.parquet"
    df.to_parquet(parquet_path, index=False)

    print(f"Sample data saved to {parquet_path}")
    return parquet_path


# Step 2: Define feature repository
def create_feature_definitions(data_path):
    print("Creating feature definitions...")

    product = Entity(
        name="product_id",
        description="Product ID",
        join_keys=["id"],
        value_type=ValueType.INT64,
    )

    source = FileSource(
        file_format=ParquetFormat(),
        path=data_path,
        timestamp_field="event_timestamp",
        created_timestamp_column="created_timestamp",
    )

    # Define feature view with vector embeddings
    product_embeddings = FeatureView(
        name="product_embeddings",
        entities=[product],
        ttl=timedelta(days=30),
        schema=[
            Field(
                name="embedding",
                dtype=Array(Float32),
                vector_index=True,  # Mark as vector field
            ),
            Field(name="name", dtype=String),
            Field(name="description", dtype=String),
        ],
        source=source,
        online=True,
    )

    return product, product_embeddings


# Step 3: Register definitions and materialize to the online store
def setup_feature_store(product, product_embeddings):
    print("Setting up feature store...")

    store = FeatureStore(repo_path=".")

    store.apply([product, product_embeddings])

    # Materialize features to online store
    store.materialize(
        start_date=datetime.now() - timedelta(days=2),
        end_date=datetime.now(),
    )

    print("Feature store setup complete")
    return store


# Step 4: Perform vector similarity search
def perform_similarity_search(store, query_text: str, top_k: int = 3):
    print(f"\nPerforming similarity search for: '{query_text}'")

    # Generate embedding for query text
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query_text).tolist()

    # Perform similarity search using vector embeddings with the version 2 API
    try:
        results = store.retrieve_online_documents_v2(
            features=[
                "product_embeddings:embedding",
                "product_embeddings:name",
                "product_embeddings:description",
            ],
            query=query_embedding,
            top_k=top_k,
            distance_metric="L2",
        ).to_df()

        # Print results
        print(f"\nTop {top_k} similar products:")
        for i, row in results.iterrows():
            print(f"\n{i + 1}. Name: {row['product_embeddings__name']}")
            print(f"   Description: {row['product_embeddings__description']}")
            print(f"   Distance: {row['distance']}")

        return results
    except Exception as e:
        print(f"Error performing search: {e}")
        return None


# Main function to run the example
def main():
    print("=== Milvus Tutorial with Feast ===")

    # Check that Milvus is running
    print("\nEnsure Milvus is running:")
    print("docker compose up -d")

    input("\nPress Enter to continue once Milvus is ready...")

    # Generate sample data
    data_path = generate_sample_data()

    # Create feature definitions
    product, product_embeddings = create_feature_definitions(data_path)

    # Set up the feature store
    store = setup_feature_store(product, product_embeddings)

    # Perform similarity searches
    perform_similarity_search(store, "wireless audio device with good sound", top_k=3)
    perform_similarity_search(store, "portable computing device for work", top_k=3)

    print("\n=== Tutorial Complete ===")
    print("You've successfully set up Milvus with Feast and performed vector similarity searches!")


if __name__ == "__main__":
    main()