Skip to content

Commit b6fc492

Browse files
committed
add function impl for pymilvus, specifically BM25 function
Signed-off-by: Buqian Zheng <[email protected]>
1 parent b51ebce commit b6fc492

File tree

12 files changed

+847
-33
lines changed

12 files changed

+847
-33
lines changed

examples/hello_bm25.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
# hello_bm25.py demonstrates how to insert raw data only into Milvus and perform
2+
# sparse vector based ANN search using BM25 algorithm.
3+
# 1. connect to Milvus
4+
# 2. create collection
5+
# 3. insert data
6+
# 4. create index
7+
# 5. search, query, and filtering search on entities
8+
# 6. delete entities by PK
9+
# 7. upsert entities by PK
# 8. drop collection
10+
import time
11+
12+
from pymilvus import (
13+
connections,
14+
utility,
15+
FieldSchema, CollectionSchema, Function, DataType, FunctionType,
16+
Collection,
17+
)
18+
19+
fmt = "\n=== {:30} ===\n"
20+
search_latency_fmt = "search latency = {:.4f}s"
21+
22+
#################################################################################
23+
# 1. connect to Milvus
24+
# Add a new connection alias `default` for Milvus server in `localhost:19530`
25+
print(fmt.format("start connecting to Milvus"))
26+
connections.connect("default", host="localhost", port="19530")
27+
28+
has = utility.has_collection("hello_bm25")
29+
print(f"Does collection hello_bm25 exist in Milvus: {has}")
30+
31+
#################################################################################
32+
# 2. create collection
33+
# We're going to create a collection with 2 explicit fields and a function.
34+
# +-+------------+------------+------------------+------------------------------+
35+
# | | field name | field type | other attributes | field description |
36+
# +-+------------+------------+------------------+------------------------------+
37+
# |1| "id" | INT64 | is_primary=True | "primary field" |
38+
# | | | | auto_id=True | |
39+
# +-+------------+------------+------------------+------------------------------+
40+
# |2| "document" | VarChar | | "raw text document" |
41+
# +-+------------+------------+------------------+------------------------------+
42+
#
43+
# Function 'bm25' is used to convert raw text document to a sparse vector representation
44+
# and store it in the 'sparse' field.
45+
# +-+------------+-------------------+-----------+------------------------------+
46+
# | | field name | field type | other attr| field description |
47+
# +-+------------+-------------------+-----------+------------------------------+
48+
# |3| "sparse" |SPARSE_FLOAT_VECTOR| | |
49+
# +-+------------+-------------------+-----------+------------------------------+
50+
#
51+
fields = [
52+
FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
53+
FieldSchema(name="sparse", dtype=DataType.SPARSE_FLOAT_VECTOR),
54+
FieldSchema(name="document", dtype=DataType.VARCHAR, max_length=1000, enable_tokenizer=True),
55+
]
56+
57+
bm25_function = Function(
58+
name="bm25",
59+
function_type=FunctionType.BM25,
60+
input_field_names=["document"],
61+
output_field_names="sparse",
62+
)
63+
64+
schema = CollectionSchema(fields, "hello_bm25 demo")
65+
schema.add_function(bm25_function)
66+
67+
print(fmt.format("Create collection `hello_bm25`"))
68+
hello_bm25 = Collection("hello_bm25", schema, consistency_level="Strong")
69+
70+
################################################################################
71+
# 3. insert data
72+
# We are going to insert 6 rows of data into `hello_bm25`
73+
# Data to be inserted must be organized in fields.
74+
#
75+
# The insert() method returns:
76+
# - either automatically generated primary keys by Milvus if auto_id=True in the schema;
77+
# - or the existing primary key field from the entities if auto_id=False in the schema.
78+
79+
print(fmt.format("Start inserting entities"))
80+
81+
num_entities = 6
82+
83+
entities = [
84+
[f"This is a test document {i + hello_bm25.num_entities}" for i in range(num_entities)],
85+
]
86+
87+
insert_result = hello_bm25.insert(entities)
88+
ids = insert_result.primary_keys
89+
90+
time.sleep(3)
91+
92+
hello_bm25.flush()
93+
print(f"Number of entities in Milvus: {hello_bm25.num_entities}") # check the num_entities
94+
95+
################################################################################
96+
# 4. create index
97+
# We are going to create an index for hello_bm25 collection, here we simply
98+
# use AUTOINDEX so Milvus can use the default parameters.
99+
print(fmt.format("Start Creating index AUTOINDEX"))
100+
index = {
101+
"index_type": "AUTOINDEX",
102+
"metric_type": "BM25",
103+
}
104+
105+
hello_bm25.create_index("sparse", index)
106+
107+
################################################################################
108+
# 5. search, query, and scalar filtering search
109+
# After data were inserted into Milvus and indexed, you can perform:
110+
# - search texts relevance by BM25 using sparse vector ANN search
111+
# - query based on scalar filtering(boolean, int, etc.)
112+
# - scalar filtering search.
113+
#
114+
115+
# Before conducting a search or a query, you need to load the data in `hello_bm25` into memory.
116+
print(fmt.format("Start loading"))
117+
hello_bm25.load()
118+
119+
# -----------------------------------------------------------------------------
120+
print(fmt.format("Start searching based on BM25 texts relevance using sparse vector ANN search"))
121+
texts_to_search = entities[-1][-2:]
122+
print(fmt.format(f"texts_to_search: {texts_to_search}"))
123+
search_params = {
124+
"metric_type": "BM25",
125+
"params": {},
126+
}
127+
128+
start_time = time.time()
129+
result = hello_bm25.search(texts_to_search, "sparse", search_params, limit=3, output_fields=["document"], consistency_level="Strong")
130+
end_time = time.time()
131+
132+
for hits, text in zip(result, texts_to_search):
133+
print(f"result of text: {text}")
134+
for hit in hits:
135+
print(f"\thit: {hit}, document field: {hit.entity.get('document')}")
136+
print(search_latency_fmt.format(end_time - start_time))
137+
138+
# -----------------------------------------------------------------------------
139+
# query based on scalar filtering(boolean, int, etc.)
140+
filter_id = ids[num_entities // 2 - 1]
141+
print(fmt.format(f"Start querying with `id > {filter_id}`"))
142+
143+
start_time = time.time()
144+
result = hello_bm25.query(expr=f"id > {filter_id}", output_fields=["document"])
145+
end_time = time.time()
146+
147+
print(f"query result:\n-{result[0]}")
148+
print(search_latency_fmt.format(end_time - start_time))
149+
150+
# -----------------------------------------------------------------------------
151+
# pagination
152+
r1 = hello_bm25.query(expr=f"id > {filter_id}", limit=3, output_fields=["document"])
153+
r2 = hello_bm25.query(expr=f"id > {filter_id}", offset=1, limit=2, output_fields=["document"])
154+
print(f"query pagination(limit=3):\n\t{r1}")
155+
print(f"query pagination(offset=1, limit=2):\n\t{r2}")
156+
157+
158+
# -----------------------------------------------------------------------------
159+
# scalar filtering search
160+
print(fmt.format(f"Start filtered searching with `id > {filter_id}`"))
161+
162+
start_time = time.time()
163+
result = hello_bm25.search(texts_to_search, "sparse", search_params, limit=3, expr=f"id > {filter_id}", output_fields=["document"])
164+
end_time = time.time()
165+
166+
for hits, text in zip(result, texts_to_search):
167+
print(f"result of text: {text}")
168+
for hit in hits:
169+
print(f"\thit: {hit}, document field: {hit.entity.get('document')}")
170+
print(search_latency_fmt.format(end_time - start_time))
171+
172+
###############################################################################
173+
# 6. delete entities by PK
174+
# You can delete entities by their PK values using boolean expressions.
175+
176+
expr = f'id in [{ids[0]}, {ids[1]}]'
177+
print(fmt.format(f"Start deleting with expr `{expr}`"))
178+
179+
result = hello_bm25.query(expr=expr, output_fields=["document"])
180+
print(f"query before delete by expr=`{expr}` -> result: \n- {result[0]}\n- {result[1]}\n")
181+
182+
hello_bm25.delete(expr)
183+
184+
result = hello_bm25.query(expr=expr, output_fields=["document"])
185+
print(f"query after delete by expr=`{expr}` -> result: {result}\n")
186+
187+
###############################################################################
188+
# 7. upsert by PK
189+
# You can upsert data to replace existing data.
190+
target_id = ids[2]
191+
print(fmt.format(f"Start upsert operation for id {target_id}"))
192+
193+
# Query before upsert
194+
result_before = hello_bm25.query(expr=f"id == {target_id}", output_fields=["id", "document"])
195+
print(f"Query before upsert (id={target_id}):\n{result_before}")
196+
197+
# Prepare data for upsert
198+
upsert_data = [
199+
[target_id],
200+
["This is an upserted document for testing purposes."]
201+
]
202+
203+
# Perform upsert operation
204+
hello_bm25.upsert(upsert_data)
205+
206+
# Query after upsert
207+
result_after = hello_bm25.query(expr=f"id == {target_id}", output_fields=["id", "document"])
208+
print(f"Query after upsert (id={target_id}):\n{result_after}")
209+
210+
211+
###############################################################################
212+
# 8. drop collection
213+
# Finally, drop the hello_bm25 collection
214+
print(fmt.format("Drop collection `hello_bm25`"))
215+
utility.drop_collection("hello_bm25")

examples/hello_hybrid_bm25.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
# A demo showing hybrid semantic search with dense and full text search with BM25
2+
# using Milvus.
3+
#
4+
# You can optionally choose to use the BGE-M3 model to embed the text as dense
5+
# vectors, or simply use random generated vectors as an example.
6+
#
7+
# You can also use the BGE CrossEncoder model to rerank the search results.
8+
#
9+
# Note that the full text search feature is only available in Milvus 2.5.0 or
10+
# higher version. Make sure you follow https://milvus.io/docs/install_standalone-docker.md
11+
# to set up the latest version of Milvus in your local environment.
12+
13+
# To connect to Milvus server, you need the python client library called pymilvus.
14+
# To use BGE-M3 model, you need to install the optional `model` module in pymilvus.
15+
# You can get them by simply running the following commands:
16+
#
17+
# pip install pymilvus
18+
# pip install pymilvus[model]
19+
20+
# If true, use BGE-M3 model to generate dense vectors.
21+
# If false, use random numbers to compose dense vectors.
22+
use_bge_m3 = False
23+
# If true, the search result will be reranked using BGE CrossEncoder model.
24+
use_reranker = False
25+
26+
# The overall steps are as follows:
27+
# 1. embed the text as dense vectors (sparse vectors are generated by the BM25 function in Milvus)
28+
# 2. setup a Milvus collection to store the dense and sparse vectors
29+
# 3. insert the data to Milvus
30+
# 4. search and inspect the result!
31+
import random
32+
import string
33+
import numpy as np
34+
35+
from pymilvus import (
36+
utility,
37+
FieldSchema,
38+
CollectionSchema,
39+
DataType,
40+
Collection,
41+
AnnSearchRequest,
42+
RRFRanker,
43+
connections,
44+
Function,
45+
FunctionType,
46+
)
47+
48+
# 1. prepare a small corpus to search
49+
docs = [
50+
"Artificial intelligence was founded as an academic discipline in 1956.",
51+
"Alan Turing was the first person to conduct substantial research in AI.",
52+
"Born in Maida Vale, London, Turing was raised in southern England.",
53+
]
54+
# add some randomly generated texts
55+
docs.extend(
56+
[
57+
" ".join(
58+
"".join(random.choice(string.ascii_lowercase) for _ in range(random.randint(1, 8)))
59+
for _ in range(10)
60+
)
61+
for _ in range(1000)
62+
]
63+
)
64+
query = "Who started AI research?"
65+
66+
67+
def random_embedding(texts, dim=768):
    """Return random dense embeddings as a stand-in for a real embedding model.

    Args:
        texts: Sequence of input texts; only its length is used.
        dim: Dimension of each generated vector (defaults to 768).

    Returns:
        A dict with the single key "dense" mapping to a NumPy array of shape
        (len(texts), dim) filled with uniform random floats in [0, 1).
    """
    # Draw from the locally created Generator instead of the legacy global
    # np.random state, so the generator is actually used.
    rng = np.random.default_rng()
    return {
        "dense": rng.random((len(texts), dim)),
    }
72+
73+
74+
dense_dim = 768
75+
ef = random_embedding
76+
77+
if use_bge_m3:
78+
# BGE-M3 model is included in the optional `model` module in pymilvus, to
79+
# install it, simply run "pip install pymilvus[model]".
80+
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
81+
82+
ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
83+
dense_dim = ef.dim["dense"]
84+
85+
docs_embeddings = ef(docs)
86+
query_embeddings = ef([query])
87+
88+
# 2. setup Milvus collection and index
89+
connections.connect("default", host="localhost", port="19530")
90+
91+
# Specify the data schema for the new Collection.
92+
fields = [
93+
# Use auto generated id as primary key
94+
FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100),
95+
# Store the original text so it can be retrieved based on semantic distance
96+
FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512, enable_tokenizer=True),
97+
# We need a sparse vector field to perform full text search with BM25,
98+
# but you don't need to provide data for it when inserting data.
99+
FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
100+
FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
101+
]
102+
functions = [
103+
Function(
104+
name="bm25",
105+
function_type=FunctionType.BM25,
106+
input_field_names=["text"],
107+
output_field_names="sparse_vector",
108+
)
109+
]
110+
schema = CollectionSchema(fields, "", functions=functions)
111+
col_name = "hybrid_bm25_demo"
112+
# Now we can create the new collection with above name and schema.
113+
col = Collection(col_name, schema, consistency_level="Strong")
114+
115+
# We need to create indices for the vector fields. The indices will be loaded
116+
# into memory for efficient search.
117+
# The sparse field is produced by the BM25 function, so its index must use the
# BM25 metric to match the "BM25" metric of the full text search request below.
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "BM25"}
col.create_index("sparse_vector", sparse_index)
119+
dense_index = {"index_type": "FLAT", "metric_type": "IP"}
120+
col.create_index("dense_vector", dense_index)
121+
col.load()
122+
123+
# 3. insert text and dense vector representations into the collection; the sparse vectors are generated by the BM25 function
124+
entities = [docs, docs_embeddings["dense"]]
125+
col.insert(entities)
126+
col.flush()
127+
128+
# 4. search and inspect the result!
129+
k = 2 # we want to get the top 2 docs closest to the query
130+
131+
# Prepare the search requests for both full text search and dense vector search
132+
full_text_search_params = {"metric_type": "BM25"}
133+
# provide raw text query for full text search, while use the sparse vector as
134+
# ANNS field
135+
full_text_search_req = AnnSearchRequest([query], "sparse_vector", full_text_search_params, limit=k)
136+
dense_search_params = {"metric_type": "IP"}
137+
dense_req = AnnSearchRequest(
138+
query_embeddings["dense"], "dense_vector", dense_search_params, limit=k
139+
)
140+
141+
# Search topK docs based on dense and sparse vectors and rerank with RRF.
142+
res = col.hybrid_search(
143+
[full_text_search_req, dense_req], rerank=RRFRanker(), limit=k, output_fields=["text"]
144+
)
145+
146+
# Currently Milvus only supports 1 query in the same hybrid search request, so
147+
# we inspect res[0] directly. In future release Milvus will accept batch
148+
# hybrid search queries in the same call.
149+
res = res[0]
150+
151+
if use_reranker:
152+
result_texts = [hit.fields["text"] for hit in res]
153+
from pymilvus.model.reranker import BGERerankFunction
154+
155+
bge_rf = BGERerankFunction(device="cpu")
156+
# rerank the results using BGE CrossEncoder model
157+
results = bge_rf(query, result_texts, top_k=2)
158+
for hit in results:
159+
print(f"text: {hit.text} distance {hit.score}")
160+
else:
161+
for hit in res:
162+
print(f'text: {hit.fields["text"]} distance {hit.distance}')
163+
164+
# If you used both BGE-M3 and the reranker, you should see the following:
165+
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.9306981017573297
166+
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.03217001154515051
167+
#
168+
# If you used only BGE-M3, you should see the following:
169+
# text: Alan Turing was the first person to conduct substantial research in AI. distance 0.032786883413791656
170+
# text: Artificial intelligence was founded as an academic discipline in 1956. distance 0.016129031777381897
171+
172+
# In this simple example the reranker yields the same result as the embedding based hybrid search, but in more complex
173+
# scenarios the reranker can provide more accurate results.
174+
175+
# If you used random vectors, the result will be different each time you run the script.
176+
177+
# Drop the collection to clean up the data.
178+
utility.drop_collection(col_name)

0 commit comments

Comments
 (0)