|
| 1 | +# hello_bm25.py demonstrates how to insert raw data only into Milvus and perform |
| 2 | +# sparse vector based ANN search using BM25 algorithm. |
| 3 | +# 1. connect to Milvus |
| 4 | +# 2. create collection |
| 5 | +# 3. insert data |
| 6 | +# 4. create index |
| 7 | +# 5. search, query, and filtering search on entities |
| 8 | +# 6. delete entities by PK |
| 9 | +# 7. drop collection |
| 10 | +import time |
| 11 | + |
| 12 | +from pymilvus import ( |
| 13 | + connections, |
| 14 | + utility, |
| 15 | + FieldSchema, CollectionSchema, Function, DataType, FunctionType, |
| 16 | + Collection, |
| 17 | +) |
| 18 | + |
| 19 | +fmt = "\n=== {:30} ===\n" |
| 20 | +search_latency_fmt = "search latency = {:.4f}s" |
| 21 | + |
| 22 | +################################################################################# |
| 23 | +# 1. connect to Milvus |
| 24 | +# Add a new connection alias `default` for Milvus server in `localhost:19530` |
| 25 | +print(fmt.format("start connecting to Milvus")) |
| 26 | +connections.connect("default", host="localhost", port="19530") |
| 27 | + |
| 28 | +has = utility.has_collection("hello_bm25") |
| 29 | +print(f"Does collection hello_bm25 exist in Milvus: {has}") |
| 30 | + |
| 31 | +################################################################################# |
| 32 | +# 2. create collection |
| 33 | +# We're going to create a collection with 2 explicit fields and a function. |
| 34 | +# +-+------------+------------+------------------+------------------------------+ |
| 35 | +# | | field name | field type | other attributes | field description | |
| 36 | +# +-+------------+------------+------------------+------------------------------+ |
| 37 | +# |1| "id" | INT64 | is_primary=True | "primary field" | |
| 38 | +# | | | | auto_id=False | | |
| 39 | +# +-+------------+------------+------------------+------------------------------+ |
| 40 | +# |2| "document" | VarChar | | "raw text document" | |
| 41 | +# +-+------------+------------+------------------+------------------------------+ |
| 42 | +# |
| 43 | +# Function 'bm25' is used to convert raw text document to a sparse vector representation |
| 44 | +# and store it in the 'sparse' field. |
| 45 | +# +-+------------+-------------------+-----------+------------------------------+ |
| 46 | +# | | field name | field type | other attr| field description | |
| 47 | +# +-+------------+-------------------+-----------+------------------------------+ |
| 48 | +# |3| "sparse" |SPARSE_FLOAT_VECTOR| | | |
| 49 | +# +-+------------+-------------------+-----------+------------------------------+ |
| 50 | +# |
| 51 | +fields = [ |
| 52 | + FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True), |
| 53 | + FieldSchema(name="sparse", dtype=DataType.SPARSE_FLOAT_VECTOR), |
| 54 | + FieldSchema(name="document", dtype=DataType.VARCHAR, max_length=1000, enable_tokenizer=True), |
| 55 | +] |
| 56 | + |
| 57 | +bm25_function = Function( |
| 58 | + name="bm25", |
| 59 | + function_type=FunctionType.BM25, |
| 60 | + input_field_names=["document"], |
| 61 | + output_field_names="sparse", |
| 62 | +) |
| 63 | + |
| 64 | +schema = CollectionSchema(fields, "hello_bm25 demo") |
| 65 | +schema.add_function(bm25_function) |
| 66 | + |
| 67 | +print(fmt.format("Create collection `hello_bm25`")) |
| 68 | +hello_bm25 = Collection("hello_bm25", schema, consistency_level="Strong") |
| 69 | + |
| 70 | +################################################################################ |
| 71 | +# 3. insert data |
| 72 | +# We are going to insert 3 rows of data into `hello_bm25` |
| 73 | +# Data to be inserted must be organized in fields. |
| 74 | +# |
| 75 | +# The insert() method returns: |
| 76 | +# - either automatically generated primary keys by Milvus if auto_id=True in the schema; |
| 77 | +# - or the existing primary key field from the entities if auto_id=False in the schema. |
| 78 | + |
| 79 | +print(fmt.format("Start inserting entities")) |
| 80 | + |
| 81 | +num_entities = 6 |
| 82 | + |
| 83 | +entities = [ |
| 84 | + [f"This is a test document {i + hello_bm25.num_entities}" for i in range(num_entities)], |
| 85 | +] |
| 86 | + |
| 87 | +insert_result = hello_bm25.insert(entities) |
| 88 | +ids = insert_result.primary_keys |
| 89 | + |
| 90 | +time.sleep(3) |
| 91 | + |
| 92 | +hello_bm25.flush() |
| 93 | +print(f"Number of entities in Milvus: {hello_bm25.num_entities}") # check the num_entities |
| 94 | + |
| 95 | +################################################################################ |
| 96 | +# 4. create index |
| 97 | +# We are going to create an index for hello_bm25 collection, here we simply |
| 98 | +# uses AUTOINDEX so Milvus can use the default parameters. |
| 99 | +print(fmt.format("Start Creating index AUTOINDEX")) |
| 100 | +index = { |
| 101 | + "index_type": "AUTOINDEX", |
| 102 | + "metric_type": "BM25", |
| 103 | +} |
| 104 | + |
| 105 | +hello_bm25.create_index("sparse", index) |
| 106 | + |
| 107 | +################################################################################ |
| 108 | +# 5. search, query, and scalar filtering search |
| 109 | +# After data were inserted into Milvus and indexed, you can perform: |
| 110 | +# - search texts relevance by BM25 using sparse vector ANN search |
| 111 | +# - query based on scalar filtering(boolean, int, etc.) |
| 112 | +# - scalar filtering search. |
| 113 | +# |
| 114 | + |
| 115 | +# Before conducting a search or a query, you need to load the data in `hello_bm25` into memory. |
| 116 | +print(fmt.format("Start loading")) |
| 117 | +hello_bm25.load() |
| 118 | + |
| 119 | +# ----------------------------------------------------------------------------- |
| 120 | +print(fmt.format("Start searching based on BM25 texts relevance using sparse vector ANN search")) |
| 121 | +texts_to_search = entities[-1][-2:] |
| 122 | +print(fmt.format(f"texts_to_search: {texts_to_search}")) |
| 123 | +search_params = { |
| 124 | + "metric_type": "BM25", |
| 125 | + "params": {}, |
| 126 | +} |
| 127 | + |
| 128 | +start_time = time.time() |
| 129 | +result = hello_bm25.search(texts_to_search, "sparse", search_params, limit=3, output_fields=["document"], consistency_level="Strong") |
| 130 | +end_time = time.time() |
| 131 | + |
| 132 | +for hits, text in zip(result, texts_to_search): |
| 133 | + print(f"result of text: {text}") |
| 134 | + for hit in hits: |
| 135 | + print(f"\thit: {hit}, document field: {hit.entity.get('document')}") |
| 136 | +print(search_latency_fmt.format(end_time - start_time)) |
| 137 | + |
| 138 | +# ----------------------------------------------------------------------------- |
| 139 | +# query based on scalar filtering(boolean, int, etc.) |
| 140 | +filter_id = ids[num_entities // 2 - 1] |
| 141 | +print(fmt.format(f"Start querying with `id > {filter_id}`")) |
| 142 | + |
| 143 | +start_time = time.time() |
| 144 | +result = hello_bm25.query(expr=f"id > {filter_id}", output_fields=["document"]) |
| 145 | +end_time = time.time() |
| 146 | + |
| 147 | +print(f"query result:\n-{result[0]}") |
| 148 | +print(search_latency_fmt.format(end_time - start_time)) |
| 149 | + |
| 150 | +# ----------------------------------------------------------------------------- |
| 151 | +# pagination |
| 152 | +r1 = hello_bm25.query(expr=f"id > {filter_id}", limit=3, output_fields=["document"]) |
| 153 | +r2 = hello_bm25.query(expr=f"id > {filter_id}", offset=1, limit=2, output_fields=["document"]) |
| 154 | +print(f"query pagination(limit=3):\n\t{r1}") |
| 155 | +print(f"query pagination(offset=1, limit=2):\n\t{r2}") |
| 156 | + |
| 157 | + |
| 158 | +# ----------------------------------------------------------------------------- |
| 159 | +# scalar filtering search |
| 160 | +print(fmt.format(f"Start filtered searching with `id > {filter_id}`")) |
| 161 | + |
| 162 | +start_time = time.time() |
| 163 | +result = hello_bm25.search(texts_to_search, "sparse", search_params, limit=3, expr=f"id > {filter_id}", output_fields=["document"]) |
| 164 | +end_time = time.time() |
| 165 | + |
| 166 | +for hits, text in zip(result, texts_to_search): |
| 167 | + print(f"result of text: {text}") |
| 168 | + for hit in hits: |
| 169 | + print(f"\thit: {hit}, document field: {hit.entity.get('document')}") |
| 170 | +print(search_latency_fmt.format(end_time - start_time)) |
| 171 | + |
| 172 | +############################################################################### |
| 173 | +# 6. delete entities by PK |
| 174 | +# You can delete entities by their PK values using boolean expressions. |
| 175 | + |
| 176 | +expr = f'id in [{ids[0]}, {ids[1]}]' |
| 177 | +print(fmt.format(f"Start deleting with expr `{expr}`")) |
| 178 | + |
| 179 | +result = hello_bm25.query(expr=expr, output_fields=["document"]) |
| 180 | +print(f"query before delete by expr=`{expr}` -> result: \n- {result[0]}\n- {result[1]}\n") |
| 181 | + |
| 182 | +hello_bm25.delete(expr) |
| 183 | + |
| 184 | +result = hello_bm25.query(expr=expr, output_fields=["document"]) |
| 185 | +print(f"query after delete by expr=`{expr}` -> result: {result}\n") |
| 186 | + |
| 187 | +############################################################################### |
| 188 | +# 7. upsert by PK |
| 189 | +# You can upsert data to replace existing data. |
| 190 | +target_id = ids[2] |
| 191 | +print(fmt.format(f"Start upsert operation for id {target_id}")) |
| 192 | + |
| 193 | +# Query before upsert |
| 194 | +result_before = hello_bm25.query(expr=f"id == {target_id}", output_fields=["id", "document"]) |
| 195 | +print(f"Query before upsert (id={target_id}):\n{result_before}") |
| 196 | + |
| 197 | +# Prepare data for upsert |
| 198 | +upsert_data = [ |
| 199 | + [target_id], |
| 200 | + ["This is an upserted document for testing purposes."] |
| 201 | +] |
| 202 | + |
| 203 | +# Perform upsert operation |
| 204 | +hello_bm25.upsert(upsert_data) |
| 205 | + |
| 206 | +# Query after upsert |
| 207 | +result_after = hello_bm25.query(expr=f"id == {target_id}", output_fields=["id", "document"]) |
| 208 | +print(f"Query after upsert (id={target_id}):\n{result_after}") |
| 209 | + |
| 210 | + |
| 211 | +############################################################################### |
| 212 | +# 7. drop collection |
| 213 | +# Finally, drop the hello_bm25 collection |
| 214 | +print(fmt.format("Drop collection `hello_bm25`")) |
| 215 | +utility.drop_collection("hello_bm25") |
0 commit comments