-
Notifications
You must be signed in to change notification settings - Fork 1.3k
CSHARP-5717: Typed builders for Atlas indexes #1769
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
/* Copyright 2010-present MongoDB Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Linq.Expressions; | ||
using MongoDB.Bson; | ||
using MongoDB.Bson.Serialization; | ||
|
||
namespace MongoDB.Driver | ||
{ | ||
/// <summary> | ||
/// Defines an Atlas vector search index model using strongly-typed C# APIs. | ||
/// </summary> | ||
public class CreateAtlasVectorIndexModel<TDocument> : CreateSearchIndexModel | ||
{ | ||
private readonly RenderArgs<TDocument> _renderArgs | ||
= new(BsonSerializer.LookupSerializer<TDocument>(), BsonSerializer.SerializerRegistry); | ||
|
||
/// <summary>
/// Initializes a new instance of the <see cref="CreateAtlasVectorIndexModel{TDocument}"/> class from the
/// vector field, the index name, the <see cref="VectorSimilarity"/> and the number of vector dimensions.
/// </summary>
/// <param name="field">The field containing the vectors to index.</param>
/// <param name="name">The index name.</param>
/// <param name="similarity">The <see cref="VectorSimilarity"/> to use to search for top K-nearest neighbors.</param>
/// <param name="dimensions">Number of vector dimensions that Atlas Vector Search enforces at index-time and query-time.</param>
/// <param name="filterFields">Fields that may be used as filters in the vector query. May be null or empty.</param>
public CreateAtlasVectorIndexModel(
    FieldDefinition<TDocument> field,
    string name,
    VectorSimilarity similarity,
    int dimensions,
    params FieldDefinition<TDocument>[] filterFields)
    : base(name, SearchIndexType.VectorSearch)
{
    // A null filter array is normalized to an empty list so FilterFields is never null;
    // a non-null array is copied so later changes to the caller's array are not observed.
    var filters = filterFields == null
        ? new List<FieldDefinition<TDocument>>()
        : filterFields.ToList();

    Field = field;
    Similarity = similarity;
    Dimensions = dimensions;
    FilterFields = filters;
}
|
||
/// <summary>
/// Initializes a new instance of the <see cref="CreateAtlasVectorIndexModel{TDocument}"/> class, identifying the
/// vector field and the optional filter fields with member expressions instead of field definitions.
/// </summary>
/// <param name="field">An expression pointing to the field containing the vectors to index.</param>
/// <param name="name">The index name.</param>
/// <param name="similarity">The <see cref="VectorSimilarity"/> to use to search for top K-nearest neighbors.</param>
/// <param name="dimensions">Number of vector dimensions that Atlas Vector Search enforces at index-time and query-time.</param>
/// <param name="filterFields">Expressions pointing to fields that may be used as filters in the vector query. May be null or empty.</param>
public CreateAtlasVectorIndexModel(
    Expression<Func<TDocument, object>> field,
    string name,
    VectorSimilarity similarity,
    int dimensions,
    params Expression<Func<TDocument, object>>[] filterFields)
    : this(
        new ExpressionFieldDefinition<TDocument>(field),
        name,
        similarity,
        dimensions,
        // Each expression is wrapped in a field definition; a null array is forwarded as null.
        filterFields?.Select(f => (FieldDefinition<TDocument>)new ExpressionFieldDefinition<TDocument>(f)).ToArray())
{
    // Intentionally empty: the chained constructor already assigns Field, Similarity,
    // Dimensions and FilterFields, so nothing needs to be set again here.
}
|
||
/// <summary> | ||
/// The field containing the vectors to index. | ||
/// </summary> | ||
public FieldDefinition<TDocument> Field { get; } | ||
|
||
/// <summary> | ||
/// The <see cref="VectorSimilarity"/> to use to search for top K-nearest neighbors. | ||
/// </summary> | ||
public VectorSimilarity Similarity { get; } | ||
|
||
/// <summary> | ||
/// Number of vector dimensions that Atlas Vector Search enforces at index-time and query-time. | ||
/// </summary> | ||
public int Dimensions { get; } | ||
|
||
/// <summary> | ||
/// Fields that may be used as filters in the vector query. | ||
/// </summary> | ||
public IReadOnlyList<FieldDefinition<TDocument>> FilterFields { get; } | ||
|
||
/// <summary> | ||
/// Type of automatic vector quantization for your vectors. | ||
/// </summary> | ||
public VectorQuantization? Quantization { get; init; } | ||
|
||
/// <summary> | ||
/// Maximum number of edges (or connections) that a node can have in the Hierarchical Navigable Small Worlds graph. | ||
/// </summary> | ||
public int? HnswMaxEdges { get; init; } | ||
|
||
/// <summary> | ||
/// Analogous to numCandidates at query-time, this parameter controls the maximum number of nodes to evaluate to find the closest neighbors to connect to a new node. | ||
/// </summary> | ||
public int? HnswNumEdgeCandidates { get; init; } | ||
|
||
// /// <summary>Paths to properties that may be used as filters on the entity type or its nested types.</summary> | ||
// public IReadOnlyList<string> FilterPaths { get; init; } | ||
|
||
/// <inheritdoc/>
public override SearchIndexType? Type
{
    // This model always reports the vector search index type.
    get { return SearchIndexType.VectorSearch; }
}
|
||
/// <inheritdoc/> | ||
public override BsonDocument Definition | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a bit odd to see a property getter doing so much work. Usually properties simply return an existing value. But... making this a method would be a breaking change. |
||
{ | ||
get | ||
{ | ||
if (base.Definition != null) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't really happen right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure what you mean. The idea here is that the type is immutable, and we don't want to re-create the document every time the property is accessed--as you mentioned elsewhere, ideally this should not be a property. So, the first time this is called, the definition stored in the base is null. It is then built and cached so it doesn't have to be built again if the property is accessed again. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can't cache the result of I think you should throw an exception if the |
||
{ | ||
return base.Definition; | ||
} | ||
|
||
var similarityValue = Similarity == VectorSimilarity.DotProduct | ||
? "dotProduct" // Because neither "DotProduct" or "dotproduct" are allowed. | ||
: Similarity.ToString().ToLowerInvariant(); | ||
|
||
var vectorField = new BsonDocument | ||
{ | ||
{ "type", BsonString.Create("vector") }, | ||
{ "path", Field.Render(_renderArgs).FieldName }, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not really possible to use a cached This implies that this functionality should really be in a How to do that given that this is a property is a conundrum. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would someone have done this correctly (that is, create the document with the correct serializer) before my changes? What happens if they do it wrong? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rstam I've been pondering this some more. Given that we are building metadata for the server here, rather than user documents, shouldn't we be using a standard form of serialization always? In other words, what is the scenario where this metadata should be built with custom serializers? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't usually use serializers to build commands, just for data. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @rstam Since we are not building user data here, but rather just metadata, then should we not use serializers here? I'm not sure what you mean by "commands" above? If we shouldn't be using serializers, then how to we go from a FieldDefinition to path in the metadata without using a serializer? Or, to ask the same thing another way, why was this originally written to use serializers to create metadata? How would use of those serializers make the metadata generated different? There was a problem hiding this comment. 
Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A "command" is a BSON document we send to the server telling the server to do something. In this case the command is "createSearchIndexes", though in THIS file we are just creating a small part of that command, i.e. the "indexes" field of the "createSearchIndexes" command. I didn't mean to say that we shouldn't use serializers AT ALL when creating a command, just that we don't use serializers to create the bulk of the command. Of course we have to consult the corresponding serializer when a user uses an And when creating a command that includes user defined POCOs as data to the command we use serializers to convert the POCOs to BSON inside the command, but that's not the case for this command. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't understand this question. What part of this was originally written to use serializers? |
||
{ "numDimensions", BsonInt32.Create(Dimensions) }, | ||
{ "similarity", BsonString.Create(similarityValue) }, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to call There are implicit conversions to simplify code like this. |
||
}; | ||
|
||
if (Quantization.HasValue) | ||
{ | ||
vectorField.Add("quantization", BsonString.Create(Quantization.ToString()?.ToLower())); | ||
} | ||
|
||
if (HnswMaxEdges != null || HnswNumEdgeCandidates != null) | ||
{ | ||
var hnswDocument = new BsonDocument | ||
{ | ||
{ "maxEdges", BsonInt32.Create(HnswMaxEdges ?? 16) }, | ||
{ "numEdgeCandidates", BsonInt32.Create(HnswNumEdgeCandidates ?? 100) } | ||
}; | ||
vectorField.Add("hnswOptions", hnswDocument); | ||
} | ||
|
||
var fieldDocuments = new List<BsonDocument> { vectorField }; | ||
|
||
if (FilterFields != null) | ||
{ | ||
foreach (var filterPath in FilterFields) | ||
{ | ||
var fieldDocument = new BsonDocument | ||
{ | ||
{ "type", BsonString.Create("filter") }, | ||
{ "path", BsonString.Create(filterPath.Render(_renderArgs).FieldName) } | ||
}; | ||
|
||
fieldDocuments.Add(fieldDocument); | ||
} | ||
} | ||
|
||
base.Definition = new BsonDocument { { "fields", BsonArray.Create(fieldDocuments) } }; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would have made the |
||
|
||
return base.Definition; | ||
} | ||
} | ||
} | ||
|
||
/// <summary>
/// Defines an Atlas vector search index model for documents represented as <see cref="BsonDocument"/>.
/// </summary>
public class CreateAtlasVectorIndexModel : CreateAtlasVectorIndexModel<BsonDocument>
{
    /// <summary>
    /// Initializes a new instance of the <see cref="CreateAtlasVectorIndexModel"/> class from the vector field,
    /// the index name, the <see cref="VectorSimilarity"/> and the number of vector dimensions.
    /// </summary>
    /// <param name="field">The field containing the vectors to index.</param>
    /// <param name="name">The index name.</param>
    /// <param name="similarity">The <see cref="VectorSimilarity"/> to use to search for top K-nearest neighbors.</param>
    /// <param name="dimensions">Number of vector dimensions that Atlas Vector Search enforces at index-time and query-time.</param>
    /// <param name="filterFields">Fields that may be used as filters in the vector query.</param>
    public CreateAtlasVectorIndexModel(
        FieldDefinition<BsonDocument> field,
        string name,
        VectorSimilarity similarity,
        int dimensions,
        params FieldDefinition<BsonDocument>[] filterFields)
        : base(
            field,
            name,
            similarity,
            dimensions,
            filterFields)
    {
        // All state is held by the generic base class; nothing else to initialize.
    }
}
|
||
|
||
/// <summary> | ||
/// TODO: Placeholder for a strongly-typed Atlas Search index model; no members are defined yet. | ||
/// </summary> | ||
public class CreateAtlasSearchIndexModel | ||
{ | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
/* Copyright 2010-present MongoDB Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
namespace MongoDB.Driver | ||
rstam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
{ | ||
/// <summary>
/// Type of automatic vector quantization for your vectors. Use this setting only if your embeddings are float
/// or double vectors. See <see href="https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-quantization/">
/// Vector Quantization</see> for more information.
/// </summary>
public enum VectorQuantization
{
    /// <summary>
    /// No automatic quantization is applied to the vector embeddings. Use this setting if you have
    /// pre-quantized vectors for ingestion. If omitted, this is the default value.
    /// </summary>
    None = 0,

    /// <summary>
    /// Scalar quantization, which transforms values to 1 byte integers.
    /// </summary>
    Scalar = 1,

    /// <summary>
    /// Binary quantization, which transforms values to a single bit.
    /// To use this value, numDimensions must be a multiple of 8.
    /// If precision is critical, select <see cref="None"/> or <see cref="Scalar"/> instead of <see cref="Binary"/>.
    /// </summary>
    Binary = 2,
}
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* Copyright 2010-present MongoDB Inc. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
namespace MongoDB.Driver | ||
{ | ||
/// <summary>
/// Vector similarity function to use to search for top K-nearest neighbors.
/// See <see href="https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-type/">How to Index Fields for
/// Vector Search</see> for more information.
/// </summary>
public enum VectorSimilarity
{
    /// <summary>
    /// Measures the distance between ends of vectors.
    /// </summary>
    Euclidean = 0,

    /// <summary>
    /// Measures similarity based on the angle between vectors.
    /// </summary>
    Cosine = 1,

    /// <summary>
    /// Measures similarity like cosine, but takes into account the magnitude of the vector.
    /// </summary>
    DotProduct = 2,
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `documentSerializer` should come from the `collection.DocumentSerializer` and not from the registry.
That's why we have `Render` methods with `RenderArgs`, so that the document serializer can be passed in later when it is known.