Skip to content

Commit 4e1c033

Browse files
authored
Query: Adds hybrid search query pipeline stage (#4794)
## Description Adds hybrid search query pipeline stage. This requires the new Direct package and gateway to be available in order to light up. Given an input SQL such as: ```sql SELECT TOP 100 c.text, c.abstract FROM c ORDER BY RANK RRF(FullTextScore(c.text, ['swim', 'run']), FullTextScore(c.abstract, ['energy'])) ``` The new query plan (encoded below as XML instead of JSON to help readability) is as follows: ``` <queryRanges> <Item>{"min":[],"max":"Infinity","isMinInclusive":true,"isMaxInclusive":false}</Item> </queryRanges> <hybridSearchQueryInfo> <globalStatisticsQuery><![CDATA[ SELECT COUNT(1) AS documentCount, [ { totalWordCount: SUM(_FullTextWordCount(c.text)), hitCounts: [ COUNTIF(FullTextContains(c.text, "swim")), COUNTIF(FullTextContains(c.text, "run")) ] }, { totalWordCount: SUM(_FullTextWordCount(c.abstract)), hitCounts: [ COUNTIF(FullTextContains(c.abstract, "energy")) ] } ] AS fullTextStatistics FROM c ]]></globalStatisticsQuery> <componentQueryInfos> <Item> <distinctType>None</distinctType> <top>200</top> <orderBy> <Item>Descending</Item> </orderBy> <orderByExpressions> <Item>_FullTextScore(c.text, ["swim", "run"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-0}, {documentdb-formattablehybridsearchquery-hitcountsarray-0})</Item> </orderByExpressions> <hasSelectValue>false</hasSelectValue> <rewrittenQuery><![CDATA[ SELECT TOP 200 c._rid, [ { item: _FullTextScore(c.text, ["swim", "run"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-0}, {documentdb-formattablehybridsearchquery-hitcountsarray-0}) } ] AS orderByItems, { payload: { text: c.text, abstract: c.abstract }, componentScores: [ _FullTextScore(c.text, ["swim", "run"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-0}, {documentdb-formattablehybridsearchquery-hitcountsarray-0}), 
_FullTextScore(c.abstract, ["energy"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-1}, {documentdb-formattablehybridsearchquery-hitcountsarray-1}) ] } AS payload FROM c WHERE {documentdb-formattableorderbyquery-filter} ORDER BY _FullTextScore(c.text, ["swim", "run"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-0}, {documentdb-formattablehybridsearchquery-hitcountsarray-0}) DESC ]]></rewrittenQuery> <hasNonStreamingOrderBy>true</hasNonStreamingOrderBy> </Item> <Item> <distinctType>None</distinctType> <top>200</top> <orderBy> <Item>Descending</Item> </orderBy> <orderByExpressions> <Item>_FullTextScore(c.abstract, ["energy"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-1}, {documentdb-formattablehybridsearchquery-hitcountsarray-1})</Item> </orderByExpressions> <hasSelectValue>false</hasSelectValue> <rewrittenQuery><![CDATA[ SELECT TOP 200 c._rid, [ { item: _FullTextScore(c.abstract, ["energy"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-1}, {documentdb-formattablehybridsearchquery-hitcountsarray-1}) } ] AS orderByItems, { payload: { text: c.text, abstract: c.abstract }, componentScores: [ _FullTextScore(c.text, ["swim", "run"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-0}, {documentdb-formattablehybridsearchquery-hitcountsarray-0}), _FullTextScore(c.abstract, ["energy"], {documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-1}, {documentdb-formattablehybridsearchquery-hitcountsarray-1}) ] } AS payload FROM c WHERE {documentdb-formattableorderbyquery-filter} ORDER BY _FullTextScore(c.abstract, ["energy"], 
{documentdb-formattablehybridsearchquery-totaldocumentcount}, {documentdb-formattablehybridsearchquery-totalwordcount-1}, {documentdb-formattablehybridsearchquery-hitcountsarray-1}) DESC ]]></rewrittenQuery> <hasNonStreamingOrderBy>true</hasNonStreamingOrderBy> </Item> </componentQueryInfos> <take>100</take> <requiresGlobalStatistics>true</requiresGlobalStatistics> </hybridSearchQueryInfo> ``` We have a custom implementation for the global statistics inside the `HybridSearchCrossPartitionQueryPipelineStage` because it uses nested aggregates. Each of the component queries in the hybrid search query plan is cross partition, and we run them using the existing cross partition query pipelines. Note the use of placeholders such as `{documentdb-formattablehybridsearchquery-totaldocumentcount}` in the query plan. These need to be replaced by the global statistics. ## Type of change - [x] New feature (non-breaking change which adds functionality)
1 parent 57c681f commit 4e1c033

21 files changed

+1721
-170
lines changed

Microsoft.Azure.Cosmos/src/Query/Core/Pipeline/Aggregate/Aggregators/SingleGroupAggregator.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,8 @@ public static TryCatch<AggregateValue> TryCreate(
341341
tryCreateAggregator = AverageAggregator.TryCreate(continuationToken);
342342
break;
343343

344-
case AggregateOperator.Count:
344+
case AggregateOperator.Count:
345+
case AggregateOperator.CountIf:
345346
tryCreateAggregator = CountAggregator.TryCreate(continuationToken);
346347
break;
347348

Microsoft.Azure.Cosmos/src/Query/Core/Pipeline/CosmosQueryExecutionContextFactory.cs

Lines changed: 91 additions & 123 deletions
Large diffs are not rendered by default.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// ------------------------------------------------------------
2+
// Copyright (c) Microsoft Corporation. All rights reserved.
3+
// ------------------------------------------------------------
4+
5+
namespace Microsoft.Azure.Cosmos.Query.Core.Pipeline.CrossPartition.HybridSearch
6+
{
7+
using System;
8+
using System.Collections.Generic;
9+
using Microsoft.Azure.Cosmos.CosmosElements;
10+
11+
/// <summary>
/// Full text statistics for a single component of a hybrid search query:
/// a total word count plus one hit count per search term.
/// </summary>
internal sealed class FullTextStatistics
{
    private readonly long[] hitCounts;

    /// <summary>
    /// Gets the total word count reported for this component.
    /// </summary>
    public long TotalWordCount { get; }

    /// <summary>
    /// Gets a read-only view over the hit counts.
    /// </summary>
    public ReadOnlyMemory<long> HitCounts => this.hitCounts;

    /// <summary>
    /// Initializes a new instance from already-parsed values.
    /// </summary>
    /// <param name="totalWordCount">The total word count.</param>
    /// <param name="hitCounts">The hit counts; the array is stored as-is (not copied).</param>
    public FullTextStatistics(long totalWordCount, long[] hitCounts)
    {
        this.TotalWordCount = totalWordCount;
        this.hitCounts = hitCounts;
    }

    /// <summary>
    /// Initializes a new instance by reading the <c>totalWordCount</c> and <c>hitCounts</c>
    /// fields of the supplied object.
    /// </summary>
    /// <param name="cosmosObject">Object holding a numeric <c>totalWordCount</c> and a <c>hitCounts</c> array of numbers.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="cosmosObject"/> is null.</exception>
    /// <exception cref="System.ArgumentException">A required field is missing or has an unexpected type.</exception>
    public FullTextStatistics(CosmosObject cosmosObject)
    {
        if (cosmosObject == null)
        {
            // The single-string constructor of ArgumentNullException takes the parameter name, not a message.
            throw new System.ArgumentNullException(nameof(cosmosObject));
        }

        if (!cosmosObject.TryGetValue(FieldNames.TotalWordCount, out CosmosNumber totalWordCount))
        {
            throw new System.ArgumentException($"{FieldNames.TotalWordCount} must exist and be a number");
        }

        if (!cosmosObject.TryGetValue(FieldNames.HitCounts, out CosmosArray hitCountsArray))
        {
            throw new System.ArgumentException($"{FieldNames.HitCounts} must exist and be an array");
        }

        // Validate and convert every element before assigning any state.
        long[] hitCounts = new long[hitCountsArray.Count];
        for (int index = 0; index < hitCountsArray.Count; ++index)
        {
            if (!(hitCountsArray[index] is CosmosNumber cosmosNumber))
            {
                throw new System.ArgumentException($"{FieldNames.HitCounts} must be an array of numbers");
            }

            hitCounts[index] = Number64.ToLong(cosmosNumber.Value);
        }

        this.TotalWordCount = Number64.ToLong(totalWordCount.Value);
        this.hitCounts = hitCounts;
    }

    /// <summary>
    /// Names of the fields read from the statistics object.
    /// </summary>
    private static class FieldNames
    {
        public const string TotalWordCount = "totalWordCount";

        public const string HitCounts = "hitCounts";
    }
}
64+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// ------------------------------------------------------------
2+
// Copyright (c) Microsoft Corporation. All rights reserved.
3+
// ------------------------------------------------------------
4+
5+
namespace Microsoft.Azure.Cosmos.Query.Core.Pipeline.CrossPartition.HybridSearch
6+
{
7+
using System.Collections.Generic;
8+
using Microsoft.Azure.Cosmos.CosmosElements;
9+
10+
/// <summary>
/// Global statistics for a hybrid search query: the document count plus one
/// <see cref="HybridSearch.FullTextStatistics"/> entry per element of the <c>fullTextStatistics</c> array.
/// </summary>
internal sealed class GlobalFullTextSearchStatistics
{
    /// <summary>
    /// Gets the document count.
    /// </summary>
    public long DocumentCount { get; }

    /// <summary>
    /// Gets the per-component full text statistics, in the order they were supplied.
    /// </summary>
    public IReadOnlyList<FullTextStatistics> FullTextStatistics { get; }

    /// <summary>
    /// Initializes a new instance from already-parsed values.
    /// </summary>
    /// <param name="documentCount">The document count.</param>
    /// <param name="fullTextStatistics">The per-component statistics; must not be null.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="fullTextStatistics"/> is null.</exception>
    public GlobalFullTextSearchStatistics(long documentCount, IReadOnlyList<FullTextStatistics> fullTextStatistics)
    {
        this.DocumentCount = documentCount;

        // The single-string constructor of ArgumentNullException takes the parameter name, not a message.
        this.FullTextStatistics = fullTextStatistics ?? throw new System.ArgumentNullException(nameof(fullTextStatistics));
    }

    /// <summary>
    /// Initializes a new instance by reading the <c>documentCount</c> and <c>fullTextStatistics</c>
    /// fields of the supplied element.
    /// </summary>
    /// <param name="cosmosElement">An object holding a numeric <c>documentCount</c> and a <c>fullTextStatistics</c> array of objects.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="cosmosElement"/> is null.</exception>
    /// <exception cref="System.ArgumentException">The element is not an object, or a required field is missing or has an unexpected type.</exception>
    public GlobalFullTextSearchStatistics(CosmosElement cosmosElement)
    {
        if (cosmosElement == null)
        {
            throw new System.ArgumentNullException(nameof(cosmosElement));
        }

        if (!(cosmosElement is CosmosObject cosmosObject))
        {
            throw new System.ArgumentException($"{nameof(cosmosElement)} must be an object.");
        }

        if (!cosmosObject.TryGetValue(FieldNames.DocumentCount, out CosmosNumber cosmosNumber))
        {
            throw new System.ArgumentException($"{FieldNames.DocumentCount} must exist and be a number");
        }

        if (!cosmosObject.TryGetValue(FieldNames.Statistics, out CosmosArray statisticsArray))
        {
            throw new System.ArgumentException($"{FieldNames.Statistics} must exist and be an array");
        }

        List<FullTextStatistics> fullTextStatisticsList = new List<FullTextStatistics>(statisticsArray.Count);
        foreach (CosmosElement statisticsElement in statisticsArray)
        {
            // One pattern match performs both the type check and the cast.
            if (!(statisticsElement is CosmosObject statisticsObject))
            {
                throw new System.ArgumentException($"{FieldNames.Statistics} must be an array of objects");
            }

            fullTextStatisticsList.Add(new FullTextStatistics(statisticsObject));
        }

        this.DocumentCount = Number64.ToLong(cosmosNumber.Value);
        this.FullTextStatistics = fullTextStatisticsList;
    }

    /// <summary>
    /// Names of the fields read from the global statistics object.
    /// </summary>
    private static class FieldNames
    {
        public const string DocumentCount = "documentCount";

        public const string Statistics = "fullTextStatistics";
    }
}
67+
}

0 commit comments

Comments
 (0)