@@ -2,18 +2,21 @@ use std::fmt;
2
2
use std:: marker:: PhantomData ;
3
3
use std:: sync:: Arc ;
4
4
5
- use columnar:: ColumnValues ;
5
+ use columnar:: { ColumnValues , StrColumn } ;
6
6
use serde:: { Deserialize , Serialize } ;
7
7
8
8
use super :: Collector ;
9
- use crate :: collector:: custom_score_top_collector:: CustomScoreTopCollector ;
9
+ use crate :: collector:: custom_score_top_collector:: {
10
+ CustomScoreTopCollector , CustomScoreTopSegmentCollector ,
11
+ } ;
10
12
use crate :: collector:: top_collector:: { ComparableDoc , TopCollector , TopSegmentCollector } ;
11
13
use crate :: collector:: tweak_score_top_collector:: TweakedScoreTopCollector ;
12
14
use crate :: collector:: {
13
15
CustomScorer , CustomSegmentScorer , ScoreSegmentTweaker , ScoreTweaker , SegmentCollector ,
14
16
} ;
15
17
use crate :: fastfield:: { FastFieldNotAvailableError , FastValue } ;
16
18
use crate :: query:: Weight ;
19
+ use crate :: termdict:: TermOrdinal ;
17
20
use crate :: { DocAddress , DocId , Order , Score , SegmentOrdinal , SegmentReader , TantivyError } ;
18
21
19
22
struct FastFieldConvertCollector <
@@ -83,6 +86,163 @@ where
83
86
}
84
87
}
85
88
89
+ struct StringConvertCollector {
90
+ pub collector : CustomScoreTopCollector < ScorerByField , u64 > ,
91
+ pub field : String ,
92
+ order : Order ,
93
+ limit : usize ,
94
+ offset : usize ,
95
+ }
96
+
97
+ impl Collector for StringConvertCollector {
98
+ type Fruit = Vec < ( String , DocAddress ) > ;
99
+
100
+ type Child = StringConvertSegmentCollector ;
101
+
102
+ fn for_segment (
103
+ & self ,
104
+ segment_local_id : crate :: SegmentOrdinal ,
105
+ segment : & SegmentReader ,
106
+ ) -> crate :: Result < Self :: Child > {
107
+ let schema = segment. schema ( ) ;
108
+ let field = schema. get_field ( & self . field ) ?;
109
+ let field_entry = schema. get_field_entry ( field) ;
110
+ if !field_entry. is_fast ( ) {
111
+ return Err ( TantivyError :: SchemaError ( format ! (
112
+ "Field {:?} is not a fast field." ,
113
+ field_entry. name( )
114
+ ) ) ) ;
115
+ }
116
+ let requested_type = crate :: schema:: Type :: Str ;
117
+ let schema_type = field_entry. field_type ( ) . value_type ( ) ;
118
+ if schema_type != requested_type {
119
+ return Err ( TantivyError :: SchemaError ( format ! (
120
+ "Field {:?} is of type {schema_type:?}!={requested_type:?}" ,
121
+ field_entry. name( )
122
+ ) ) ) ;
123
+ }
124
+ let ff = segment
125
+ . fast_fields ( )
126
+ . str ( & self . field ) ?
127
+ . expect ( "ff should be a str field" ) ;
128
+ Ok ( StringConvertSegmentCollector {
129
+ collector : self . collector . for_segment ( segment_local_id, segment) ?,
130
+ ff,
131
+ order : self . order . clone ( ) ,
132
+ } )
133
+ }
134
+
135
+ fn requires_scoring ( & self ) -> bool {
136
+ self . collector . requires_scoring ( )
137
+ }
138
+
139
+ fn merge_fruits (
140
+ & self ,
141
+ child_fruits : Vec < <Self :: Child as SegmentCollector >:: Fruit > ,
142
+ ) -> crate :: Result < Self :: Fruit > {
143
+ if self . limit == 0 {
144
+ return Ok ( Vec :: new ( ) ) ;
145
+ }
146
+ if self . order . is_desc ( ) {
147
+ let mut top_collector: TopNComputer < _ , _ , true > =
148
+ TopNComputer :: new ( self . limit + self . offset ) ;
149
+ for child_fruit in child_fruits {
150
+ for ( feature, doc) in child_fruit {
151
+ top_collector. push ( feature, doc) ;
152
+ }
153
+ }
154
+ Ok ( top_collector
155
+ . into_sorted_vec ( )
156
+ . into_iter ( )
157
+ . skip ( self . offset )
158
+ . map ( |cdoc| ( cdoc. feature , cdoc. doc ) )
159
+ . collect ( ) )
160
+ } else {
161
+ let mut top_collector: TopNComputer < _ , _ , false > =
162
+ TopNComputer :: new ( self . limit + self . offset ) ;
163
+ for child_fruit in child_fruits {
164
+ for ( feature, doc) in child_fruit {
165
+ top_collector. push ( feature, doc) ;
166
+ }
167
+ }
168
+
169
+ Ok ( top_collector
170
+ . into_sorted_vec ( )
171
+ . into_iter ( )
172
+ . skip ( self . offset )
173
+ . map ( |cdoc| ( cdoc. feature , cdoc. doc ) )
174
+ . collect ( ) )
175
+ }
176
+ }
177
+ }
178
+
179
+ struct StringConvertSegmentCollector {
180
+ pub collector : CustomScoreTopSegmentCollector < ScorerByFastFieldReader , u64 > ,
181
+ ff : StrColumn ,
182
+ order : Order ,
183
+ }
184
+
185
+ impl SegmentCollector for StringConvertSegmentCollector {
186
+ type Fruit = Vec < ( String , DocAddress ) > ;
187
+
188
+ fn collect ( & mut self , doc : DocId , score : Score ) {
189
+ self . collector . collect ( doc, score) ;
190
+ }
191
+
192
+ fn harvest ( self ) -> Vec < ( String , DocAddress ) > {
193
+ let top_ordinals: Vec < ( TermOrdinal , DocAddress ) > = self . collector . harvest ( ) ;
194
+
195
+ // Collect terms.
196
+ let mut terms: Vec < String > = Vec :: with_capacity ( top_ordinals. len ( ) ) ;
197
+ let result = if self . order . is_asc ( ) {
198
+ self . ff . dictionary ( ) . sorted_ords_to_term_cb (
199
+ top_ordinals. iter ( ) . map ( |( term_ord, _) | u64:: MAX - term_ord) ,
200
+ |term| {
201
+ terms. push (
202
+ std:: str:: from_utf8 ( term)
203
+ . expect ( "Failed to decode term as unicode" )
204
+ . to_owned ( ) ,
205
+ ) ;
206
+ Ok ( ( ) )
207
+ } ,
208
+ )
209
+ } else {
210
+ self . ff . dictionary ( ) . sorted_ords_to_term_cb (
211
+ top_ordinals. iter ( ) . rev ( ) . map ( |( term_ord, _) | * term_ord) ,
212
+ |term| {
213
+ terms. push (
214
+ std:: str:: from_utf8 ( term)
215
+ . expect ( "Failed to decode term as unicode" )
216
+ . to_owned ( ) ,
217
+ ) ;
218
+ Ok ( ( ) )
219
+ } ,
220
+ )
221
+ } ;
222
+
223
+ assert ! (
224
+ result. expect( "Failed to read terms from term dictionary" ) ,
225
+ "Not all terms were matched in segment."
226
+ ) ;
227
+
228
+ // Zip them back with their docs.
229
+ if self . order . is_asc ( ) {
230
+ terms
231
+ . into_iter ( )
232
+ . zip ( top_ordinals)
233
+ . map ( |( term, ( _, doc) ) | ( term, doc) )
234
+ . collect ( )
235
+ } else {
236
+ terms
237
+ . into_iter ( )
238
+ . rev ( )
239
+ . zip ( top_ordinals)
240
+ . map ( |( term, ( _, doc) ) | ( term, doc) )
241
+ . collect ( )
242
+ }
243
+ }
244
+ }
245
+
86
246
/// The `TopDocs` collector keeps track of the top `K` documents
87
247
/// sorted by their score.
88
248
///
@@ -410,6 +570,30 @@ impl TopDocs {
410
570
}
411
571
}
412
572
573
+ /// Like `order_by_fast_field`, but for a `String` fast field.
574
+ pub fn order_by_string_fast_field (
575
+ self ,
576
+ fast_field : impl ToString ,
577
+ order : Order ,
578
+ ) -> impl Collector < Fruit = Vec < ( String , DocAddress ) > > {
579
+ let limit = self . 0 . limit ;
580
+ let offset = self . 0 . offset ;
581
+ let u64_collector = CustomScoreTopCollector :: new (
582
+ ScorerByField {
583
+ field : fast_field. to_string ( ) ,
584
+ order : order. clone ( ) ,
585
+ } ,
586
+ self . 0 . into_tscore ( ) ,
587
+ ) ;
588
+ StringConvertCollector {
589
+ collector : u64_collector,
590
+ field : fast_field. to_string ( ) ,
591
+ order,
592
+ limit,
593
+ offset,
594
+ }
595
+ }
596
+
413
597
/// Ranks the documents using a custom score.
414
598
///
415
599
/// This method offers a convenient way to tweak or replace
@@ -1214,6 +1398,94 @@ mod tests {
1214
1398
Ok ( ( ) )
1215
1399
}
1216
1400
1401
/// Checks `order_by_string_fast_field` on a three-document, single-segment
/// index for both sort directions and several limit/offset combinations.
#[test]
fn test_top_field_collector_string() -> crate::Result<()> {
    let mut schema_builder = Schema::builder();
    let city = schema_builder.add_text_field("city", TEXT | FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_for_tests()?;
    for name in ["austin", "greenville", "tokyo"] {
        index_writer.add_document(doc!(
            city => name,
        ))?;
    }
    index_writer.commit()?;

    // Runs an all-documents search sorted on the "city" fast field.
    fn run_query(
        index: &Index,
        order: Order,
        limit: usize,
        offset: usize,
    ) -> crate::Result<Vec<(String, DocAddress)>> {
        let collector = TopDocs::with_limit(limit)
            .and_offset(offset)
            .order_by_string_fast_field("city", order);
        let searcher = index.reader()?.searcher();
        searcher.search(&AllQuery, &collector)
    }

    // Expected hit: documents were added in order, all in segment 0.
    let hit = |name: &str, doc_id| (name.to_owned(), DocAddress::new(0, doc_id));

    let cases: Vec<(&str, Order, usize, usize, Vec<(String, DocAddress)>)> = vec![
        (
            "desc limit=3 offset=0",
            Order::Desc,
            3,
            0,
            vec![hit("tokyo", 2), hit("greenville", 1), hit("austin", 0)],
        ),
        (
            "desc limit=2 offset=0",
            Order::Desc,
            2,
            0,
            vec![hit("tokyo", 2), hit("greenville", 1)],
        ),
        ("desc limit=3 offset=3", Order::Desc, 3, 3, vec![]),
        (
            "desc limit=2 offset=1",
            Order::Desc,
            2,
            1,
            vec![hit("greenville", 1), hit("austin", 0)],
        ),
        (
            "asc limit=3 offset=0",
            Order::Asc,
            3,
            0,
            vec![hit("austin", 0), hit("greenville", 1), hit("tokyo", 2)],
        ),
        (
            "asc limit=2 offset=1",
            Order::Asc,
            2,
            1,
            vec![hit("greenville", 1), hit("tokyo", 2)],
        ),
        (
            "asc limit=2 offset=0",
            Order::Asc,
            2,
            0,
            vec![hit("austin", 0), hit("greenville", 1)],
        ),
        ("asc limit=3 offset=3", Order::Asc, 3, 3, vec![]),
    ];

    for (label, order, limit, offset, expected) in cases {
        let actual = run_query(&index, order, limit, offset)?;
        assert_eq!(actual, expected, "case: {}", label);
    }

    Ok(())
}
1488
+
1217
1489
#[ test]
1218
1490
#[ should_panic]
1219
1491
fn test_field_does_not_exist ( ) {
0 commit comments