Skip to content

Commit a2400f4

Browse files
authored
Add string fast field support to TopDocs. (#2642)
* Add string fast field support to `TopDocs`.
* Remove unnecessary generics, and review feedback.
* Use actual/less-ambiguous cities.
* Review feedback
1 parent 436ec6c commit a2400f4

File tree

1 file changed

+274
-2
lines changed

1 file changed

+274
-2
lines changed

src/collector/top_score_collector.rs

Lines changed: 274 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,21 @@ use std::fmt;
22
use std::marker::PhantomData;
33
use std::sync::Arc;
44

5-
use columnar::ColumnValues;
5+
use columnar::{ColumnValues, StrColumn};
66
use serde::{Deserialize, Serialize};
77

88
use super::Collector;
9-
use crate::collector::custom_score_top_collector::CustomScoreTopCollector;
9+
use crate::collector::custom_score_top_collector::{
10+
CustomScoreTopCollector, CustomScoreTopSegmentCollector,
11+
};
1012
use crate::collector::top_collector::{ComparableDoc, TopCollector, TopSegmentCollector};
1113
use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
1214
use crate::collector::{
1315
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
1416
};
1517
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
1618
use crate::query::Weight;
19+
use crate::termdict::TermOrdinal;
1720
use crate::{DocAddress, DocId, Order, Score, SegmentOrdinal, SegmentReader, TantivyError};
1821

1922
struct FastFieldConvertCollector<
@@ -83,6 +86,163 @@ where
8386
}
8487
}
8588

89+
struct StringConvertCollector {
90+
pub collector: CustomScoreTopCollector<ScorerByField, u64>,
91+
pub field: String,
92+
order: Order,
93+
limit: usize,
94+
offset: usize,
95+
}
96+
97+
impl Collector for StringConvertCollector {
98+
type Fruit = Vec<(String, DocAddress)>;
99+
100+
type Child = StringConvertSegmentCollector;
101+
102+
fn for_segment(
103+
&self,
104+
segment_local_id: crate::SegmentOrdinal,
105+
segment: &SegmentReader,
106+
) -> crate::Result<Self::Child> {
107+
let schema = segment.schema();
108+
let field = schema.get_field(&self.field)?;
109+
let field_entry = schema.get_field_entry(field);
110+
if !field_entry.is_fast() {
111+
return Err(TantivyError::SchemaError(format!(
112+
"Field {:?} is not a fast field.",
113+
field_entry.name()
114+
)));
115+
}
116+
let requested_type = crate::schema::Type::Str;
117+
let schema_type = field_entry.field_type().value_type();
118+
if schema_type != requested_type {
119+
return Err(TantivyError::SchemaError(format!(
120+
"Field {:?} is of type {schema_type:?}!={requested_type:?}",
121+
field_entry.name()
122+
)));
123+
}
124+
let ff = segment
125+
.fast_fields()
126+
.str(&self.field)?
127+
.expect("ff should be a str field");
128+
Ok(StringConvertSegmentCollector {
129+
collector: self.collector.for_segment(segment_local_id, segment)?,
130+
ff,
131+
order: self.order.clone(),
132+
})
133+
}
134+
135+
fn requires_scoring(&self) -> bool {
136+
self.collector.requires_scoring()
137+
}
138+
139+
fn merge_fruits(
140+
&self,
141+
child_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
142+
) -> crate::Result<Self::Fruit> {
143+
if self.limit == 0 {
144+
return Ok(Vec::new());
145+
}
146+
if self.order.is_desc() {
147+
let mut top_collector: TopNComputer<_, _, true> =
148+
TopNComputer::new(self.limit + self.offset);
149+
for child_fruit in child_fruits {
150+
for (feature, doc) in child_fruit {
151+
top_collector.push(feature, doc);
152+
}
153+
}
154+
Ok(top_collector
155+
.into_sorted_vec()
156+
.into_iter()
157+
.skip(self.offset)
158+
.map(|cdoc| (cdoc.feature, cdoc.doc))
159+
.collect())
160+
} else {
161+
let mut top_collector: TopNComputer<_, _, false> =
162+
TopNComputer::new(self.limit + self.offset);
163+
for child_fruit in child_fruits {
164+
for (feature, doc) in child_fruit {
165+
top_collector.push(feature, doc);
166+
}
167+
}
168+
169+
Ok(top_collector
170+
.into_sorted_vec()
171+
.into_iter()
172+
.skip(self.offset)
173+
.map(|cdoc| (cdoc.feature, cdoc.doc))
174+
.collect())
175+
}
176+
}
177+
}
178+
179+
struct StringConvertSegmentCollector {
180+
pub collector: CustomScoreTopSegmentCollector<ScorerByFastFieldReader, u64>,
181+
ff: StrColumn,
182+
order: Order,
183+
}
184+
185+
impl SegmentCollector for StringConvertSegmentCollector {
186+
type Fruit = Vec<(String, DocAddress)>;
187+
188+
fn collect(&mut self, doc: DocId, score: Score) {
189+
self.collector.collect(doc, score);
190+
}
191+
192+
fn harvest(self) -> Vec<(String, DocAddress)> {
193+
let top_ordinals: Vec<(TermOrdinal, DocAddress)> = self.collector.harvest();
194+
195+
// Collect terms.
196+
let mut terms: Vec<String> = Vec::with_capacity(top_ordinals.len());
197+
let result = if self.order.is_asc() {
198+
self.ff.dictionary().sorted_ords_to_term_cb(
199+
top_ordinals.iter().map(|(term_ord, _)| u64::MAX - term_ord),
200+
|term| {
201+
terms.push(
202+
std::str::from_utf8(term)
203+
.expect("Failed to decode term as unicode")
204+
.to_owned(),
205+
);
206+
Ok(())
207+
},
208+
)
209+
} else {
210+
self.ff.dictionary().sorted_ords_to_term_cb(
211+
top_ordinals.iter().rev().map(|(term_ord, _)| *term_ord),
212+
|term| {
213+
terms.push(
214+
std::str::from_utf8(term)
215+
.expect("Failed to decode term as unicode")
216+
.to_owned(),
217+
);
218+
Ok(())
219+
},
220+
)
221+
};
222+
223+
assert!(
224+
result.expect("Failed to read terms from term dictionary"),
225+
"Not all terms were matched in segment."
226+
);
227+
228+
// Zip them back with their docs.
229+
if self.order.is_asc() {
230+
terms
231+
.into_iter()
232+
.zip(top_ordinals)
233+
.map(|(term, (_, doc))| (term, doc))
234+
.collect()
235+
} else {
236+
terms
237+
.into_iter()
238+
.rev()
239+
.zip(top_ordinals)
240+
.map(|(term, (_, doc))| (term, doc))
241+
.collect()
242+
}
243+
}
244+
}
245+
86246
/// The `TopDocs` collector keeps track of the top `K` documents
87247
/// sorted by their score.
88248
///
@@ -410,6 +570,30 @@ impl TopDocs {
410570
}
411571
}
412572

573+
/// Like `order_by_fast_field`, but for a `String` fast field.
574+
pub fn order_by_string_fast_field(
575+
self,
576+
fast_field: impl ToString,
577+
order: Order,
578+
) -> impl Collector<Fruit = Vec<(String, DocAddress)>> {
579+
let limit = self.0.limit;
580+
let offset = self.0.offset;
581+
let u64_collector = CustomScoreTopCollector::new(
582+
ScorerByField {
583+
field: fast_field.to_string(),
584+
order: order.clone(),
585+
},
586+
self.0.into_tscore(),
587+
);
588+
StringConvertCollector {
589+
collector: u64_collector,
590+
field: fast_field.to_string(),
591+
order,
592+
limit,
593+
offset,
594+
}
595+
}
596+
413597
/// Ranks the documents using a custom score.
414598
///
415599
/// This method offers a convenient way to tweak or replace
@@ -1214,6 +1398,94 @@ mod tests {
12141398
Ok(())
12151399
}
12161400

1401+
#[test]
fn test_top_field_collector_string() -> crate::Result<()> {
    /// Runs a match-all search ordered by the `city` string fast field.
    fn query(
        index: &Index,
        order: Order,
        limit: usize,
        offset: usize,
    ) -> crate::Result<Vec<(String, DocAddress)>> {
        let searcher = index.reader()?.searcher();
        let top_collector = TopDocs::with_limit(limit)
            .and_offset(offset)
            .order_by_string_fast_field("city", order);
        searcher.search(&AllQuery, &top_collector)
    }

    /// Asserts that the search returns exactly `expected`, given as
    /// `(city, doc id)` pairs located in segment 0.
    fn expect_hits(
        index: &Index,
        order: Order,
        limit: usize,
        offset: usize,
        expected: &[(&str, u32)],
    ) -> crate::Result<()> {
        let wanted: Vec<(String, DocAddress)> = expected
            .iter()
            .map(|&(name, doc_id)| (name.to_owned(), DocAddress::new(0, doc_id)))
            .collect();
        assert_eq!(query(index, order, limit, offset)?, wanted);
        Ok(())
    }

    // One document per city, all committed together.
    let mut schema_builder = Schema::builder();
    let city = schema_builder.add_text_field("city", TEXT | FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut index_writer = index.writer_for_tests()?;
    for name in ["austin", "greenville", "tokyo"] {
        index_writer.add_document(doc!(city => name))?;
    }
    index_writer.commit()?;

    // Descending order.
    expect_hits(
        &index,
        Order::Desc,
        3,
        0,
        &[("tokyo", 2), ("greenville", 1), ("austin", 0)],
    )?;
    expect_hits(
        &index,
        Order::Desc,
        2,
        0,
        &[("tokyo", 2), ("greenville", 1)],
    )?;
    expect_hits(&index, Order::Desc, 3, 3, &[])?;
    expect_hits(
        &index,
        Order::Desc,
        2,
        1,
        &[("greenville", 1), ("austin", 0)],
    )?;

    // Ascending order.
    expect_hits(
        &index,
        Order::Asc,
        3,
        0,
        &[("austin", 0), ("greenville", 1), ("tokyo", 2)],
    )?;
    expect_hits(
        &index,
        Order::Asc,
        2,
        1,
        &[("greenville", 1), ("tokyo", 2)],
    )?;
    expect_hits(
        &index,
        Order::Asc,
        2,
        0,
        &[("austin", 0), ("greenville", 1)],
    )?;
    expect_hits(&index, Order::Asc, 3, 3, &[])?;

    Ok(())
}
1488+
12171489
#[test]
12181490
#[should_panic]
12191491
fn test_field_does_not_exist() {

0 commit comments

Comments
 (0)