Skip to content

Commit d5d2d41

Browse files
authored
merge column: small refactors (#2579)
* merge column: small refactors
* make ord dependency more explicit
* add columnar merge crashtest proptest
* fix naming
1 parent 80f5f1e commit d5d2d41

File tree

10 files changed

+218
-118
lines changed

10 files changed

+218
-118
lines changed

columnar/src/column_index/merge/stacked.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ fn get_doc_ids_with_values<'a>(
5656
ColumnIndex::Full => Box::new(doc_range),
5757
ColumnIndex::Optional(optional_index) => Box::new(
5858
optional_index
59-
.iter_rows()
59+
.iter_docs()
6060
.map(move |row| row + doc_range.start),
6161
),
6262
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
@@ -73,7 +73,7 @@ fn get_doc_ids_with_values<'a>(
7373
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
7474
multivalued_index
7575
.optional_index
76-
.iter_rows()
76+
.iter_docs()
7777
.map(move |row| row + doc_range.start),
7878
),
7979
},
@@ -177,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
177177
ColumnIndex::Full => Box::new(columnar_row_range),
178178
ColumnIndex::Optional(optional_index) => Box::new(
179179
optional_index
180-
.iter_rows()
180+
.iter_docs()
181181
.map(move |row_id: RowId| columnar_row_range.start + row_id),
182182
),
183183
ColumnIndex::Multivalued(_) => {

columnar/src/column_index/optional_index/mod.rs

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -80,23 +80,23 @@ impl BlockVariant {
8080
/// index is the block index. For each block `byte_start` and `offset` is computed.
8181
#[derive(Clone)]
8282
pub struct OptionalIndex {
83-
num_rows: RowId,
84-
num_non_null_rows: RowId,
83+
num_docs: RowId,
84+
num_non_null_docs: RowId,
8585
block_data: OwnedBytes,
8686
block_metas: Arc<[BlockMeta]>,
8787
}
8888

8989
impl Iterable<u32> for &OptionalIndex {
9090
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
91-
Box::new(self.iter_rows())
91+
Box::new(self.iter_docs())
9292
}
9393
}
9494

9595
impl std::fmt::Debug for OptionalIndex {
9696
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
9797
f.debug_struct("OptionalIndex")
98-
.field("num_rows", &self.num_rows)
99-
.field("num_non_null_rows", &self.num_non_null_rows)
98+
.field("num_docs", &self.num_docs)
99+
.field("num_non_null_docs", &self.num_non_null_docs)
100100
.finish_non_exhaustive()
101101
}
102102
}
@@ -271,17 +271,17 @@ impl OptionalIndex {
271271
}
272272

273273
pub fn num_docs(&self) -> RowId {
274-
self.num_rows
274+
self.num_docs
275275
}
276276

277277
pub fn num_non_nulls(&self) -> RowId {
278-
self.num_non_null_rows
278+
self.num_non_null_docs
279279
}
280280

281-
pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ {
281+
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
282282
// TODO optimize
283283
let mut select_batch = self.select_cursor();
284-
(0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
284+
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
285285
}
286286
pub fn select_batch(&self, ranks: &mut [RowId]) {
287287
let mut select_cursor = self.select_cursor();
@@ -519,15 +519,15 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
519519
let (mut bytes, num_non_empty_blocks_bytes) = bytes.rsplit(2);
520520
let num_non_empty_block_bytes =
521521
u16::from_le_bytes(num_non_empty_blocks_bytes.as_slice().try_into().unwrap());
522-
let num_rows = VInt::deserialize_u64(&mut bytes)? as u32;
522+
let num_docs = VInt::deserialize_u64(&mut bytes)? as u32;
523523
let block_metas_num_bytes =
524524
num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
525525
let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
526-
let (block_metas, num_non_null_rows) =
527-
deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
526+
let (block_metas, num_non_null_docs) =
527+
deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_docs);
528528
let optional_index = OptionalIndex {
529-
num_rows,
530-
num_non_null_rows,
529+
num_docs,
530+
num_non_null_docs,
531531
block_data,
532532
block_metas: block_metas.into(),
533533
};

columnar/src/column_index/optional_index/tests.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ fn test_optional_index_large() {
164164
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
165165
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
166166
assert_eq!(optional_index.num_docs(), num_rows);
167-
assert!(optional_index.iter_rows().eq(row_ids.iter().copied()));
167+
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
168168
}
169169

170170
#[test]

columnar/src/columnar/merge/merge_dict_column.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use std::io::{self, Write};
33
use common::{BitSet, CountingWriter, ReadOnlyBitSet};
44
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
55

6-
use super::term_merger::TermMerger;
6+
use super::term_merger::{TermMerger, TermsWithSegmentOrd};
77
use crate::column::serialize_column_mappable_to_u64;
88
use crate::column_index::SerializableColumnIndex;
99
use crate::iterable::Iterable;
@@ -126,14 +126,17 @@ fn serialize_merged_dict(
126126
let mut term_ord_mapping = TermOrdinalMapping::default();
127127

128128
let mut field_term_streams = Vec::new();
129-
for column_opt in bytes_columns.iter() {
129+
for (segment_ord, column_opt) in bytes_columns.iter().enumerate() {
130130
if let Some(column) = column_opt {
131131
term_ord_mapping.add_segment(column.dictionary.num_terms());
132132
let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
133-
field_term_streams.push(terms);
133+
field_term_streams.push(TermsWithSegmentOrd { terms, segment_ord });
134134
} else {
135135
term_ord_mapping.add_segment(0);
136-
field_term_streams.push(Streamer::empty());
136+
field_term_streams.push(TermsWithSegmentOrd {
137+
terms: Streamer::empty(),
138+
segment_ord,
139+
});
137140
}
138141
}
139142

@@ -191,6 +194,7 @@ fn serialize_merged_dict(
191194

192195
#[derive(Default, Debug)]
193196
struct TermOrdinalMapping {
197+
/// Contains the new term ordinals for each segment.
194198
per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>,
195199
}
196200

@@ -205,6 +209,6 @@ impl TermOrdinalMapping {
205209
}
206210

207211
fn get_segment(&self, segment_ord: u32) -> &[TermOrdinal] {
208-
&(self.per_segment_new_term_ordinals[segment_ord as usize])[..]
212+
&self.per_segment_new_term_ordinals[segment_ord as usize]
209213
}
210214
}

columnar/src/columnar/merge/merge_mapping.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ impl StackMergeOrder {
2626
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
2727
let mut cumulated_row_id = 0;
2828
for columnar in columnars {
29-
cumulated_row_id += columnar.num_rows();
29+
cumulated_row_id += columnar.num_docs();
3030
cumulated_row_ids.push(cumulated_row_id);
3131
}
3232
StackMergeOrder { cumulated_row_ids }

columnar/src/columnar/merge/mod.rs

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -80,29 +80,31 @@ pub fn merge_columnar(
8080
output: &mut impl io::Write,
8181
) -> io::Result<()> {
8282
let mut serializer = ColumnarSerializer::new(output);
83-
let num_rows_per_columnar = columnar_readers
83+
let num_docs_per_columnar = columnar_readers
8484
.iter()
85-
.map(|reader| reader.num_rows())
85+
.map(|reader| reader.num_docs())
8686
.collect::<Vec<u32>>();
8787

88-
let columns_to_merge =
89-
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
88+
let columns_to_merge = group_columns_for_merge(columnar_readers, required_columns)?;
9089
for res in columns_to_merge {
9190
let ((column_name, _column_type_category), grouped_columns) = res;
9291
let grouped_columns = grouped_columns.open(&merge_row_order)?;
9392
if grouped_columns.is_empty() {
9493
continue;
9594
}
9695

97-
let column_type = grouped_columns.column_type_after_merge();
96+
let column_type_after_merge = grouped_columns.column_type_after_merge();
9897
let mut columns = grouped_columns.columns;
99-
coerce_columns(column_type, &mut columns)?;
98+
// Make sure the number of columns is the same as the number of columnar readers.
99+
// Or num_docs_per_columnar would be incorrect.
100+
assert_eq!(columns.len(), columnar_readers.len());
101+
coerce_columns(column_type_after_merge, &mut columns)?;
100102

101103
let mut column_serializer =
102-
serializer.start_serialize_column(column_name.as_bytes(), column_type);
104+
serializer.start_serialize_column(column_name.as_bytes(), column_type_after_merge);
103105
merge_column(
104-
column_type,
105-
&num_rows_per_columnar,
106+
column_type_after_merge,
107+
&num_docs_per_columnar,
106108
columns,
107109
&merge_row_order,
108110
&mut column_serializer,
@@ -128,7 +130,7 @@ fn dynamic_column_to_u64_monotonic(dynamic_column: DynamicColumn) -> Option<Colu
128130
fn merge_column(
129131
column_type: ColumnType,
130132
num_docs_per_column: &[u32],
131-
columns: Vec<Option<DynamicColumn>>,
133+
columns_to_merge: Vec<Option<DynamicColumn>>,
132134
merge_row_order: &MergeRowOrder,
133135
wrt: &mut impl io::Write,
134136
) -> io::Result<()> {
@@ -138,10 +140,10 @@ fn merge_column(
138140
| ColumnType::F64
139141
| ColumnType::DateTime
140142
| ColumnType::Bool => {
141-
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
143+
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
142144
let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
143-
Vec::with_capacity(columns.len());
144-
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
145+
Vec::with_capacity(columns_to_merge.len());
146+
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
145147
if let Some(Column { index: idx, values }) =
146148
dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
147149
{
@@ -164,10 +166,10 @@ fn merge_column(
164166
serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?;
165167
}
166168
ColumnType::IpAddr => {
167-
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
169+
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
168170
let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> =
169-
Vec::with_capacity(columns.len());
170-
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
171+
Vec::with_capacity(columns_to_merge.len());
172+
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
171173
if let Some(DynamicColumn::IpAddr(Column { index: idx, values })) =
172174
dynamic_column_opt
173175
{
@@ -192,9 +194,10 @@ fn merge_column(
192194
serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?;
193195
}
194196
ColumnType::Bytes | ColumnType::Str => {
195-
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
196-
let mut bytes_columns: Vec<Option<BytesColumn>> = Vec::with_capacity(columns.len());
197-
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
197+
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
198+
let mut bytes_columns: Vec<Option<BytesColumn>> =
199+
Vec::with_capacity(columns_to_merge.len());
200+
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
198201
match dynamic_column_opt {
199202
Some(DynamicColumn::Str(str_column)) => {
200203
column_indexes.push(str_column.term_ord_column.index.clone());
@@ -248,7 +251,7 @@ impl GroupedColumns {
248251
if column_type.len() == 1 {
249252
return column_type.into_iter().next().unwrap();
250253
}
251-
// At the moment, only the numerical categorical column type has more than one possible
254+
// At the moment, only the numerical column type category has more than one possible
252255
// column type.
253256
assert!(self
254257
.columns
@@ -361,7 +364,7 @@ fn is_empty_after_merge(
361364
ColumnIndex::Empty { .. } => true,
362365
ColumnIndex::Full => alive_bitset.len() == 0,
363366
ColumnIndex::Optional(optional_index) => {
364-
for doc in optional_index.iter_rows() {
367+
for doc in optional_index.iter_docs() {
365368
if alive_bitset.contains(doc) {
366369
return false;
367370
}
@@ -391,7 +394,6 @@ fn is_empty_after_merge(
391394
fn group_columns_for_merge<'a>(
392395
columnar_readers: &'a [&'a ColumnarReader],
393396
required_columns: &'a [(String, ColumnType)],
394-
_merge_row_order: &'a MergeRowOrder,
395397
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
396398
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
397399

columnar/src/columnar/merge/term_merger.rs

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,28 +5,29 @@ use sstable::TermOrdinal;
55

66
use crate::Streamer;
77

8-
pub struct HeapItem<'a> {
9-
pub streamer: Streamer<'a>,
8+
/// The terms of a column with the ordinal of the segment.
9+
pub struct TermsWithSegmentOrd<'a> {
10+
pub terms: Streamer<'a>,
1011
pub segment_ord: usize,
1112
}
1213

13-
impl PartialEq for HeapItem<'_> {
14+
impl PartialEq for TermsWithSegmentOrd<'_> {
1415
fn eq(&self, other: &Self) -> bool {
1516
self.segment_ord == other.segment_ord
1617
}
1718
}
1819

19-
impl Eq for HeapItem<'_> {}
20+
impl Eq for TermsWithSegmentOrd<'_> {}
2021

21-
impl<'a> PartialOrd for HeapItem<'a> {
22-
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
22+
impl<'a> PartialOrd for TermsWithSegmentOrd<'a> {
23+
fn partial_cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Option<Ordering> {
2324
Some(self.cmp(other))
2425
}
2526
}
2627

27-
impl<'a> Ord for HeapItem<'a> {
28-
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
29-
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
28+
impl<'a> Ord for TermsWithSegmentOrd<'a> {
29+
fn cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Ordering {
30+
(&other.terms.key(), &other.segment_ord).cmp(&(&self.terms.key(), &self.segment_ord))
3031
}
3132
}
3233

@@ -37,39 +38,32 @@ impl<'a> Ord for HeapItem<'a> {
3738
/// - the term
3839
/// - a slice with the ordinal of the segments containing the terms.
3940
pub struct TermMerger<'a> {
40-
heap: BinaryHeap<HeapItem<'a>>,
41-
current_streamers: Vec<HeapItem<'a>>,
41+
heap: BinaryHeap<TermsWithSegmentOrd<'a>>,
42+
term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>,
4243
}
4344

4445
impl<'a> TermMerger<'a> {
4546
/// Stream of merged term dictionary
46-
pub fn new(streams: Vec<Streamer<'a>>) -> TermMerger<'a> {
47+
pub fn new(term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>) -> TermMerger<'a> {
4748
TermMerger {
4849
heap: BinaryHeap::new(),
49-
current_streamers: streams
50-
.into_iter()
51-
.enumerate()
52-
.map(|(ord, streamer)| HeapItem {
53-
streamer,
54-
segment_ord: ord,
55-
})
56-
.collect(),
50+
term_streams_with_segment,
5751
}
5852
}
5953

6054
pub(crate) fn matching_segments<'b: 'a>(
6155
&'b self,
6256
) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
63-
self.current_streamers
57+
self.term_streams_with_segment
6458
.iter()
65-
.map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
59+
.map(|heap_item| (heap_item.segment_ord, heap_item.terms.term_ord()))
6660
}
6761

6862
fn advance_segments(&mut self) {
69-
let streamers = &mut self.current_streamers;
63+
let streamers = &mut self.term_streams_with_segment;
7064
let heap = &mut self.heap;
7165
for mut heap_item in streamers.drain(..) {
72-
if heap_item.streamer.advance() {
66+
if heap_item.terms.advance() {
7367
heap.push(heap_item);
7468
}
7569
}
@@ -81,13 +75,13 @@ impl<'a> TermMerger<'a> {
8175
pub fn advance(&mut self) -> bool {
8276
self.advance_segments();
8377
if let Some(head) = self.heap.pop() {
84-
self.current_streamers.push(head);
78+
self.term_streams_with_segment.push(head);
8579
while let Some(next_streamer) = self.heap.peek() {
86-
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
80+
if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
8781
break;
8882
}
8983
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
90-
self.current_streamers.push(next_heap_it);
84+
self.term_streams_with_segment.push(next_heap_it);
9185
}
9286
true
9387
} else {
@@ -101,6 +95,6 @@ impl<'a> TermMerger<'a> {
10195
/// if and only if advance() has been called before
10296
/// and "true" was returned.
10397
pub fn key(&self) -> &[u8] {
104-
self.current_streamers[0].streamer.key()
98+
self.term_streams_with_segment[0].terms.key()
10599
}
106100
}

0 commit comments

Comments (0)