-
Notifications
You must be signed in to change notification settings - Fork 529
feat: support FixedSizeList<Struct> #5593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
21164cc
5f7daf0
d4bea20
507efd9
7736131
3d0064b
9eb5a9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,7 @@ use arrow_array::{ | |
| make_array, | ||
| types::{ArrowDictionaryKeyType, BinaryType, ByteArrayType, Utf8Type}, | ||
| Array, BinaryArray, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, LargeListArray, | ||
| LargeStringArray, ListArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch, | ||
| LargeStringArray, ListArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray, RecordBatch, | ||
| RecordBatchOptions, RecordBatchReader, StringArray, StructArray, | ||
| }; | ||
| use arrow_schema::{ArrowError, DataType, Field, Fields, IntervalUnit, Schema, SchemaRef}; | ||
|
|
@@ -1712,6 +1712,85 @@ impl ArrayGenerator for RandomListGenerator { | |
| } | ||
| } | ||
|
|
||
| /// Generates random map arrays where each map has 0-4 entries. | ||
| #[derive(Debug)] | ||
| struct RandomMapGenerator { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you document this (or the |
||
| field: Arc<Field>, | ||
| entries_field: Arc<Field>, | ||
| keys_gen: Box<dyn ArrayGenerator>, | ||
| values_gen: Box<dyn ArrayGenerator>, | ||
| lengths_gen: Box<dyn ArrayGenerator>, | ||
| } | ||
|
|
||
| impl RandomMapGenerator { | ||
| fn new(keys_gen: Box<dyn ArrayGenerator>, values_gen: Box<dyn ArrayGenerator>) -> Self { | ||
| let entries_fields = Fields::from(vec![ | ||
| Field::new("keys", keys_gen.data_type().clone(), false), | ||
| Field::new("values", values_gen.data_type().clone(), true), | ||
| ]); | ||
| let entries_field = Arc::new(Field::new( | ||
| "entries", | ||
| DataType::Struct(entries_fields), | ||
| false, | ||
| )); | ||
| let map_type = DataType::Map(entries_field.clone(), false); | ||
| let field = Arc::new(Field::new("", map_type, true)); | ||
| let lengths_dist = Uniform::new_inclusive(0_i32, 4).unwrap(); | ||
| let lengths_gen = rand_with_distribution::<Int32Type, Uniform<i32>>(lengths_dist); | ||
|
|
||
| Self { | ||
| field, | ||
| entries_field, | ||
| keys_gen, | ||
| values_gen, | ||
| lengths_gen, | ||
| } | ||
| } | ||
| } | ||
|
|
||
| impl ArrayGenerator for RandomMapGenerator { | ||
| fn generate( | ||
| &mut self, | ||
| length: RowCount, | ||
| rng: &mut rand_xoshiro::Xoshiro256PlusPlus, | ||
| ) -> Result<Arc<dyn Array>, ArrowError> { | ||
| let lengths = self.lengths_gen.generate(length, rng)?; | ||
| let lengths = lengths.as_primitive::<Int32Type>(); | ||
| let total_entries = lengths.values().iter().sum::<i32>() as u64; | ||
| let offsets = OffsetBuffer::from_lengths(lengths.values().iter().map(|v| *v as usize)); | ||
|
|
||
| let keys = self.keys_gen.generate(RowCount::from(total_entries), rng)?; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not really relevant, but I wonder if keys need to be unique within a map? I guess not. |
||
| let values = self | ||
| .values_gen | ||
| .generate(RowCount::from(total_entries), rng)?; | ||
|
|
||
| let entries = StructArray::new( | ||
| Fields::from(vec![ | ||
| Field::new("keys", keys.data_type().clone(), false), | ||
| Field::new("values", values.data_type().clone(), true), | ||
| ]), | ||
| vec![keys, values], | ||
| None, | ||
| ); | ||
|
|
||
| Ok(Arc::new(MapArray::try_new( | ||
| self.entries_field.clone(), | ||
| offsets, | ||
| entries, | ||
| None, | ||
| false, | ||
| )?)) | ||
| } | ||
|
|
||
| fn data_type(&self) -> &DataType { | ||
| self.field.data_type() | ||
| } | ||
|
|
||
| fn element_size_bytes(&self) -> Option<ByteCount> { | ||
| None | ||
| } | ||
| } | ||
|
|
||
| #[derive(Debug)] | ||
| struct NullArrayGenerator {} | ||
|
|
||
|
|
@@ -2754,6 +2833,13 @@ pub mod array { | |
| Box::new(RandomListGenerator::new(item_gen, is_large)) | ||
| } | ||
|
|
||
| /// Generates random map arrays where each map has 0-4 entries. | ||
| pub fn rand_map(key_type: &DataType, value_type: &DataType) -> Box<dyn ArrayGenerator> { | ||
| let keys_gen = rand_type(key_type); | ||
| let values_gen = rand_type(value_type); | ||
| Box::new(RandomMapGenerator::new(keys_gen, values_gen)) | ||
| } | ||
|
|
||
| pub fn rand_struct(fields: Fields) -> Box<dyn ArrayGenerator> { | ||
| let child_gens = fields | ||
| .iter() | ||
|
|
@@ -2797,6 +2883,14 @@ pub mod array { | |
| DataType::FixedSizeBinary(size) => rand_fsb(*size), | ||
| DataType::List(child) => rand_list(child.data_type(), false), | ||
| DataType::LargeList(child) => rand_list(child.data_type(), true), | ||
| DataType::Map(entries_field, _) => { | ||
| let DataType::Struct(fields) = entries_field.data_type() else { | ||
| panic!("Map entries field must be a struct"); | ||
| }; | ||
| let key_type = fields[0].data_type(); | ||
| let value_type = fields[1].data_type(); | ||
| rand_map(key_type, value_type) | ||
| } | ||
| DataType::Duration(unit) => match unit { | ||
| TimeUnit::Second => rand::<DurationSecondType>(), | ||
| TimeUnit::Millisecond => rand::<DurationMillisecondType>(), | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean we can panic if we read a dataset that has a corrupt schema?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the lance schema. I'm not entirely sure users are able to create their own lance schema, so I think the only way this could happen is if there was some kind of corrupt protobuf. Also, there is significant panic potential further down at
`lt => DataType::try_from(lt).unwrap()`. I suppose it is technically a valid concern, but this method has many, many call sites, and changing it to return a `Result` should probably be a PR of its own, or else this one is going to get really confusing.