Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 38 additions & 8 deletions datafusion/common/src/hash_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,22 +214,19 @@ fn hash_struct_array(
hashes_buffer: &mut [u64],
) -> Result<()> {
let nulls = array.nulls();
let num_columns = array.num_columns();
let row_len = array.len();

// Skip null columns
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forgot to correct this comment last time. The indices refer to rows, not columns.

let valid_indices: Vec<usize> = if let Some(nulls) = nulls {
let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
nulls.valid_indices().collect()
} else {
(0..num_columns).collect()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Main fix

(0..row_len).collect()
};

// Create a hash for each row that combines the hashes of all the columns at that row.
// array.len() is the number of rows.
let mut values_hashes = vec![0u64; array.len()];
let mut values_hashes = vec![0u64; row_len];
create_hashes(array.columns(), random_state, &mut values_hashes)?;

// Skip the null columns, nulls should get hash value 0.
for i in valid_indices {
for i in valid_row_indices {
let hash = &mut hashes_buffer[i];
*hash = combine_hashes(*hash, values_hashes[i]);
}
Expand Down Expand Up @@ -601,6 +598,39 @@ mod tests {
assert_eq!(hashes[4], hashes[5]);
}

/// Hashes a struct array that has more columns (four) than rows (two) and
/// checks that the two rows, which hold identical values in every column,
/// end up with the same hash.
#[test]
// Compiled out when hash collisions are forced, like the other tests that
// look at real hash values.
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_struct_arrays_more_column_than_row() {
    // Every Int32 column carries the same two values, so build them from one place.
    let int_column = || Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef;
    let bool_column = Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef;

    // One boolean column followed by three Int32 columns, all non-nullable.
    let columns = vec![
        (
            Arc::new(Field::new("bool", DataType::Boolean, false)),
            bool_column,
        ),
        (
            Arc::new(Field::new("i32-1", DataType::Int32, false)),
            int_column(),
        ),
        (
            Arc::new(Field::new("i32-2", DataType::Int32, false)),
            int_column(),
        ),
        (
            Arc::new(Field::new("i32-3", DataType::Int32, false)),
            int_column(),
        ),
    ];
    let struct_array = StructArray::from(columns);

    // Both rows must be valid (non-null) so that each one is actually hashed.
    for row in 0..2 {
        assert!(struct_array.is_valid(row));
    }

    let seeded_state = RandomState::with_seeds(0, 0, 0, 0);
    let input = Arc::new(struct_array) as ArrayRef;
    let mut row_hashes = vec![0; input.len()];
    create_hashes(&[input], &seeded_state, &mut row_hashes).unwrap();

    // Identical rows must hash identically.
    assert_eq!(row_hashes[0], row_hashes[1]);
}

#[test]
// Tests actual values of hashes, which are different if forcing collisions
#[cfg(not(feature = "force_hash_collisions"))]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/dictionary.slt
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ select count(*) from m1 where tag_id = '1000' and time < '2024-01-03T14:46:35+01
----
10

query RRR
query RRR rowsort
Copy link
Contributor Author

@jayzhan211 jayzhan211 Jan 6, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

rowsort is added because I got the rows in the order shown below, and GROUP BY output is not order-sensitive:

700 1000 850
100 600 350

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah -- also added the same fix in #8769 and I agree this looks good

select min(f5), max(f5), avg(f5) from m2 where tag_id = '1000' and time < '2024-01-03T14:46:35+01:00' group by type;
----
100 600 350
Expand Down