Skip to content

Commit 8730989

Browse files
pravindra authored and praveenbingo committed
GDV-13: [C++] Add support for filters (apache#75)
- Similar to projection, a filter is built for a specific schema and condition (i.e. an expression).
- The output of a filter is a selection vector (Int16Array).
1 parent 495956a commit 8730989

20 files changed

+1142
-71
lines changed

cpp/src/gandiva/codegen/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,13 @@ set(SRC_FILES annotator.cc
3030
expr_validator.cc
3131
expression.cc
3232
expression_registry.cc
33+
filter.cc
3334
function_registry.cc
3435
function_signature.cc
3536
llvm_generator.cc
3637
llvm_types.cc
3738
projector.cc
39+
selection_vector.cc
3840
status.cc
3941
tree_expr_builder.cc
4042
${BC_FILE_PATH_CC})
@@ -93,4 +95,5 @@ add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc
9395
add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc)
9496
add_gandiva_unit_test(status_test.cc status.cc)
9597
add_gandiva_unit_test(expression_registry_test.cc llvm_types.cc expression_registry.cc function_signature.cc function_registry.cc)
98+
add_gandiva_unit_test(selection_vector_test.cc selection_vector.cc status.cc)
9699

cpp/src/gandiva/codegen/annotator_test.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,18 @@ TEST_F(TestAnnotator, TestAdd) {
8282

8383
auto arrow_sum = MakeInt32Array(num_records);
8484
EvalBatchPtr batch = annotator.PrepareEvalBatch(*record_batch, {arrow_sum->data()});
85-
EXPECT_EQ(batch->num_buffers(), 6);
85+
EXPECT_EQ(batch->GetNumBuffers(), 6);
8686

87-
auto buffers = batch->buffers();
87+
auto buffers = batch->GetBufferArray();
8888
EXPECT_EQ(buffers[desc_a->validity_idx()], arrow_v0->data()->buffers.at(0)->data());
8989
EXPECT_EQ(buffers[desc_a->data_idx()], arrow_v0->data()->buffers.at(1)->data());
9090
EXPECT_EQ(buffers[desc_b->validity_idx()], arrow_v1->data()->buffers.at(0)->data());
9191
EXPECT_EQ(buffers[desc_b->data_idx()], arrow_v1->data()->buffers.at(1)->data());
9292
EXPECT_EQ(buffers[desc_sum->validity_idx()], arrow_sum->data()->buffers.at(0)->data());
9393
EXPECT_EQ(buffers[desc_sum->data_idx()], arrow_sum->data()->buffers.at(1)->data());
94+
95+
auto bitmaps = batch->GetLocalBitMapArray();
96+
EXPECT_EQ(bitmaps, nullptr);
9497
}
9598

9699
} // namespace gandiva
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright (C) 2017-2018 Dremio Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#ifndef GANDIVA_CONDITION_H
17+
#define GANDIVA_CONDITION_H
18+
19+
#include "gandiva/arrow.h"
20+
#include "gandiva/expression.h"
21+
#include "gandiva/gandiva_aliases.h"
22+
23+
namespace gandiva {
24+
25+
/// \brief A condition expression.
26+
class Condition : public Expression {
27+
public:
28+
Condition(const NodePtr root)
29+
: Expression(root, std::make_shared<arrow::Field>("cond", arrow::boolean())) {}
30+
31+
virtual ~Condition() = default;
32+
};
33+
34+
} // namespace gandiva
35+
36+
#endif // GANDIVA_CONDITION_H

cpp/src/gandiva/codegen/eval_batch.h

Lines changed: 23 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <arrow/util/logging.h>
1919
#include "gandiva/arrow.h"
2020
#include "gandiva/gandiva_aliases.h"
21+
#include "local_bitmaps_holder.h"
2122

2223
namespace gandiva {
2324

@@ -26,90 +27,57 @@ namespace gandiva {
2627
class EvalBatch {
2728
public:
2829
explicit EvalBatch(int num_records, int num_buffers, int num_local_bitmaps)
29-
: num_records_(num_records),
30-
num_buffers_(num_buffers),
31-
num_local_bitmaps_(num_local_bitmaps) {
32-
buffers_ = new uint8_t *[num_buffers];
33-
AllocLocalBitMaps();
34-
}
35-
36-
~EvalBatch() {
37-
FreeLocalBitMaps();
38-
delete[] buffers_;
30+
: num_records_(num_records), num_buffers_(num_buffers) {
31+
if (num_buffers > 0) {
32+
buffers_array_.reset(new uint8_t *[num_buffers]);
33+
}
34+
local_bitmaps_holder_.reset(new LocalBitMapsHolder(num_records, num_local_bitmaps));
3935
}
4036

4137
int num_records() const { return num_records_; }
4238

43-
uint8_t **buffers() const { return buffers_; }
39+
uint8_t **GetBufferArray() const { return buffers_array_.get(); }
4440

45-
int num_buffers() const { return num_buffers_; }
41+
int GetNumBuffers() const { return num_buffers_; }
4642

4743
uint8_t *GetBuffer(int idx) const {
4844
DCHECK(idx <= num_buffers_);
49-
return buffers_[idx];
45+
return (buffers_array_.get())[idx];
5046
}
5147

5248
void SetBuffer(int idx, uint8_t *buffer) {
5349
DCHECK(idx <= num_buffers_);
54-
buffers_[idx] = buffer;
50+
(buffers_array_.get())[idx] = buffer;
5551
}
5652

57-
uint8_t **local_bitmaps() const { return local_bitmaps_; }
53+
int GetNumLocalBitMaps() const { return local_bitmaps_holder_->GetNumLocalBitMaps(); }
5854

59-
int num_local_bitmaps() const { return num_local_bitmaps_; }
55+
int GetLocalBitmapSize() const { return local_bitmaps_holder_->GetLocalBitMapSize(); }
6056

6157
uint8_t *GetLocalBitMap(int idx) const {
62-
DCHECK(idx <= num_local_bitmaps_);
63-
return local_bitmaps_[idx];
58+
DCHECK(idx <= GetNumLocalBitMaps());
59+
return local_bitmaps_holder_->GetLocalBitMap(idx);
6460
}
6561

66-
private:
67-
/// Alloc 'num_local_bitmaps_' number of bitmaps, each of capacity 'num_records_'.
68-
void AllocLocalBitMaps();
69-
70-
/// Free up local bitmaps, if any.
71-
void FreeLocalBitMaps();
62+
uint8_t **GetLocalBitMapArray() const {
63+
return local_bitmaps_holder_->GetLocalBitMapArray();
64+
}
7265

66+
private:
7367
/// number of records in the current batch.
7468
int num_records_;
7569

70+
// number of buffers.
71+
int num_buffers_;
72+
7673
/// An array of 'num_buffers_', each containing a buffer. The buffer
7774
/// sizes depends on the data type, but all of them have the same
7875
/// number of slots (equal to num_records_).
79-
uint8_t **buffers_;
80-
int num_buffers_;
76+
std::unique_ptr<uint8_t *> buffers_array_;
8177

82-
/// An array of 'local_bitmaps_', each sized to accomodate 'num_records'.
83-
uint8_t **local_bitmaps_;
84-
int num_local_bitmaps_;
78+
std::unique_ptr<LocalBitMapsHolder> local_bitmaps_holder_;
8579
};
8680

87-
inline void EvalBatch::AllocLocalBitMaps() {
88-
if (num_local_bitmaps_ == 0) {
89-
local_bitmaps_ = nullptr;
90-
return;
91-
}
92-
93-
// 64-bit aligned bitmaps.
94-
int bitmap_sz = arrow::BitUtil::RoundUpNumi64(num_records_) * 8;
95-
96-
local_bitmaps_ = new uint8_t *[num_local_bitmaps_];
97-
for (int i = 0; i < num_local_bitmaps_; ++i) {
98-
// TODO : round-up to a slab friendly multiple.
99-
local_bitmaps_[i] = new uint8_t[bitmap_sz];
100-
101-
// pre-fill with 1s (assuming that the probability of is_valid is higher).
102-
memset(local_bitmaps_[i], 0xff, bitmap_sz);
103-
}
104-
}
105-
106-
inline void EvalBatch::FreeLocalBitMaps() {
107-
for (int i = 0; i < num_local_bitmaps_; ++i) {
108-
delete[] local_bitmaps_[i];
109-
}
110-
delete[] local_bitmaps_;
111-
}
112-
11381
} // namespace gandiva
11482

11583
#endif // GANDIVA_EXPR_EVALBATCH_H

cpp/src/gandiva/codegen/expression.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ class Expression {
2626
public:
2727
Expression(const NodePtr root, const FieldPtr result) : root_(root), result_(result) {}
2828

29+
virtual ~Expression() = default;
30+
2931
const NodePtr &root() const { return root_; }
3032

3133
const FieldPtr &result() const { return result_; }

cpp/src/gandiva/codegen/filter.cc

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
// Copyright (C) 2017-2018 Dremio Corporation
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "gandiva/filter.h"
16+
17+
#include <memory>
18+
#include <utility>
19+
#include <vector>
20+
21+
#include "codegen/bitmap_accumulator.h"
22+
#include "codegen/expr_validator.h"
23+
#include "codegen/llvm_generator.h"
24+
#include "gandiva/condition.h"
25+
#include "gandiva/status.h"
26+
27+
namespace gandiva {
28+
29+
/// Construct a filter from an already-built LLVM generator.
///
/// All three arguments are taken by value and moved into the corresponding members,
/// so the shared pointers are not copied a second time.
Filter::Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
               std::shared_ptr<Configuration> configuration)
    : llvm_generator_(std::move(llvm_generator)),
      schema_(std::move(schema)),
      configuration_(std::move(configuration)) {}
34+
35+
/// Build a filter for 'schema' and 'condition' using 'configuration'.
///
/// Fails with Status::Invalid if any of schema, condition or configuration is null,
/// and propagates any failure from generator creation, expression validation or
/// code generation. On success '*filter' holds the new filter.
Status Filter::Make(SchemaPtr schema, ConditionPtr condition,
                    std::shared_ptr<Configuration> configuration,
                    std::shared_ptr<Filter> *filter) {
  // Argument checks.
  GANDIVA_RETURN_FAILURE_IF_FALSE(schema != nullptr,
                                  Status::Invalid("schema cannot be null"));
  GANDIVA_RETURN_FAILURE_IF_FALSE(condition != nullptr,
                                  Status::Invalid("condition cannot be null"));
  GANDIVA_RETURN_FAILURE_IF_FALSE(configuration != nullptr,
                                  Status::Invalid("configuration cannot be null"));

  // Step 1: create the LLVM generator for this configuration.
  std::unique_ptr<LLVMGenerator> generator;
  Status st = LLVMGenerator::Make(configuration, &generator);
  GANDIVA_RETURN_NOT_OK(st);

  // Step 2: validate the condition against the schema; an invalid expression cannot
  // be processed any further.
  ExprValidator validator(generator->types(), schema);
  st = validator.Validate(condition);
  GANDIVA_RETURN_NOT_OK(st);

  // Step 3: generate code for the condition.
  st = generator->Build({condition});
  GANDIVA_RETURN_NOT_OK(st);

  // Step 4: hand the fully built generator over to the new filter.
  *filter = std::make_shared<Filter>(std::move(generator), schema, configuration);
  return Status::OK();
}
62+
63+
/// Evaluate the condition on 'batch' and populate 'out_selection' from the result.
///
/// Returns Status::Invalid when the batch schema differs from the schema given to
/// Make(), when the batch is empty, when 'out_selection' is null, or when it has
/// fewer slots than the batch has rows. Otherwise returns the status of executing
/// the generated code, followed by the status of populating the selection vector.
Status Filter::Evaluate(const arrow::RecordBatch &batch,
                        std::shared_ptr<SelectionVector> out_selection) {
  if (!batch.schema()->Equals(*schema_)) {
    return Status::Invalid("Schema in RecordBatch must match the schema in Make()");
  }
  if (batch.num_rows() == 0) {
    return Status::Invalid("RecordBatch must be non-empty.");
  }
  if (out_selection == nullptr) {
    return Status::Invalid("out_selection must be non-null.");
  }
  if (out_selection->GetMaxSlots() < batch.num_rows()) {
    // Built with std::string/std::to_string (from <string>, which gandiva/filter.h
    // includes) so this file does not depend on <sstream> being pulled in
    // transitively.
    std::string msg = "out_selection has ";
    msg += std::to_string(out_selection->GetMaxSlots());
    msg += " slots, which is less than the batch size ";
    msg += std::to_string(batch.num_rows());
    return Status::Invalid(msg);
  }

  // Allocate three local bitmaps: index 0 receives the validity bits, index 1 the
  // boolean values, and index 2 the intersection of the two.
  LocalBitMapsHolder bitmaps(batch.num_rows(), 3 /*local_bitmaps*/);
  int bitmap_size = bitmaps.GetLocalBitMapSize();

  // Wrap the first two bitmaps as the validity/value buffers of a boolean array
  // that the generated code writes into.
  auto validity = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(0), bitmap_size);
  auto value = std::make_shared<arrow::Buffer>(bitmaps.GetLocalBitMap(1), bitmap_size);
  auto array_data =
      arrow::ArrayData::Make(arrow::boolean(), batch.num_rows(), {validity, value});

  // Execute the expression(s).
  auto status = llvm_generator_->Execute(batch, {array_data});
  GANDIVA_RETURN_NOT_OK(status);

  // Compute the intersection of the value and validity.
  auto result = bitmaps.GetLocalBitMap(2);
  BitMapAccumulator::IntersectBitMaps(
      result, {bitmaps.GetLocalBitMap(0), bitmaps.GetLocalBitMap(1)}, bitmap_size);

  // The last argument is num_rows - 1 (presumably the highest row index to consider;
  // confirm against SelectionVector::PopulateFromBitMap).
  return out_selection->PopulateFromBitMap(result, bitmap_size, batch.num_rows() - 1);
}
102+
103+
} // namespace gandiva

cpp/src/gandiva/codegen/filter.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (C) 2017-2018 Dremio Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#ifndef GANDIVA_EXPR_FILTER_H
17+
#define GANDIVA_EXPR_FILTER_H
18+
19+
#include <memory>
20+
#include <string>
21+
#include <utility>
22+
#include <vector>
23+
24+
#include "gandiva/arrow.h"
25+
#include "gandiva/condition.h"
26+
#include "gandiva/configuration.h"
27+
#include "gandiva/selection_vector.h"
28+
#include "gandiva/status.h"
29+
30+
namespace gandiva {
31+
32+
class LLVMGenerator;
33+
34+
/// \brief filter records based on a condition.
35+
///
36+
/// A filter is built for a specific schema and condition. Once the filter is built, it
37+
/// can be used to evaluate many row batches.
38+
class Filter {
39+
public:
40+
Filter(std::unique_ptr<LLVMGenerator> llvm_generator, SchemaPtr schema,
41+
std::shared_ptr<Configuration> config);
42+
43+
~Filter() = default;
44+
45+
/// Build a filter for the given schema and condition, with the default configuration.
46+
///
47+
/// \param[in] : schema schema for the record batches, and the condition.
48+
/// \param[in] : condition filter condition.
49+
/// \param[out]: filter the returned filter object
50+
static Status Make(SchemaPtr schema, ConditionPtr condition,
51+
std::shared_ptr<Filter> *filter) {
52+
return Make(schema, condition, ConfigurationBuilder::DefaultConfiguration(), filter);
53+
}
54+
55+
/// \brief Build a filter for the given schema and condition.
56+
/// Customize the filter with runtime configuration.
57+
///
58+
/// \param[in] : schema schema for the record batches, and the condition.
59+
/// \param[in] : condition filter conditions.
60+
/// \param[in] : config run time configuration.
61+
/// \param[out]: filter the returned filter object
62+
static Status Make(SchemaPtr schema, ConditionPtr condition,
63+
std::shared_ptr<Configuration> config,
64+
std::shared_ptr<Filter> *filter);
65+
66+
/// Evaluate the specified record batch, and populate output selection vector.
67+
///
68+
/// \param[in] : batch the record batch. schema should be the same as the one in 'Make'
69+
/// \param[in/out]: out_selection the selection array with indices of rows that match
70+
/// the condition.
71+
Status Evaluate(const arrow::RecordBatch &batch,
72+
std::shared_ptr<SelectionVector> out_selection);
73+
74+
private:
75+
const std::unique_ptr<LLVMGenerator> llvm_generator_;
76+
const SchemaPtr schema_;
77+
const std::shared_ptr<Configuration> configuration_;
78+
};
79+
80+
} // namespace gandiva
81+
82+
#endif // GANDIVA_EXPR_FILTER_H

0 commit comments

Comments
 (0)