Skip to content
70 changes: 50 additions & 20 deletions cpp/src/arrow/array-dict-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries);
TYPED_TEST(TestDictionaryBuilder, Basic) {
using c_type = typename TypeParam::c_type;

DictionaryBuilder<TypeParam> builder(default_memory_pool());
DictionaryBuilder<TypeParam> builder;
ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
ASSERT_OK(builder.Append(static_cast<c_type>(1)));
Expand Down Expand Up @@ -81,7 +81,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayInit) {
auto dict_array = ArrayFromJSON(value_type, "[1, 2]");
auto dict_type = dictionary(int8(), value_type);

DictionaryBuilder<TypeParam> builder(dict_array, default_memory_pool());
DictionaryBuilder<TypeParam> builder(dict_array);
ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
ASSERT_OK(builder.Append(static_cast<c_type>(1)));
Expand Down Expand Up @@ -134,7 +134,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
auto type = std::make_shared<TypeParam>();

auto intermediate_result = ArrayFromJSON(type, "[1, 2, 1]");
DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool());
DictionaryBuilder<TypeParam> dictionary_builder;
ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
std::shared_ptr<Array> result;
ASSERT_OK(dictionary_builder.Finish(&result));
Expand All @@ -154,7 +154,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
// Skip this test for (u)int8
if (sizeof(Scalar) > 1) {
// Build the dictionary Array
DictionaryBuilder<TypeParam> builder(default_memory_pool());
DictionaryBuilder<TypeParam> builder;
// Build expected data
NumericBuilder<TypeParam> dict_builder;
Int16Builder int_builder;
Expand Down Expand Up @@ -192,7 +192,7 @@ TYPED_TEST(TestDictionaryBuilder, DeltaDictionary) {
using c_type = typename TypeParam::c_type;
auto type = std::make_shared<TypeParam>();

DictionaryBuilder<TypeParam> builder(default_memory_pool());
DictionaryBuilder<TypeParam> builder;

ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
Expand Down Expand Up @@ -232,7 +232,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
auto type = std::make_shared<TypeParam>();
auto dict_type = dictionary(int8(), type);

DictionaryBuilder<TypeParam> builder(default_memory_pool());
DictionaryBuilder<TypeParam> builder;

ASSERT_OK(builder.Append(static_cast<c_type>(1)));
ASSERT_OK(builder.Append(static_cast<c_type>(2)));
Expand Down Expand Up @@ -284,7 +284,7 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {

TEST(TestStringDictionaryBuilder, Basic) {
// Build the dictionary Array
StringDictionaryBuilder builder(default_memory_pool());
StringDictionaryBuilder builder;
ASSERT_OK(builder.Append("test"));
ASSERT_OK(builder.Append("test2"));
ASSERT_OK(builder.Append("test"));
Expand All @@ -301,12 +301,43 @@ TEST(TestStringDictionaryBuilder, Basic) {
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestStringDictionaryBuilder, AppendIndices) {
auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])");
auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])");

StringDictionaryBuilder builder;
ASSERT_OK(builder.InsertMemoValues(*ex_dict));

// Inserting again should have no effect
ASSERT_OK(builder.InsertMemoValues(*ex_dict));

// Type mismatch
ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict));

std::vector<int64_t> raw_indices = {0, 1, 2, -1, 3};
std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
for (int i = 0; i < 2; ++i) {
ASSERT_OK(builder.AppendIndices(
raw_indices.data(), static_cast<int64_t>(raw_indices.size()), is_valid.data()));
}

ASSERT_EQ(10, builder.length());

std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));

auto ex_indices = ArrayFromJSON(int8(), R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
auto dtype = dictionary(int8(), utf8());
DictionaryArray expected(dtype, ex_indices, ex_dict);
ASSERT_TRUE(expected.Equals(result));
}

TEST(TestStringDictionaryBuilder, ArrayInit) {
auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");

// Build the dictionary Array
StringDictionaryBuilder builder(dict_array, default_memory_pool());
StringDictionaryBuilder builder(dict_array);
ASSERT_OK(builder.Append("test"));
ASSERT_OK(builder.Append("test2"));
ASSERT_OK(builder.Append("test"));
Expand Down Expand Up @@ -345,7 +376,7 @@ TEST(TestStringDictionaryBuilder, MakeBuilder) {
// ARROW-4367
TEST(TestStringDictionaryBuilder, OnlyNull) {
// Build the dictionary Array
StringDictionaryBuilder builder(default_memory_pool());
StringDictionaryBuilder builder;
ASSERT_OK(builder.AppendNull());

std::shared_ptr<Array> result;
Expand All @@ -362,7 +393,7 @@ TEST(TestStringDictionaryBuilder, OnlyNull) {

TEST(TestStringDictionaryBuilder, DoubleTableSize) {
// Build the dictionary Array
StringDictionaryBuilder builder(default_memory_pool());
StringDictionaryBuilder builder;
// Build expected data
StringBuilder str_builder;
Int16Builder int_builder;
Expand Down Expand Up @@ -398,7 +429,7 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {

TEST(TestStringDictionaryBuilder, DeltaDictionary) {
// Build the dictionary Array
StringDictionaryBuilder builder(default_memory_pool());
StringDictionaryBuilder builder;
ASSERT_OK(builder.Append("test"));
ASSERT_OK(builder.Append("test2"));
ASSERT_OK(builder.Append("test"));
Expand Down Expand Up @@ -432,7 +463,7 @@ TEST(TestStringDictionaryBuilder, DeltaDictionary) {
TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {
constexpr int16_t kTestLength = 2048;
// Build the dictionary Array
StringDictionaryBuilder builder(default_memory_pool());
StringDictionaryBuilder builder;

StringBuilder str_builder1;
Int16Builder int_builder1;
Expand Down Expand Up @@ -518,8 +549,7 @@ TEST(TestStringDictionaryBuilder, BigDeltaDictionary) {

TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
// Build the dictionary Array
DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4));
std::vector<uint8_t> test{12, 12, 11, 12};
std::vector<uint8_t> test2{12, 12, 11, 11};
ASSERT_OK(builder.Append(test.data()));
Expand Down Expand Up @@ -555,7 +585,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) {
auto value_type = fixed_size_binary(4);
auto dict_array = ArrayFromJSON(value_type, R"(["abcd", "wxyz"])");
util::string_view test = "abcd", test2 = "wxyz";
DictionaryBuilder<FixedSizeBinaryType> builder(dict_array, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(dict_array);
ASSERT_OK(builder.Append(test));
ASSERT_OK(builder.Append(test2));
ASSERT_OK(builder.Append(test));
Expand Down Expand Up @@ -597,7 +627,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
auto value_type = arrow::fixed_size_binary(4);
auto dict_type = dictionary(int8(), value_type);

DictionaryBuilder<FixedSizeBinaryType> builder(value_type, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
std::vector<uint8_t> test{12, 12, 11, 12};
std::vector<uint8_t> test2{12, 12, 11, 11};
std::vector<uint8_t> test3{12, 12, 11, 10};
Expand Down Expand Up @@ -656,7 +686,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
auto value_type = arrow::fixed_size_binary(4);
auto dict_type = dictionary(int16(), value_type);

DictionaryBuilder<FixedSizeBinaryType> builder(value_type, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
// Build expected data
FixedSizeBinaryBuilder fsb_builder(value_type);
Int16Builder int_builder;
Expand Down Expand Up @@ -693,7 +723,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
// Build the dictionary Array
auto value_type = arrow::fixed_size_binary(4);
DictionaryBuilder<FixedSizeBinaryType> builder(value_type, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(value_type);
// Build an array with different byte width
FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
std::vector<uint8_t> value{100, 1, 1, 1, 1};
Expand All @@ -707,7 +737,7 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
TEST(TestDecimalDictionaryBuilder, Basic) {
// Build the dictionary Array
auto decimal_type = arrow::decimal(2, 0);
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type);

// Test data
std::vector<Decimal128> test{12, 12, 11, 12};
Expand All @@ -730,7 +760,7 @@ TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
const auto& decimal_type = arrow::decimal(21, 0);

// Build the dictionary Array
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type);

// Build expected data
FixedSizeBinaryBuilder fsb_builder(decimal_type);
Expand Down
67 changes: 40 additions & 27 deletions cpp/src/arrow/array/builder_dict.cc
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@ using internal::checked_cast;
// ----------------------------------------------------------------------
// DictionaryType unification

template <typename T, typename Out = void>
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pitrou I ran into brittleness with the way the template specialization was working in this file. I refactored things to more explicitly use std::enable_if

using enable_if_memoize = typename std::enable_if<
!std::is_same<typename internal::DictionaryTraits<T>::MemoTableType, void>::value,
Out>::type;

template <typename T, typename Out = void>
using enable_if_no_memoize = typename std::enable_if<
std::is_same<typename internal::DictionaryTraits<T>::MemoTableType, void>::value,
Out>::type;

struct UnifyDictionaryValues {
MemoryPool* pool_;
std::shared_ptr<DataType> value_type_;
Expand All @@ -48,21 +58,15 @@ struct UnifyDictionaryValues {
std::shared_ptr<Array>* out_values_;
std::vector<std::vector<int32_t>>* out_transpose_maps_;

Status Visit(const DataType&, void* = nullptr) {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
// Default implementation for non-dictionary-supported datatypes
return Status::NotImplemented("Unification of ", value_type_,
" dictionaries is not implemented");
}

Status Visit(const DayTimeIntervalType&, void* = nullptr) {
return Status::NotImplemented(
"Unification of DayTime"
" dictionaries is not implemented");
}

template <typename T>
Status Visit(const T&,
typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
enable_if_memoize<T, Status> Visit(const T&) {
using ArrayType = typename TypeTraits<T>::ArrayType;
using DictTraits = typename internal::DictionaryTraits<T>;
using MemoTableType = typename DictTraits::MemoTableType;
Expand Down Expand Up @@ -163,14 +167,14 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl {
std::shared_ptr<DataType> value_type_;
std::unique_ptr<MemoTable>* memo_table_;

Status Visit(const DataType&, void* = nullptr) {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
return Status::NotImplemented("Initialization of ", value_type_,
" memo table is not implemented");
}

template <typename T>
Status Visit(const T&,
typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
enable_if_memoize<T, Status> Visit(const T&) {
using MemoTable = typename internal::DictionaryTraits<T>::MemoTableType;
memo_table_->reset(new MemoTable(0));
return Status::OK();
Expand All @@ -179,23 +183,24 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl {

struct ArrayValuesInserter {
DictionaryMemoTableImpl* impl_;
const Array& values_;

template <typename T>
Status Visit(const T& array) {
return InsertValues(array.type(), array);
Status Visit(const T& type) {
using ArrayType = typename TypeTraits<T>::ArrayType;
return InsertValues(type, checked_cast<const ArrayType&>(values_));
}

private:
template <typename DataType, typename Array>
Status InsertValues(const DataType& type, const Array&, void* = nullptr) {
template <typename DType, typename ArrayType>
enable_if_no_memoize<DType, Status> InsertValues(const DType& type,
const ArrayType&) {
return Status::NotImplemented("Inserting array values of ", type,
" is not implemented");
}

template <typename DataType, typename Array>
Status InsertValues(
const DataType&, const Array& array,
typename internal::DictionaryTraits<DataType>::MemoTableType* = nullptr) {
template <typename DType, typename ArrayType>
enable_if_memoize<DType, Status> InsertValues(const DType&, const ArrayType& array) {
for (int64_t i = 0; i < array.length(); ++i) {
ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i)));
}
Expand All @@ -210,14 +215,14 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl {
int64_t start_offset_;
std::shared_ptr<ArrayData>* out_;

Status Visit(const DataType&, void* = nullptr) {
template <typename T>
enable_if_no_memoize<T, Status> Visit(const T&) {
return Status::NotImplemented("Getting array data of ", value_type_,
" is not implemented");
}

template <typename T>
Status Visit(const T&,
typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
enable_if_memoize<T, Status> Visit(const T&) {
using ConcreteMemoTable = typename internal::DictionaryTraits<T>::MemoTableType;
auto memo_table = static_cast<ConcreteMemoTable*>(memo_table_);
return internal::DictionaryTraits<T>::GetDictionaryArrayData(
Expand All @@ -232,9 +237,13 @@ class internal::DictionaryMemoTable::DictionaryMemoTableImpl {
ARROW_IGNORE_EXPR(VisitTypeInline(*type_, &visitor));
}

Status InsertValues(const std::shared_ptr<Array>& array) {
ArrayValuesInserter visitor{this};
return VisitArrayInline(*array, &visitor);
Status InsertValues(const Array& array) {
if (!array.type()->Equals(*type_)) {
return Status::Invalid("Array value type does not match memo type: ",
array.type()->ToString());
}
ArrayValuesInserter visitor{this, array};
return VisitTypeInline(*array.type(), &visitor);
}

template <typename T>
Expand Down Expand Up @@ -267,7 +276,7 @@ internal::DictionaryMemoTable::DictionaryMemoTable(const std::shared_ptr<DataTyp
internal::DictionaryMemoTable::DictionaryMemoTable(
const std::shared_ptr<Array>& dictionary)
: impl_(new DictionaryMemoTableImpl(dictionary->type())) {
ARROW_IGNORE_EXPR(impl_->InsertValues(dictionary));
ARROW_IGNORE_EXPR(impl_->InsertValues(*dictionary));
}

internal::DictionaryMemoTable::~DictionaryMemoTable() = default;
Expand Down Expand Up @@ -325,6 +334,10 @@ Status internal::DictionaryMemoTable::GetArrayData(MemoryPool* pool, int64_t sta
return impl_->GetArrayData(pool, start_offset, out);
}

Status internal::DictionaryMemoTable::InsertValues(const Array& array) {
return impl_->InsertValues(array);
}

int32_t internal::DictionaryMemoTable::size() const { return impl_->size(); }

} // namespace arrow
Loading