Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions cpp/src/arrow/parquet/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,6 @@ Status FieldToNode(const std::shared_ptr<Field>& field, NodePtr* out) {
case Type::DOUBLE:
type = ParquetType::DOUBLE;
break;
case Type::CHAR:
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
logical_type = LogicalType::UTF8;
length = static_cast<CharType*>(field->type.get())->size;
break;
case Type::STRING:
type = ParquetType::BYTE_ARRAY;
logical_type = LogicalType::UTF8;
Expand Down
17 changes: 16 additions & 1 deletion cpp/src/arrow/type.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,18 @@ std::string Field::ToString() const {

DataType::~DataType() {}

StringType::StringType() : DataType(Type::STRING) {}
// Returns true when `other` is non-null, is either this very object or has the
// same type id and child count, and every child compares equal to the child at
// the same position in `other`.
bool DataType::Equals(const DataType* other) const {
  if (other == nullptr) { return false; }

  // Distinct objects must agree on type id and child count before the
  // children are compared pairwise.
  if (this != other) {
    if (this->type != other->type) { return false; }
    if (this->num_children() != other->num_children()) { return false; }
  }

  // TODO(emkornfield) limit recursion
  const int child_count = num_children();
  for (int idx = 0; idx < child_count; ++idx) {
    if (!children_[idx]->Equals(other->children_[idx])) { return false; }
  }
  return true;
}

std::string StringType::ToString() const {
std::string result(name());
Expand All @@ -44,6 +55,10 @@ std::string ListType::ToString() const {
return s.str();
}

// The printed form of the binary type is exactly its static type name.
std::string BinaryType::ToString() const {
  std::string result(name());
  return result;
}

std::string StructType::ToString() const {
std::stringstream s;
s << "struct<";
Expand Down
55 changes: 38 additions & 17 deletions cpp/src/arrow/type.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
#include <string>
#include <vector>

#include "arrow/util/macros.h"

namespace arrow {

// Data types in this library are all *logical*. They can be expressed as
Expand Down Expand Up @@ -53,15 +55,9 @@ struct Type {
// 8-byte floating point value
DOUBLE = 11,

// CHAR(N): fixed-length UTF8 string with length N
CHAR = 12,

// UTF8 variable-length string as List<Char>
STRING = 13,

// VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1)
VARCHAR = 14,

// Variable-length bytes (no guarantee of UTF8-ness)
BINARY = 15,

Expand Down Expand Up @@ -114,12 +110,15 @@ struct DataType {

virtual ~DataType();

bool Equals(const DataType* other) {
// Call with a pointer so more friendly to subclasses
return other && ((this == other) || (this->type == other->type));
}
// Return whether the types are equal
//
// Types that are logically convertible from one to another (e.g. List<UInt8>
// and Binary) are NOT equal.
virtual bool Equals(const DataType* other) const;

bool Equals(const std::shared_ptr<DataType>& other) { return Equals(other.get()); }
bool Equals(const std::shared_ptr<DataType>& other) const {
return Equals(other.get());
}

const std::shared_ptr<Field>& child(int i) const { return children_[i]; }

Expand Down Expand Up @@ -236,9 +235,8 @@ struct DoubleType : public PrimitiveType<DoubleType> {

struct ListType : public DataType {
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type) : DataType(Type::LIST) {
children_ = {std::make_shared<Field>("item", value_type)};
}
explicit ListType(const std::shared_ptr<DataType>& value_type)
: ListType(value_type, Type::LIST) {}

explicit ListType(const std::shared_ptr<Field>& value_field) : DataType(Type::LIST) {
children_ = {value_field};
Expand All @@ -251,15 +249,38 @@ struct ListType : public DataType {
static char const* name() { return "list"; }

std::string ToString() const override;

protected:
// Constructor for classes that are implemented as List Arrays.
ListType(const std::shared_ptr<DataType>& value_type, Type::type logical_type)
: DataType(logical_type) {
// TODO ARROW-187 this can technically fail, make a constructor method ?
children_ = {std::make_shared<Field>("item", value_type)};
}
};

// String is a logical type consisting of a physical list of 1-byte values
struct StringType : public DataType {
StringType();
// BinaryType represents lists of 1-byte values: it is a ListType whose value
// type is UInt8, tagged with the BINARY type id.
struct BinaryType : public ListType {
  // Builds the plain binary type (type id Type::BINARY).
  BinaryType() : BinaryType(Type::BINARY) {}

  static char const* name() { return "binary"; }

  std::string ToString() const override;

 protected:
  // Allow subclasses to change the logical type while keeping the UInt8
  // value type.
  explicit BinaryType(Type::type logical_type)
      : ListType(std::make_shared<UInt8Type>(), logical_type) {}
};

// UTF encoded strings, layered on BinaryType and tagged with the STRING type id.
struct StringType : public BinaryType {
  static const char* name() { return "string"; }

  // Builds the string type (type id Type::STRING).
  StringType() : BinaryType(Type::STRING) {}

  std::string ToString() const override;

 protected:
  // Forwards a caller-chosen logical type id to BinaryType.
  explicit StringType(Type::type logical_type) : BinaryType(logical_type) {}
};

struct StructType : public DataType {
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/arrow/types/construct.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,10 +127,8 @@ Status MakeListArray(const TypePtr& type, int32_t length,
case Type::LIST:
out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap));
break;
case Type::CHAR:
case Type::DECIMAL_TEXT:
case Type::STRING:
case Type::VARCHAR:
out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap));
break;
default:
Expand Down
1 change: 0 additions & 1 deletion cpp/src/arrow/types/decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ struct DecimalType : public DataType {
: DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
int precision;
int scale;

static char const* name() { return "decimal"; }

std::string ToString() const override;
Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/types/list.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ class ListArray : public Array {
int32_t offset(int i) const { return offsets_[i]; }

// Neither of these functions performs bounds checking
int32_t value_offset(int i) { return offsets_[i]; }
int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; }
int32_t value_offset(int i) const { return offsets_[i]; }
int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; }

bool EqualsExact(const ListArray& other) const;
bool Equals(const std::shared_ptr<Array>& arr) const override;
Expand All @@ -92,9 +92,9 @@ class ListArray : public Array {
// a sequence of offsets and null values.
//
// A note on types. Per arrow/type.h all types in the c++ implementation are
// logical so even though this class always builds an Array of lists, this can
// logical so even though this class always builds list array, this can
// represent multiple different logical types. If no logical type is provided
// at construction time, the class defaults to List<T> where t is take from the
// at construction time, the class defaults to List<T> where t is taken from the
// value_builder/values that the object is constructed with.
class ListBuilder : public ArrayBuilder {
public:
Expand Down
188 changes: 161 additions & 27 deletions cpp/src/arrow/types/string-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,32 +34,14 @@ namespace arrow {

class Buffer;

TEST(TypesTest, TestCharType) {
CharType t1(5);

ASSERT_EQ(t1.type, Type::CHAR);
ASSERT_EQ(t1.size, 5);

ASSERT_EQ(t1.ToString(), std::string("char(5)"));

// Test copy constructor
CharType t2 = t1;
ASSERT_EQ(t2.type, Type::CHAR);
ASSERT_EQ(t2.size, 5);
}

TEST(TypesTest, TestVarcharType) {
VarcharType t1(5);

ASSERT_EQ(t1.type, Type::VARCHAR);
ASSERT_EQ(t1.size, 5);

ASSERT_EQ(t1.ToString(), std::string("varchar(5)"));

// Test copy constructor
VarcharType t2 = t1;
ASSERT_EQ(t2.type, Type::VARCHAR);
ASSERT_EQ(t2.size, 5);
// A BinaryType equals another BinaryType, differs from a StringType, and
// reports the BINARY type id and the "binary" string form.
TEST(TypesTest, BinaryType) {
  BinaryType binary;
  BinaryType other_binary;
  StringType utf8;
  EXPECT_TRUE(binary.Equals(&other_binary));
  EXPECT_FALSE(binary.Equals(&utf8));
  ASSERT_EQ(binary.type, Type::BINARY);
  ASSERT_EQ(binary.ToString(), std::string("binary"));
}

TEST(TypesTest, TestStringType) {
Expand Down Expand Up @@ -119,6 +101,7 @@ class TestStringContainer : public ::testing::Test {
// The fixture array reports the fixture length, exactly one null, and passes
// validation.
TEST_F(TestStringContainer, TestArrayBasics) {
  const auto actual_length = strings_->length();
  ASSERT_EQ(length_, actual_length);

  const auto actual_null_count = strings_->null_count();
  ASSERT_EQ(1, actual_null_count);

  ASSERT_OK(strings_->Validate());
}

TEST_F(TestStringContainer, TestType) {
Expand Down Expand Up @@ -163,7 +146,10 @@ class TestStringBuilder : public TestBuilder {
builder_.reset(new StringBuilder(pool_, type_));
}

void Done() { result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish()); }
void Done() {
result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish());
result_->Validate();
}

protected:
TypePtr type_;
Expand Down Expand Up @@ -216,4 +202,152 @@ TEST_F(TestStringBuilder, TestZeroLength) {
Done();
}

// Binary container type
// TODO(emkornfield) there should be some way to refactor these to avoid duplicating
// code with String
//
// Fixture that assembles a five-slot BinaryArray (one slot marked invalid) from
// raw offsets, value bytes and validity bytes.
class TestBinaryContainer : public ::testing::Test {
 public:
  void SetUp() {
    // Slot i covers chars_[offsets_[i]] up to, but excluding, chars_[offsets_[i + 1]].
    chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
    offsets_ = {0, 1, 1, 1, 3, 6};
    valid_bytes_ = {1, 1, 0, 1, 1};
    expected_ = {"a", "", "", "bb", "ccc"};

    MakeArray();
  }

  // Wraps the raw vectors in buffers and builds the array under test.
  void MakeArray() {
    const int nchars = chars_.size();
    length_ = offsets_.size() - 1;

    value_buf_ = test::to_buffer(chars_);
    offsets_buf_ = test::to_buffer(offsets_);
    values_ = ArrayPtr(new UInt8Array(nchars, value_buf_));

    null_count_ = test::null_count(valid_bytes_);
    null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_);

    strings_ = std::make_shared<BinaryArray>(
        length_, offsets_buf_, values_, null_count_, null_bitmap_);
  }

 protected:
  // Raw inputs for the array.
  std::vector<int32_t> offsets_;
  std::vector<char> chars_;
  std::vector<uint8_t> valid_bytes_;

  // Expected contents of each slot, in order.
  std::vector<std::string> expected_;

  std::shared_ptr<Buffer> value_buf_;
  std::shared_ptr<Buffer> offsets_buf_;
  std::shared_ptr<Buffer> null_bitmap_;

  int null_count_;
  int length_;

  ArrayPtr values_;
  std::shared_ptr<BinaryArray> strings_;
};

// The fixture array reports the fixture length, exactly one null, and passes
// validation.
TEST_F(TestBinaryContainer, TestArrayBasics) {
  const auto actual_length = strings_->length();
  ASSERT_EQ(length_, actual_length);

  const auto actual_null_count = strings_->null_count();
  ASSERT_EQ(1, actual_null_count);

  ASSERT_OK(strings_->Validate());
}

// Both the type object and the array's type_enum() report Type::BINARY.
TEST_F(TestBinaryContainer, TestType) {
  ASSERT_EQ(Type::BINARY, strings_->type()->type);
  ASSERT_EQ(Type::BINARY, strings_->type_enum());
}

// Walks the expected strings and checks that each slot's offset is the running
// total of the preceding lengths and that its length matches the expected
// string's size. Sizes are converted to int32_t once so every comparison is
// made between values of the same signed type.
TEST_F(TestBinaryContainer, TestListFunctions) {
  int32_t pos = 0;
  const int num_values = static_cast<int>(expected_.size());
  for (int i = 0; i < num_values; ++i) {
    const int32_t expected_length = static_cast<int32_t>(expected_[i].size());
    ASSERT_EQ(pos, strings_->value_offset(i));
    ASSERT_EQ(expected_length, strings_->value_length(i));
    pos += expected_length;
  }
}

// Builds a second array over the fixture's buffers and lets it go out of
// scope at the end of the test body.
TEST_F(TestBinaryContainer, TestDestructor) {
  std::shared_ptr<BinaryArray> arr(std::make_shared<BinaryArray>(
      length_, offsets_buf_, values_, null_count_, null_bitmap_));
}

// Checks every slot: slots flagged invalid must report null, and every other
// slot must return exactly the expected bytes. The returned length is asserted
// before the byte comparison; without it a too-short (or zero) length would
// make the memcmp pass vacuously.
TEST_F(TestBinaryContainer, TestGetValue) {
  for (size_t i = 0; i < expected_.size(); ++i) {
    if (valid_bytes_[i] == 0) {
      ASSERT_TRUE(strings_->IsNull(i));
    } else {
      int32_t len = -1;
      const uint8_t* bytes = strings_->GetValue(i, &len);
      ASSERT_EQ(static_cast<int32_t>(expected_[i].size()), len);
      ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len));
    }
  }
}

class TestBinaryBuilder : public TestBuilder {
public:
void SetUp() {
TestBuilder::SetUp();
type_ = TypePtr(new BinaryType());
builder_.reset(new BinaryBuilder(pool_, type_));
}

void Done() {
result_ = std::dynamic_pointer_cast<BinaryArray>(builder_->Finish());
result_->Validate();
}

protected:
TypePtr type_;

std::unique_ptr<BinaryBuilder> builder_;
std::shared_ptr<BinaryArray> result_;
};

// Appends the same five-entry pattern (one null per pass) `reps` times through
// Append/AppendNull, then checks the finished array slot by slot.
TEST_F(TestBinaryBuilder, TestScalarAppend) {
  const std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
  const std::vector<uint8_t> is_null = {0, 0, 0, 1, 0};

  const int N = strings.size();
  const int reps = 1000;
  const int total = reps * N;

  for (int k = 0; k < total; ++k) {
    const int src = k % N;
    if (is_null[src]) {
      builder_->AppendNull();
      continue;
    }
    const std::string& value = strings[src];
    builder_->Append(reinterpret_cast<const uint8_t*>(value.data()), value.size());
  }
  Done();
  ASSERT_OK(result_->Validate());
  ASSERT_EQ(reps * N, result_->length());
  ASSERT_EQ(reps, result_->null_count());
  // Each pass appends "bb", "a" and "ccc": 2 + 1 + 3 = 6 value bytes.
  ASSERT_EQ(reps * 6, result_->values()->length());

  for (int k = 0; k < total; ++k) {
    const int src = k % N;
    if (is_null[src]) {
      ASSERT_TRUE(result_->IsNull(k));
      continue;
    }
    ASSERT_FALSE(result_->IsNull(k));
    int32_t length;
    const uint8_t* vals = result_->GetValue(k, &length);
    ASSERT_EQ(strings[src].size(), length);
    ASSERT_EQ(0, std::memcmp(vals, strings[src].data(), length));
  }
}

// Finishes the builder without appending anything; the test only exercises
// Done() on an empty builder.
TEST_F(TestBinaryBuilder, TestZeroLength) {
// All buffers are null
Done();
}

} // namespace arrow
Loading