Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

package org.apache.arrow.vector.dictionary;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.BaseIntVector;
import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;
Expand All @@ -31,29 +32,57 @@
*/
public class DictionaryEncoder {

// TODO recursively examine fields?
private final DictionaryHashTable hashTable;
private final Dictionary dictionary;
private final BufferAllocator allocator;

//TODO pass custom hasher.

/**
* Construct an instance.
*/
public DictionaryEncoder(Dictionary dictionary, BufferAllocator allocator) {
this.dictionary = dictionary;
this.allocator = allocator;
hashTable = new DictionaryHashTable(dictionary.getVector());
}

/**
* Dictionary encodes a vector with a provided dictionary. The dictionary must contain all values in the vector.
*
* @param vector vector to encode
* @param vector vector to encode
* @param dictionary dictionary used for encoding
* @return dictionary encoded vector
*/
public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
// load dictionary indices into a hash table for lookup
DictionaryHashTable hashTable = new DictionaryHashTable(dictionary.getVector());
for (int i = 0; i < dictionary.getVector().getValueCount(); i++) {
hashTable.put(i);
}
DictionaryEncoder encoder = new DictionaryEncoder(dictionary, vector.getAllocator());
return encoder.encode(vector);
}

/**
* Decodes a dictionary encoded array using the provided dictionary.
*
* @param indices dictionary encoded values, must be int type
* @param dictionary dictionary used to decode the values
* @return vector with values restored from dictionary
*/
public static ValueVector decode(ValueVector indices, Dictionary dictionary) {
DictionaryEncoder encoder = new DictionaryEncoder(dictionary, indices.getAllocator());
return encoder.decode(indices);
}

/**
* Encodes a vector with the built hash table in this encoder.
*/
public ValueVector encode(ValueVector vector) {

Field valueField = vector.getField();
FieldType indexFieldType = new FieldType(valueField.isNullable(), dictionary.getEncoding().getIndexType(),
dictionary.getEncoding(), valueField.getMetadata());
Field indexField = new Field(valueField.getName(), indexFieldType, null);

// vector to hold our indices (dictionary encoded values)
FieldVector createdVector = indexField.createVector(vector.getAllocator());
FieldVector createdVector = indexField.createVector(allocator);
if (! (createdVector instanceof BaseIntVector)) {
throw new IllegalArgumentException("Dictionary encoding does not have a valid int type:" +
createdVector.getClass());
Expand Down Expand Up @@ -82,18 +111,14 @@ public static ValueVector encode(ValueVector vector, Dictionary dictionary) {
}

/**
* Decodes a dictionary encoded array using the provided dictionary.
*
* @param indices dictionary encoded values, must be int type
* @param dictionary dictionary used to decode the values
* @return vector with values restored from dictionary
* Decodes a vector with the built hash table in this encoder.
*/
public static ValueVector decode(ValueVector indices, Dictionary dictionary) {
public ValueVector decode(ValueVector indices) {
int count = indices.getValueCount();
ValueVector dictionaryVector = dictionary.getVector();
int dictionaryCount = dictionaryVector.getValueCount();
// copy the dictionary values into the decoded vector
TransferPair transfer = dictionaryVector.getTransferPair(indices.getAllocator());
TransferPair transfer = dictionaryVector.getTransferPair(allocator);
transfer.getTo().allocateNewSafe();

BaseIntVector baseIntVector = (BaseIntVector) indices;
Expand All @@ -106,7 +131,6 @@ public static ValueVector decode(ValueVector indices, Dictionary dictionary) {
transfer.copyValueSafe(indexAsInt, i);
}
}
// TODO do we need to worry about the field?
ValueVector decoded = transfer.getTo();
decoded.setValueCount(count);
return decoded;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,11 @@ public DictionaryHashTable(int initialCapacity, ValueVector dictionary) {
this.threshold = initialCapacity;

this.dictionary = dictionary;

// build hash table
for (int i = 0; i < this.dictionary.getValueCount(); i++) {
put(i);
}
}

public DictionaryHashTable(ValueVector dictionary) {
Expand Down Expand Up @@ -146,7 +151,7 @@ public int getIndex(int indexInArray, ValueVector toEncode) {
/**
* put the index of dictionary vector to build hash table.
*/
public void put(int indexInDictionary) {
private void put(int indexInDictionary) {
if (table == EMPTY_TABLE) {
inflateTable(threshold);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ public void testEncodeStrings() {
Dictionary dictionary =
new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));

try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) {
try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

Expand Down Expand Up @@ -136,8 +136,7 @@ public void testEncodeLargeVector() {
Dictionary dictionary =
new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));


try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) {
try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

Expand Down Expand Up @@ -188,7 +187,7 @@ public void testEncodeList() {

Dictionary dictionary = new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));

try (final ValueVector encoded = (FieldVector) DictionaryEncoder.encode(vector, dictionary)) {
try (final ValueVector encoded = DictionaryEncoder.encode(vector, dictionary)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

Expand Down Expand Up @@ -576,6 +575,136 @@ public void testUnionEquals() {
}
}

@Test
public void testEncodeWithEncoderInstance() {
// Create a new value vector
try (final VarCharVector vector = newVarCharVector("vector", allocator);
final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {
vector.allocateNew(512, 5);

// set some values
vector.setSafe(0, zero, 0, zero.length);
vector.setSafe(1, one, 0, one.length);
vector.setSafe(2, one, 0, one.length);
vector.setSafe(3, two, 0, two.length);
vector.setSafe(4, zero, 0, zero.length);
vector.setValueCount(5);

// set some dictionary values
dictionaryVector.allocateNew(512, 3);
dictionaryVector.setSafe(0, zero, 0, zero.length);
dictionaryVector.setSafe(1, one, 0, one.length);
dictionaryVector.setSafe(2, two, 0, two.length);
dictionaryVector.setValueCount(3);

Dictionary dictionary =
new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator);

try (final ValueVector encoded = encoder.encode(vector)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

IntVector index = ((IntVector) encoded);
assertEquals(5, index.getValueCount());
assertEquals(0, index.get(0));
assertEquals(1, index.get(1));
assertEquals(1, index.get(2));
assertEquals(2, index.get(3));
assertEquals(0, index.get(4));

// now run through the decoder and verify we get the original back
try (ValueVector decoded = encoder.decode(encoded)) {
assertEquals(vector.getClass(), decoded.getClass());
assertEquals(vector.getValueCount(), (decoded).getValueCount());
for (int i = 0; i < 5; i++) {
assertEquals(vector.getObject(i), ((VarCharVector) decoded).getObject(i));
}
}
}
}
}

@Test
public void testEncodeMultiVectors() {
// Create a new value vector
try (final VarCharVector vector1 = newVarCharVector("vector1", allocator);
final VarCharVector vector2 = newVarCharVector("vector2", allocator);
final VarCharVector dictionaryVector = newVarCharVector("dict", allocator);) {

vector1.allocateNew(512, 5);

// set some values
vector1.setSafe(0, zero, 0, zero.length);
vector1.setSafe(1, one, 0, one.length);
vector1.setSafe(2, one, 0, one.length);
vector1.setSafe(3, two, 0, two.length);
vector1.setSafe(4, zero, 0, zero.length);
vector1.setValueCount(5);

vector2.allocateNew(512, 3);

// set some values
vector2.setSafe(0, zero, 0, zero.length);
vector2.setSafe(1, one, 0, one.length);
vector2.setSafe(2, one, 0, one.length);
vector2.setValueCount(3);

// set some dictionary values
dictionaryVector.allocateNew(512, 3);
dictionaryVector.setSafe(0, zero, 0, zero.length);
dictionaryVector.setSafe(1, one, 0, one.length);
dictionaryVector.setSafe(2, two, 0, two.length);
dictionaryVector.setValueCount(3);

Dictionary dictionary =
new Dictionary(dictionaryVector, new DictionaryEncoding(1L, false, null));
DictionaryEncoder encoder = new DictionaryEncoder(dictionary, allocator);

try (final ValueVector encoded = encoder.encode(vector1)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

IntVector index = ((IntVector) encoded);
assertEquals(5, index.getValueCount());
assertEquals(0, index.get(0));
assertEquals(1, index.get(1));
assertEquals(1, index.get(2));
assertEquals(2, index.get(3));
assertEquals(0, index.get(4));

// now run through the decoder and verify we get the original back
try (ValueVector decoded = encoder.decode(encoded)) {
assertEquals(vector1.getClass(), decoded.getClass());
assertEquals(vector1.getValueCount(), (decoded).getValueCount());
for (int i = 0; i < 5; i++) {
assertEquals(vector1.getObject(i), ((VarCharVector) decoded).getObject(i));
}
}
}

try (final ValueVector encoded = encoder.encode(vector2)) {
// verify indices
assertEquals(IntVector.class, encoded.getClass());

IntVector index = ((IntVector) encoded);
assertEquals(3, index.getValueCount());
assertEquals(0, index.get(0));
assertEquals(1, index.get(1));
assertEquals(1, index.get(2));

// now run through the decoder and verify we get the original back
try (ValueVector decoded = encoder.decode(encoded)) {
assertEquals(vector2.getClass(), decoded.getClass());
assertEquals(vector2.getValueCount(), (decoded).getValueCount());
for (int i = 0; i < 3; i++) {
assertEquals(vector2.getObject(i), ((VarCharVector) decoded).getObject(i));
}
}
}
}
}

private void writeStructVector(NullableStructWriter writer, int value1, long value2) {
writer.start();
writer.integer("f0").writeInt(value1);
Expand Down