Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
8a51053
Add first implementation of mask_first_n and mask_last_n
augustoasilva Oct 25, 2021
dfa432a
Add unit tests for mask_first_n and mask_last_n
augustoasilva Oct 25, 2021
8767ad3
Add mask_first_n and mask_last_n to function registry
augustoasilva Oct 26, 2021
fd20f5a
Add mask_first_n and mask_last_n projector test
augustoasilva Oct 26, 2021
579998a
Update mask_first_n and mask_last_n to be more performant
augustoasilva Nov 5, 2021
8544adf
Add static array mapping ASCII chars to be masked
augustoasilva Nov 5, 2021
ac7222e
Add test case when n to mask is greater than string to mask length
augustoasilva Nov 10, 2021
206cb4a
Fix runtime error that may occur when running TestProjector.TestMaskF…
augustoasilva Nov 10, 2021
f70e799
Fix possible overflow on memcpy
augustoasilva Nov 10, 2021
11fd91b
Move mask_first_n and mask_last_n to precompiled string_ops
augustoasilva Nov 11, 2021
3ed0c6d
Make mask_first_n and mask_last_n to properly handle multibyte utf8 c…
augustoasilva Nov 19, 2021
3a84370
Fix remaining conflicts after rebase with master
augustoasilva Nov 19, 2021
1b9809a
Fix building error on Ubuntu ASAN
augustoasilva Nov 19, 2021
dd4fd7e
Fix linter error
augustoasilva Nov 19, 2021
98ee8d1
Fix ubuntu asan build error
augustoasilva Nov 19, 2021
195e65e
Fix wrong utf8_char assignment to out data str
augustoasilva Nov 23, 2021
6862b5b
Rename mask_first/last_n vars and improve mask_last_n
augustoasilva Nov 23, 2021
1ec99a2
Fix cast from size_t to int32_t
augustoasilva Nov 23, 2021
d9833b6
Fix rebase leftover errors
augustoasilva Nov 23, 2021
0d46c4c
Fix possible loss of data on castings
augustoasilva Nov 23, 2021
0e7d45b
Fix building failures
augustoasilva Nov 24, 2021
c516663
Improve mask_first_n and mask_last_n
augustoasilva Nov 24, 2021
5383329
Improve mask_first_n to handle better multibyte utf8 chars
augustoasilva Nov 24, 2021
c1878b0
Improve mask_last_n to better handle utf8 chars
augustoasilva Nov 30, 2021
f4e45bf
Fix conversion narrowing from 'utf8proc_ssize_t' to 'int'
augustoasilva Nov 30, 2021
7eb2623
Fix rebase leftovers
augustoasilva Nov 30, 2021
65f37e2
Move memcpy before the masking the chars that will be masked
augustoasilva Dec 1, 2021
effb691
Improve mask_last_n by applying reviews
augustoasilva Dec 2, 2021
a234c73
Change the validation of non-utf8 chars for the utf8proc_decompose's …
augustoasilva Dec 7, 2021
8f9b671
Fix rebase leftover after resolving conflicts
augustoasilva Dec 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {

NativeFunction("aes_decrypt", {}, DataTypeVector{utf8(), utf8()}, utf8(),
kResultNullIfNull, "gdv_fn_aes_decrypt",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("mask_first_n", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "gdv_mask_first_n_utf8_int32",
NativeFunction::kNeedsContext),

NativeFunction("mask_last_n", {}, DataTypeVector{utf8(), int32()}, utf8(),
kResultNullIfNull, "gdv_mask_last_n_utf8_int32",
NativeFunction::kNeedsContext)};

return string_fn_registry_;
}
Expand Down
233 changes: 232 additions & 1 deletion cpp/src/gandiva/gdv_function_stubs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

#include <utf8proc.h>

#include <iostream>
#include <string>
#include <vector>

Expand All @@ -45,6 +44,24 @@

extern "C" {

// Lookup table used to mask single-byte (ASCII) characters, indexed by byte value:
//   '0'..'9' -> 'n', 'A'..'Z' -> 'X', 'a'..'z' -> 'x'; every other ASCII byte maps
//   to itself.
// Only the first 128 entries are listed; entries 128..255 are zero-initialized.
// The table is read-only, so it is declared const.
static const char mask_array[256] = {
    0,   1,   2,   3,   4,   5,   6,   7,    // 0..7
    8,   9,   10,  11,  12,  13,  14,  15,   // 8..15
    16,  17,  18,  19,  20,  21,  22,  23,   // 16..23
    24,  25,  26,  27,  28,  29,  30,  31,   // 24..31
    32,  33,  34,  35,  36,  37,  38,  39,   // 32..39
    40,  41,  42,  43,  44,  45,  46,  47,   // 40..47
    'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n',  // '0'..'7'
    'n', 'n', 58,  59,  60,  61,  62,  63,   // '8', '9', then 58..63
    64,  'X', 'X', 'X', 'X', 'X', 'X', 'X',  // '@', 'A'..'G'
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',  // 'H'..'O'
    'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',  // 'P'..'W'
    'X', 'X', 'X', 91,  92,  93,  94,  95,   // 'X'..'Z', then 91..95
    96,  'x', 'x', 'x', 'x', 'x', 'x', 'x',  // '`', 'a'..'g'
    'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x',  // 'h'..'o'
    'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x',  // 'p'..'w'
    'x', 'x', 'x', 123, 124, 125, 126, 127}; // 'x'..'z', then 123..127

bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
const char* pattern, int pattern_len) {
gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
Expand Down Expand Up @@ -892,6 +909,203 @@ const char* gdv_fn_aes_decrypt(int64_t context, const char* data, int32_t data_l

return ret;
}

// Masks the first `n_to_mask` characters of the UTF-8 string `data`.
//
// Uppercase letters are replaced by 'X', lowercase letters by 'x' and digits by
// 'n'; every other character is copied unchanged. A masked multi-byte character
// is replaced by a single byte, so the result can be shorter than the input.
//
// \param context    execution context, used for arena allocation and error messages
// \param data       input string bytes
// \param data_len   length of `data` in bytes
// \param n_to_mask  number of characters (not bytes) to mask, counted from the start
// \param out_len    receives the length in bytes of the returned string
// \return nullptr with *out_len == 0 when data_len <= 0, when allocation fails, or
//         when invalid UTF-8 is met while masking (an error message is set on the
//         context in the last two cases); `data` itself when n_to_mask <= 0;
//         otherwise an arena-allocated buffer of *out_len bytes.
GANDIVA_EXPORT
const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
                                        int32_t data_len, int32_t n_to_mask,
                                        int32_t* out_len) {
  if (data_len <= 0) {
    *out_len = 0;
    return nullptr;
  }

  // A string cannot hold more characters than bytes, so clamping to the byte
  // length never reduces the number of characters that end up masked.
  if (n_to_mask > data_len) {
    n_to_mask = data_len;
  }

  *out_len = data_len;

  if (n_to_mask <= 0) {
    return data;
  }

  // The output is never longer than the input, so data_len bytes are enough.
  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
  if (out == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return nullptr;
  }

  // Fast path: mask leading single-byte characters through the lookup table and
  // stop at the first byte that belongs to a multi-byte sequence.
  int bytes_masked;
  for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
    unsigned char char_single_byte = data[bytes_masked];
    if (char_single_byte > 127) {
      // found a multi-byte utf-8 char
      break;
    }
    out[bytes_masked] = mask_array[char_single_byte];
  }

  // Up to this point every masked character was exactly one byte long.
  int chars_masked = bytes_masked;
  int out_idx = bytes_masked;

  // Handle multibyte utf8 characters
  utf8proc_int32_t utf8_char;
  while ((chars_masked < n_to_mask) && (bytes_masked < data_len)) {
    // Pass only the bytes that are still unread: handing over the full data_len
    // here would let a truncated trailing sequence be decoded past the end of
    // the input buffer.
    auto char_len =
        utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_masked),
                         data_len - bytes_masked, &utf8_char);

    if (char_len < 0) {
      gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len));
      *out_len = 0;
      return nullptr;
    }

    // The numeric labels are utf8proc_category_t values (see utf8proc.h).
    switch (utf8proc_category(utf8_char)) {
      case 1:  // UTF8PROC_CATEGORY_LU: uppercase letter
        out[out_idx] = 'X';
        out_idx++;
        break;
      case 2:  // UTF8PROC_CATEGORY_LL: lowercase letter
        out[out_idx] = 'x';
        out_idx++;
        break;
      case 9:   // UTF8PROC_CATEGORY_ND: decimal digit
      case 10:  // UTF8PROC_CATEGORY_NL: letter-like number
        out[out_idx] = 'n';
        out_idx++;
        break;
      default:
        // Not a letter or digit: keep the original bytes of the character.
        memcpy(out + out_idx, data + bytes_masked, char_len);
        out_idx += static_cast<int>(char_len);
        break;
    }
    bytes_masked += static_cast<int>(char_len);
    chars_masked++;
  }

  // Correct the out_len after masking multibyte characters with single byte characters
  *out_len = data_len - (bytes_masked - out_idx);

  // Copy the unmasked tail of the input verbatim.
  if (bytes_masked < data_len) {
    memcpy(out + out_idx, data + bytes_masked, data_len - bytes_masked);
  }

  return out;
}

// Masks the last `n_to_mask` characters of the UTF-8 string `data`.
// Uppercase letters become 'X', lowercase letters 'x' and digits 'n'; other
// characters are copied unchanged. Returns nullptr with *out_len == 0 when
// data_len <= 0, when allocation fails or when utf8proc_decompose reports an
// error; returns `data` itself when n_to_mask <= 0; otherwise returns a buffer
// obtained from gdv_fn_context_arena_malloc holding *out_len bytes.
GANDIVA_EXPORT
const char* gdv_mask_last_n_utf8_int32(int64_t context, const char* data,
int32_t data_len, int32_t n_to_mask,
int32_t* out_len) {
if (data_len <= 0) {
*out_len = 0;
return nullptr;
}

// n_to_mask counts characters while data_len counts bytes; a string cannot have
// more characters than bytes, so this clamp only bounds the value.
if (n_to_mask > data_len) {
n_to_mask = data_len;
}

*out_len = data_len;

if (n_to_mask <= 0) {
return data;
}

// The output is never longer than the input: masked characters shrink to one byte.
char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
if (out == nullptr) {
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
*out_len = 0;
return nullptr;
}

// Scan the whole input once to decide whether the pure single-byte path applies.
bool has_multi_byte = false;
for (int i = 0; i < data_len; i++) {
unsigned char char_single_byte = data[i];
if (char_single_byte > 127) {
// found a multi-byte utf-8 char
has_multi_byte = true;
break;
}
}

// Single-byte path: bytes and characters coincide, so copy the prefix and mask
// the last n_to_mask bytes through the lookup table.
if (!has_multi_byte) {
int start_idx = data_len - n_to_mask;
memcpy(out, data, start_idx);
for (int i = start_idx; i < data_len; ++i) {
unsigned char char_single_byte = data[i];
out[i] = mask_array[char_single_byte];
}
*out_len = data_len;
return out;
}

// NOTE(review): utf8proc_decompose appears to be used here only to obtain the
// number of code points (and to reject invalid UTF-8); the 4-entry buffer size
// is discussed in the review thread below — confirm against the utf8proc docs.
utf8proc_int32_t utf8_char_buffer;
int num_of_chars = static_cast<int>(
utf8proc_decompose(reinterpret_cast<const utf8proc_uint8_t*>(data), data_len,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does this function do? Is it using utf8proc_iterate() to find the number of chars?

Also, why does it need the data_len parameter twice?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It loops through the entire string looking for chars (valid codepoints) acording to the option that is passed. Differently from the utf8proc_iterate() that loops through a sequence of bytes to find the first valid char (valid code point).

It does not, the second data_len is passed as the size of the buffer to be used, for the tests I used data_len and 4 (as the utf8 chars can have only up to 4 bytes) And if forgot to change this to 4 before pushing it to the repo.

&utf8_char_buffer, 4, UTF8PROC_STABLE));

if (num_of_chars < 0) {
gdv_fn_context_set_error_msg(context, utf8proc_errmsg(num_of_chars));
*out_len = 0;
return nullptr;
}

// Skip the leading (num_of_chars - n_to_mask) characters, which stay unmasked.
// When n_to_mask exceeds num_of_chars the bound is negative and nothing is skipped.
// NOTE(review): utf8proc_iterate is given data_len rather than the bytes remaining
// after bytes_read, and char_len is not checked for a negative (error) value.
utf8proc_int32_t utf8_char;
int chars_counter = 0;
int bytes_read = 0;
while ((bytes_read < data_len) && (chars_counter < (num_of_chars - n_to_mask))) {
auto char_len =
utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_read),
data_len, &utf8_char);
chars_counter++;
bytes_read += static_cast<int>(char_len);
}

// out_idx is the write position in out; offset_idx is the byte length of the
// unmasked prefix.
int out_idx = bytes_read;
int offset_idx = bytes_read;

// Populate the first chars, that are not masked
memcpy(out, data, offset_idx);

// Mask every remaining character; a masked character is written as one byte.
// NOTE(review): same length argument and missing char_len check as in the loop above.
while (bytes_read < data_len) {
auto char_len =
utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_read),
data_len, &utf8_char);
// The numeric labels are presumably utf8proc_category_t values
// (1 = LU, 2 = LL, 9 = ND, 10 = NL) — confirm against utf8proc.h.
switch (utf8proc_category(utf8_char)) {
case 1:
out[out_idx] = 'X';
out_idx++;
break;
case 2:
out[out_idx] = 'x';
out_idx++;
break;
case 9:
out[out_idx] = 'n';
out_idx++;
break;
case 10:
out[out_idx] = 'n';
out_idx++;
break;
default:
// Any other category: copy the character's original bytes unchanged.
memcpy(out + out_idx, data + bytes_read, char_len);
out_idx += static_cast<int>(char_len);
break;
}
bytes_read += static_cast<int>(char_len);
}

// out_idx now equals the number of bytes written to out.
*out_len = out_idx;

return out;
}
}

namespace gandiva {
Expand Down Expand Up @@ -1938,5 +2152,22 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_aes_decrypt",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(gdv_fn_aes_decrypt));

// gdv_mask_first_n and gdv_mask_last_n
std::vector<llvm::Type*> mask_args = {
types->i64_type(), // context
types->i8_ptr_type(), // data
types->i32_type(), // data_length
types->i32_type(), // n_to_mask
types->i32_ptr_type() // out_length
};

engine->AddGlobalMappingForFunc("gdv_mask_first_n_utf8_int32",
types->i8_ptr_type() /*return_type*/, mask_args,
reinterpret_cast<void*>(gdv_mask_first_n_utf8_int32));

engine->AddGlobalMappingForFunc("gdv_mask_last_n_utf8_int32",
types->i8_ptr_type() /*return_type*/, mask_args,
reinterpret_cast<void*>(gdv_mask_last_n_utf8_int32));
}
} // namespace gandiva
10 changes: 10 additions & 0 deletions cpp/src/gandiva/gdv_function_stubs.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,4 +187,14 @@ GANDIVA_EXPORT
const char* gdv_fn_aes_decrypt(int64_t context, const char* data, int32_t data_len,
const char* key_data, int32_t key_data_len,
int32_t* out_len);

// Masks the first `n_to_mask` characters of the UTF-8 string `data` (`data_len`
// bytes): uppercase letters become 'X', lowercase letters 'x' and digits 'n'.
// Writes the result length in bytes to `out_len` and returns the result, or
// nullptr with *out_len == 0 for empty input or on failure. `context` is used
// for allocation and for reporting error messages.
GANDIVA_EXPORT
const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
int32_t data_len, int32_t n_to_mask,
int32_t* out_len);

// Masks the last `n_to_mask` characters of the UTF-8 string `data` (`data_len`
// bytes): uppercase letters become 'X', lowercase letters 'x' and digits 'n'.
// Writes the result length in bytes to `out_len` and returns the result, or
// nullptr with *out_len == 0 for empty input or on failure. `context` is used
// for allocation and for reporting error messages.
GANDIVA_EXPORT
const char* gdv_mask_last_n_utf8_int32(int64_t context, const char* data,
int32_t data_len, int32_t n_to_mask,
int32_t* out_len);
}
Loading