-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-14482: [C++][Gandiva] Implement MASK_FIRST_N and MASK_LAST_N functions #11551
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8a51053
dfa432a
8767ad3
fd20f5a
579998a
8544adf
ac7222e
206cb4a
f70e799
11fd91b
3ed0c6d
3a84370
1b9809a
dd4fd7e
98ee8d1
195e65e
6862b5b
1ec99a2
d9833b6
0d46c4c
0e7d45b
c516663
5383329
c1878b0
f4e45bf
7eb2623
65f37e2
effb691
a234c73
8f9b671
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,7 +19,6 @@ | |
|
|
||
| #include <utf8proc.h> | ||
|
|
||
| #include <iostream> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
|
|
@@ -45,6 +44,24 @@ | |
|
|
||
| extern "C" { | ||
|
|
||
| static char mask_array[256] = { | ||
| (char)0, (char)1, (char)2, (char)3, (char)4, (char)5, (char)6, (char)7, | ||
| (char)8, (char)9, (char)10, (char)11, (char)12, (char)13, (char)14, (char)15, | ||
| (char)16, (char)17, (char)18, (char)19, (char)20, (char)21, (char)22, (char)23, | ||
| (char)24, (char)25, (char)26, (char)27, (char)28, (char)29, (char)30, (char)31, | ||
| (char)32, (char)33, (char)34, (char)35, (char)36, (char)37, (char)38, (char)39, | ||
| (char)40, (char)41, (char)42, (char)43, (char)44, (char)45, (char)46, (char)47, | ||
| 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', | ||
| 'n', 'n', (char)58, (char)59, (char)60, (char)61, (char)62, (char)63, | ||
| (char)64, 'X', 'X', 'X', 'X', 'X', 'X', 'X', | ||
| 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', | ||
| 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X', | ||
| 'X', 'X', 'X', (char)91, (char)92, (char)93, (char)94, (char)95, | ||
| (char)96, 'x', 'x', 'x', 'x', 'x', 'x', 'x', | ||
| 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', | ||
| 'x', 'x', 'x', 'x', 'x', 'x', 'x', 'x', | ||
| 'x', 'x', 'x', (char)123, (char)124, (char)125, (char)126, (char)127}; | ||
|
|
||
| bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len, | ||
| const char* pattern, int pattern_len) { | ||
| gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr); | ||
|
|
@@ -892,6 +909,203 @@ const char* gdv_fn_aes_decrypt(int64_t context, const char* data, int32_t data_l | |
|
|
||
| return ret; | ||
| } | ||
|
|
||
| GANDIVA_EXPORT | ||
| const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data, | ||
| int32_t data_len, int32_t n_to_mask, | ||
| int32_t* out_len) { | ||
| if (data_len <= 0) { | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| if (n_to_mask > data_len) { | ||
| n_to_mask = data_len; | ||
| } | ||
|
|
||
| *out_len = data_len; | ||
|
|
||
| if (n_to_mask <= 0) { | ||
| return data; | ||
| } | ||
|
|
||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len)); | ||
| if (out == nullptr) { | ||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| int bytes_masked; | ||
| for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) { | ||
| unsigned char char_single_byte = data[bytes_masked]; | ||
| if (char_single_byte > 127) { | ||
| // found a multi-byte utf-8 char | ||
| break; | ||
| } | ||
| out[bytes_masked] = mask_array[char_single_byte]; | ||
| } | ||
|
|
||
| int chars_masked = bytes_masked; | ||
| int out_idx = bytes_masked; | ||
|
|
||
| // Handle multibyte utf8 characters | ||
| utf8proc_int32_t utf8_char; | ||
| while ((chars_masked < n_to_mask) && (bytes_masked < data_len)) { | ||
| auto char_len = | ||
| utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_masked), | ||
| data_len, &utf8_char); | ||
|
|
||
| if (char_len < 0) { | ||
| gdv_fn_context_set_error_msg(context, utf8proc_errmsg(char_len)); | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| switch (utf8proc_category(utf8_char)) { | ||
| case 1: | ||
| out[out_idx] = 'X'; | ||
| out_idx++; | ||
| break; | ||
| case 2: | ||
| out[out_idx] = 'x'; | ||
| out_idx++; | ||
| break; | ||
| case 9: | ||
| out[out_idx] = 'n'; | ||
| out_idx++; | ||
| break; | ||
| case 10: | ||
| out[out_idx] = 'n'; | ||
| out_idx++; | ||
| break; | ||
| default: | ||
| memcpy(out + out_idx, data + bytes_masked, char_len); | ||
| out_idx += static_cast<int>(char_len); | ||
| break; | ||
| } | ||
| bytes_masked += static_cast<int>(char_len); | ||
| chars_masked++; | ||
| } | ||
|
|
||
| // Correct the out_len after masking multibyte characters with single byte characters | ||
| *out_len = *out_len - (bytes_masked - out_idx); | ||
|
|
||
| if (bytes_masked < data_len) { | ||
| memcpy(out + out_idx, data + bytes_masked, data_len - bytes_masked); | ||
| } | ||
|
|
||
| return out; | ||
| } | ||
|
|
||
| GANDIVA_EXPORT | ||
| const char* gdv_mask_last_n_utf8_int32(int64_t context, const char* data, | ||
| int32_t data_len, int32_t n_to_mask, | ||
| int32_t* out_len) { | ||
| if (data_len <= 0) { | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| if (n_to_mask > data_len) { | ||
| n_to_mask = data_len; | ||
| } | ||
|
|
||
| *out_len = data_len; | ||
|
|
||
| if (n_to_mask <= 0) { | ||
| return data; | ||
| } | ||
|
|
||
| char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len)); | ||
| if (out == nullptr) { | ||
| gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| bool has_multi_byte = false; | ||
| for (int i = 0; i < data_len; i++) { | ||
| unsigned char char_single_byte = data[i]; | ||
| if (char_single_byte > 127) { | ||
| // found a multi-byte utf-8 char | ||
| has_multi_byte = true; | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| if (!has_multi_byte) { | ||
| int start_idx = data_len - n_to_mask; | ||
| memcpy(out, data, start_idx); | ||
| for (int i = start_idx; i < data_len; ++i) { | ||
| unsigned char char_single_byte = data[i]; | ||
| out[i] = mask_array[char_single_byte]; | ||
| } | ||
| *out_len = data_len; | ||
| return out; | ||
| } | ||
|
|
||
| utf8proc_int32_t utf8_char_buffer; | ||
| int num_of_chars = static_cast<int>( | ||
| utf8proc_decompose(reinterpret_cast<const utf8proc_uint8_t*>(data), data_len, | ||
|
||
| &utf8_char_buffer, 4, UTF8PROC_STABLE)); | ||
|
|
||
| if (num_of_chars < 0) { | ||
| gdv_fn_context_set_error_msg(context, utf8proc_errmsg(num_of_chars)); | ||
| *out_len = 0; | ||
| return nullptr; | ||
| } | ||
|
|
||
| utf8proc_int32_t utf8_char; | ||
| int chars_counter = 0; | ||
| int bytes_read = 0; | ||
| while ((bytes_read < data_len) && (chars_counter < (num_of_chars - n_to_mask))) { | ||
| auto char_len = | ||
| utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_read), | ||
| data_len, &utf8_char); | ||
| chars_counter++; | ||
| bytes_read += static_cast<int>(char_len); | ||
| } | ||
|
|
||
| int out_idx = bytes_read; | ||
| int offset_idx = bytes_read; | ||
|
|
||
| // Populate the first chars, that are not masked | ||
| memcpy(out, data, offset_idx); | ||
|
|
||
| while (bytes_read < data_len) { | ||
| auto char_len = | ||
| utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + bytes_read), | ||
| data_len, &utf8_char); | ||
| switch (utf8proc_category(utf8_char)) { | ||
| case 1: | ||
| out[out_idx] = 'X'; | ||
| out_idx++; | ||
| break; | ||
| case 2: | ||
| out[out_idx] = 'x'; | ||
| out_idx++; | ||
| break; | ||
| case 9: | ||
| out[out_idx] = 'n'; | ||
| out_idx++; | ||
| break; | ||
| case 10: | ||
| out[out_idx] = 'n'; | ||
| out_idx++; | ||
| break; | ||
| default: | ||
| memcpy(out + out_idx, data + bytes_read, char_len); | ||
| out_idx += static_cast<int>(char_len); | ||
| break; | ||
| } | ||
| bytes_read += static_cast<int>(char_len); | ||
| } | ||
|
|
||
| *out_len = out_idx; | ||
|
|
||
| return out; | ||
| } | ||
| } | ||
|
|
||
| namespace gandiva { | ||
|
|
@@ -1938,5 +2152,22 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { | |
| engine->AddGlobalMappingForFunc("gdv_fn_aes_decrypt", | ||
| types->i8_ptr_type() /*return_type*/, args, | ||
| reinterpret_cast<void*>(gdv_fn_aes_decrypt)); | ||
|
|
||
| // gdv_mask_first_n and gdv_mask_last_n | ||
| std::vector<llvm::Type*> mask_args = { | ||
| types->i64_type(), // context | ||
| types->i8_ptr_type(), // data | ||
| types->i32_type(), // data_length | ||
| types->i32_type(), // n_to_mask | ||
| types->i32_ptr_type() // out_length | ||
| }; | ||
|
|
||
| engine->AddGlobalMappingForFunc("gdv_mask_first_n_utf8_int32", | ||
| types->i8_ptr_type() /*return_type*/, mask_args, | ||
| reinterpret_cast<void*>(gdv_mask_first_n_utf8_int32)); | ||
|
|
||
| engine->AddGlobalMappingForFunc("gdv_mask_last_n_utf8_int32", | ||
| types->i8_ptr_type() /*return_type*/, mask_args, | ||
| reinterpret_cast<void*>(gdv_mask_last_n_utf8_int32)); | ||
| } | ||
| } // namespace gandiva | ||
Uh oh!
There was an error while loading. Please reload this page.