Commit 23979e7

Remove boost library. (#3215)
* Remove boost library.
* add conditional include for gtest
* Add test, demo exclude
1 parent 6fa2df5 commit 23979e7

File tree

13 files changed (+2893 / −98 lines)


faster_tokenizer/CMakeLists.txt

Lines changed: 5 additions & 8 deletions
@@ -102,15 +102,15 @@ endforeach()
 ELSE(WIN32)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fPIC")
-IF (LINUX)
+IF (NOT APPLE)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ldl -lpthread")
 ENDIF()
 set (PUBLIC_DEPEND_LIBS ${CMAKE_DL_LIBS})
 ENDIF(WIN32)

 # For OpenMP
 # openmp not support well for now on windows
-if (LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
 find_package(OpenMP)
 if (OPENMP_FOUND)
 add_definitions(-DWITH_OMP)
@@ -143,7 +143,7 @@ if(WITH_PYTHON)

 add_subdirectory(python)

-if(LINUX)
+if (NOT APPLE AND NOT WIN32) # Linux
 add_custom_target(build_tokenizers_bdist_wheel ALL
 COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel --plat-name=manylinux1_x86_64
 COMMENT "Packing whl packages------>>>"
@@ -168,6 +168,8 @@ file(COPY ${PROJECT_SOURCE_DIR}/FasterTokenizer.cmake DESTINATION ${CPP_PACKAGE_
 # copy headers
 file(COPY ${PROJECT_SOURCE_DIR}/faster_tokenizer/ DESTINATION ${CPP_PACKAGE_DIR}/include/faster_tokenizer/
 FILES_MATCHING PATTERN "*.h"
+PATTERN "test" EXCLUDE
+PATTERN "demo" EXCLUDE
 PATTERN "pybind" EXCLUDE)

 add_custom_target(copy_third_party_headers ALL
@@ -177,11 +179,6 @@ add_custom_target(copy_third_party_headers ALL
 ${CPP_PACKAGE_DIR}/third_party/include
 DEPENDS build_cpp_package_dir)

-add_custom_target(copy_boost_headers ALL
-COMMAND ${CMAKE_COMMAND} -E copy_directory
-${BOOST_INCLUDE_DIR}/boost ${CPP_PACKAGE_DIR}/third_party/include/boost
-DEPENDS build_cpp_package_dir)
-
 # copy library
 set(TOKENIZER_CORE_NAME "core_tokenizers")
 set(TOKENIZER_CORE_PATH ${CMAKE_BINARY_DIR}/faster_tokenizer)

faster_tokenizer/cmake/external/boost.cmake

Lines changed: 0 additions & 49 deletions
This file was deleted.

faster_tokenizer/cmake/third_party.cmake

Lines changed: 3 additions & 2 deletions
@@ -18,11 +18,12 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
 "A path setting third party libraries download & build directories.")

 include(external/icu)
-include(external/gtest)
+if(WITH_TESTING)
+  include(external/gtest)
+endif()
 include(external/gflags)
 include(external/glog)
 include(external/re2)
-include(external/boost)
 include(external/nlohmann_json)
 include(external/dart) # For trie
 if (WITH_PYTHON)
Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
 cc_library(added_vocabulary SRCS added_vocabulary.cc DEPS normalizers pretokenizers json)
 cc_library(tokenizer SRCS tokenizer.cc DEPS added_vocabulary json decoders trie models postprocessors)
 cc_library(core SRCS encoding.cc DEPS json)
-add_dependencies(tokenizer extern_boost)

faster_tokenizer/faster_tokenizer/core/tokenizer.cc

Lines changed: 7 additions & 7 deletions
@@ -163,7 +163,7 @@ bool Tokenizer::DoPreTokenize(
 return true;
 }

-struct InputStringVisitor : public boost::static_visitor<> {
+struct InputStringVisitor {
 InputStringVisitor(const Tokenizer* tokenizer,
 uint32_t type_id,
 OffsetType offset_type,
@@ -190,8 +190,8 @@ void Tokenizer::EncodeSingleString(const InputString& input_string,
 uint32_t type_id,
 OffsetType offset_type,
 Encoding* encodings) const {
-boost::apply_visitor(
-InputStringVisitor(this, type_id, offset_type, encodings), input_string);
+paddlenlp::visit(InputStringVisitor(this, type_id, offset_type, encodings),
+input_string);
 }

 void Tokenizer::PostProcess(Encoding* encoding,
@@ -234,13 +234,13 @@ void Tokenizer::EncodePairStrings(const EncodeInput& encode_input,
 bool add_special_tokens) const {
 Encoding encoding;
 if (encode_input.type() == typeid(InputString)) {
-const auto& input_string = boost::get<InputString>(encode_input);
+const auto& input_string = paddlenlp::get<InputString>(encode_input);
 EncodeSingleString(input_string, 0, OffsetType::CHAR, &encoding);
 PostProcess(&encoding, nullptr, add_special_tokens, encodings);
 } else {
 Encoding pair_encoding;
 const auto& input_string_pair =
-boost::get<std::pair<InputString, InputString>>(encode_input);
+paddlenlp::get<std::pair<InputString, InputString>>(encode_input);
 EncodeSingleString(input_string_pair.first, 0, OffsetType::CHAR, &encoding);
 EncodeSingleString(
 input_string_pair.second, 1, OffsetType::CHAR, &pair_encoding);
@@ -273,9 +273,9 @@ void Tokenizer::EncodeBatchStrings(
 void Tokenizer::EncodePairStringsCharOffsets(const EncodeInput& encode_input,
 Encoding* encodings,
 bool add_special_tokens) const {
-const auto& input_string = boost::get<InputString>(&encode_input);
+const auto& input_string = paddlenlp::get_if<InputString>(&encode_input);
 const auto& input_string_pair =
-boost::get<std::pair<InputString, InputString>>(&encode_input);
+paddlenlp::get_if<std::pair<InputString, InputString>>(&encode_input);
 Encoding encoding;
 Encoding pair_encoding;
 if (input_string != nullptr) {
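The visitor changes above drop the boost::static_visitor<> base class and swap boost::apply_visitor for paddlenlp::visit. Below is a minimal sketch of that pattern, using std::variant and std::visit as stand-ins (the bundled faster_tokenizer/utils/variant.h is assumed to mirror the std::variant interface) and a hypothetical InputStringPrinter functor in place of InputStringVisitor:

#include <iostream>
#include <string>
#include <variant>
#include <vector>

// Stand-in alias: the real code uses paddlenlp::variant from utils/variant.h.
using InputString = std::variant<std::string, std::vector<std::string>>;

// With boost the visitor had to derive from boost::static_visitor<>; with a
// std::variant-style API a plain functor with operator() overloads is enough.
struct InputStringPrinter {
  void operator()(const std::string& s) const {
    std::cout << "raw string: " << s << "\n";
  }
  void operator()(const std::vector<std::string>& v) const {
    std::cout << "pretokenized input with " << v.size() << " pieces\n";
  }
};

int main() {
  InputString raw = std::string("hello world");
  InputString pretok = std::vector<std::string>{"hello", "world"};
  std::visit(InputStringPrinter{}, raw);     // boost::apply_visitor -> visit
  std::visit(InputStringPrinter{}, pretok);
  return 0;
}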

faster_tokenizer/faster_tokenizer/core/tokenizer.h

Lines changed: 3 additions & 3 deletions
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "faster_tokenizer/core/added_vocabulary.h"
 #include "faster_tokenizer/core/base.h"
 #include "faster_tokenizer/utils/utils.h"
-#include "boost/variant.hpp"
+#include "faster_tokenizer/utils/variant.h"
 #include "nlohmann/json.hpp"

 namespace paddlenlp {
@@ -56,9 +56,9 @@ namespace core {
 class AddedVocabulary;
 class Encoding;

-using InputString = boost::variant<std::string, std::vector<std::string>>;
+using InputString = paddlenlp::variant<std::string, std::vector<std::string>>;
 using EncodeInput =
-boost::variant<InputString, std::pair<InputString, InputString>>;
+paddlenlp::variant<InputString, std::pair<InputString, InputString>>;

 class FASTERTOKENIZER_DECL Tokenizer {
 public:
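After the header switch from boost/variant.hpp to faster_tokenizer/utils/variant.h, the InputString and EncodeInput aliases keep the same shape. A short sketch of how a caller might build and query an EncodeInput, with std::variant standing in for paddlenlp::variant and under the assumption that paddlenlp::get and paddlenlp::get_if behave like their std counterparts: the reference form throws on a mismatched alternative, while the pointer form returns nullptr, which is why the pointer-style boost::get(&v) calls in tokenizer.cc map to get_if rather than get.

#include <cassert>
#include <string>
#include <utility>
#include <variant>
#include <vector>

// Same aliases as tokenizer.h, but built on std::variant for illustration.
using InputString = std::variant<std::string, std::vector<std::string>>;
using EncodeInput =
    std::variant<InputString, std::pair<InputString, InputString>>;

int main() {
  // A single sentence and a sentence pair are both valid EncodeInput values.
  EncodeInput single = InputString{std::string("hello world")};
  EncodeInput pair = std::make_pair(InputString{std::string("question")},
                                    InputString{std::string("context")});

  // Reference form: throws std::bad_variant_access if the alternative mismatches.
  const auto& s = std::get<InputString>(single);
  (void)s;

  // Pointer form: nullptr on mismatch, so callers can branch without exceptions.
  assert(std::get_if<std::pair<InputString, InputString>>(&single) == nullptr);
  assert(std::get_if<std::pair<InputString, InputString>>(&pair) != nullptr);
  return 0;
}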
Lines changed: 0 additions & 1 deletion
@@ -1,2 +1 @@
 cc_library(decoders SRCS wordpiece.cc DEPS json utils)
-add_dependencies(decoders extern_boost)
Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
 cc_library(models
 SRCS wordpiece.cc faster_wordpiece.cc bpe.cc unigram.cc
-DEPS core json boost trie failure icuuc icudata lattice utils)
+DEPS core json trie failure icuuc icudata lattice utils)
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json boost)
+cc_library(postprocessors SRCS bert.cc postprocessor.cc template.cc DEPS core json)

faster_tokenizer/faster_tokenizer/postprocessors/template.cc

Lines changed: 18 additions & 16 deletions
@@ -16,8 +16,8 @@
 #include <string>

 #include "faster_tokenizer/core/encoding.h"
-#include "glog/logging.h"
 #include "faster_tokenizer/postprocessors/template.h"
+#include "glog/logging.h"

 namespace paddlenlp {
 namespace faster_tokenizer {
@@ -27,7 +27,7 @@ void ParseIdFromString(const std::string& template_id_string,
 TemplatePiece* template_piece) {
 if (template_id_string.find_first_of("$") == 0) {
 *template_piece = TemplateSequence();
-auto& seq = boost::get<TemplateSequence>(*template_piece);
+auto& seq = paddlenlp::get<TemplateSequence>(*template_piece);
 std::string rest =
 template_id_string.substr(template_id_string.find_first_not_of("$"));
 if (rest == "" || rest == "A" || rest == "a") {
@@ -48,15 +48,16 @@
 }
 } else {
 *template_piece = TemplateSpecialToken();
-boost::get<TemplateSpecialToken>(*template_piece) = {template_id_string, 0};
+paddlenlp::get<TemplateSpecialToken>(*template_piece) = {template_id_string,
+0};
 }
 }

 void SetTypeId(uint32_t type_id, TemplatePiece* template_piece) {
-if (boost::get<TemplateSequence>(template_piece) != nullptr) {
-boost::get<TemplateSequence>(*template_piece).second = type_id;
+if (paddlenlp::get_if<TemplateSequence>(template_piece) != nullptr) {
+paddlenlp::get<TemplateSequence>(*template_piece).second = type_id;
 } else {
-boost::get<TemplateSpecialToken>(*template_piece).second = type_id;
+paddlenlp::get<TemplateSpecialToken>(*template_piece).second = type_id;
 }
 }

@@ -84,8 +85,8 @@ void GetTemplatePieceFromString(const std::string& template_string,
 }

 void to_json(nlohmann::json& j, const TemplatePiece& template_piece) {
-if (boost::get<TemplateSequence>(&template_piece) != nullptr) {
-auto& template_sequence = boost::get<TemplateSequence>(template_piece);
+if (paddlenlp::get_if<TemplateSequence>(&template_piece) != nullptr) {
+auto& template_sequence = paddlenlp::get<TemplateSequence>(template_piece);
 j = {
 {"Sequence",
 {
@@ -95,7 +96,7 @@ void to_json(nlohmann::json& j, const TemplatePiece& template_piece) {
 };
 } else {
 auto& template_special_token =
-boost::get<TemplateSpecialToken>(template_piece);
+paddlenlp::get<TemplateSpecialToken>(template_piece);
 j = {
 {"SpecialToken",
 {
@@ -135,7 +136,7 @@ size_t TemplatePostProcessor::CountAdded(
 size_t count = 0;
 for (auto& piece : template_->pieces_) {
 TemplateSpecialToken* special_token =
-boost::get<TemplateSpecialToken>(&piece);
+paddlenlp::get_if<TemplateSpecialToken>(&piece);
 if (special_token != nullptr) {
 auto token_iter =
 special_tokens_map.tokens_map_.find(special_token->first);
@@ -244,8 +245,8 @@ void TemplatePostProcessor::ApplyTemplate(
 core::Encoding* result_encoding) const {
 size_t new_size = 0;
 for (auto&& piece : pieces.pieces_) {
-if (boost::get<TemplateSequence>(&piece) != nullptr) {
-auto seq_type = boost::get<TemplateSequence>(piece).first;
+if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+auto seq_type = paddlenlp::get<TemplateSequence>(piece).first;
 if (seq_type == SequenceType::SEQ_A) {
 new_size += encoding->GetLen();
 } else {
@@ -257,7 +258,8 @@
 }
 } else {
 if (add_special_tokens) {
-auto&& special_token = boost::get<TemplateSpecialToken>(piece).first;
+auto&& special_token =
+paddlenlp::get<TemplateSpecialToken>(piece).first;
 if (special_tokens_map_.tokens_map_.find(special_token) !=
 special_tokens_map_.tokens_map_.end()) {
 new_size +=
@@ -330,8 +332,8 @@
 }
 VLOG(6) << "Template pieces num: " << pieces.pieces_.size();
 for (auto& piece : pieces.pieces_) {
-if (boost::get<TemplateSequence>(&piece) != nullptr) {
-auto& template_sequence = boost::get<TemplateSequence>(piece);
+if (paddlenlp::get_if<TemplateSequence>(&piece) != nullptr) {
+auto& template_sequence = paddlenlp::get<TemplateSequence>(piece);
 if (template_sequence.first == SequenceType::SEQ_A) {
 auto seq_start = ids.size();
 auto seq_end = seq_start + encoding->GetLen();
@@ -385,7 +387,7 @@
 pair_encoding->GetAttentionMask().end());
 }
 } else {
-auto& special_token = boost::get<TemplateSpecialToken>(piece);
+auto& special_token = paddlenlp::get<TemplateSpecialToken>(piece);
 if (add_special_tokens) {
 const std::string& id = special_token.first;
 uint32_t type_id = special_token.second;
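template.cc relies throughout on the same split seen above: get_if<T>(&piece) to probe which alternative a TemplatePiece currently holds, then get<T>(piece) to read or mutate it. A compact sketch of that dispatch, with std::variant standing in for paddlenlp::variant and deliberately simplified TemplateSequence / TemplateSpecialToken definitions (the real declarations live in postprocessors/template.h):

#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <variant>

// Simplified stand-ins for the real types declared in postprocessors/template.h.
enum class SequenceType { SEQ_A, SEQ_B };
using TemplateSequence = std::pair<SequenceType, uint32_t>;     // (sequence, type_id)
using TemplateSpecialToken = std::pair<std::string, uint32_t>;  // (token, type_id)
using TemplatePiece = std::variant<TemplateSequence, TemplateSpecialToken>;

// Mirrors the SetTypeId pattern above: probe with get_if, then access with get.
void SetTypeId(uint32_t type_id, TemplatePiece* piece) {
  if (std::get_if<TemplateSequence>(piece) != nullptr) {
    std::get<TemplateSequence>(*piece).second = type_id;
  } else {
    std::get<TemplateSpecialToken>(*piece).second = type_id;
  }
}

int main() {
  TemplatePiece seq = TemplateSequence{SequenceType::SEQ_A, 0};
  TemplatePiece tok = TemplateSpecialToken{"[CLS]", 0};
  SetTypeId(1, &seq);
  SetTypeId(1, &tok);
  std::cout << std::get<TemplateSequence>(seq).second << " "
            << std::get<TemplateSpecialToken>(tok).second << "\n";  // prints: 1 1
  return 0;
}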
