File tree Expand file tree Collapse file tree 3 files changed +21
-7
lines changed Expand file tree Collapse file tree 3 files changed +21
-7
lines changed Original file line number Diff line number Diff line change @@ -108,7 +108,7 @@ impl LanguageType {
108
108
// first character in the column, so removing starting whitespace
109
109
// could cause a miscount.
110
110
let line = if is_fortran { line } else { line. trim ( ) } ;
111
- let tokens = Self :: count_tokens ( & String :: from_utf8_lossy ( line) ) ;
111
+ let tokens = crate :: tokens :: count_tokens_from_bytes ( line) ;
112
112
if line. trim ( ) . is_empty ( ) {
113
113
( 1 , 0 , 0 , tokens)
114
114
} else if is_literate
@@ -134,11 +134,6 @@ impl LanguageType {
134
134
}
135
135
}
136
136
137
- fn count_tokens ( text : & str ) -> usize {
138
- let bpe = tiktoken_rs:: p50k_base ( ) . unwrap ( ) ;
139
- bpe. encode_with_special_tokens ( text) . len ( )
140
- }
141
-
142
137
#[ inline]
143
138
fn parse_lines (
144
139
self ,
@@ -218,7 +213,7 @@ impl LanguageType {
218
213
}
219
214
}
220
215
221
- let tokens = Self :: count_tokens ( & String :: from_utf8_lossy ( lines) ) ;
216
+ let tokens = crate :: tokens :: count_tokens_from_bytes ( lines) ;
222
217
stats. tokens += tokens;
223
218
224
219
stats
Original file line number Diff line number Diff line change @@ -54,6 +54,7 @@ mod consts;
54
54
mod language;
55
55
mod sort;
56
56
mod stats;
57
+ mod tokens;
57
58
58
59
pub use self :: {
59
60
config:: Config ,
Original file line number Diff line number Diff line change
1
+ use once_cell:: sync:: Lazy ;
2
+ use tiktoken_rs:: CoreBPE ;
3
+
4
+ static TOKENIZER : Lazy < CoreBPE > = Lazy :: new ( || tiktoken_rs:: p50k_base ( ) . unwrap ( ) ) ;
5
+
6
+ pub fn count_tokens ( text : & str ) -> usize {
7
+ TOKENIZER . encode_with_special_tokens ( text) . len ( )
8
+ }
9
+
10
+ pub fn count_tokens_from_bytes ( bytes : & [ u8 ] ) -> usize {
11
+ match std:: str:: from_utf8 ( bytes) {
12
+ Ok ( text) => count_tokens ( text) ,
13
+ Err ( _) => {
14
+ let text = String :: from_utf8_lossy ( bytes) ;
15
+ count_tokens ( & text)
16
+ }
17
+ }
18
+ }
You can’t perform that action at this time.
0 commit comments