Skip to content

Commit a545a37

Browse files
committed
TextDecoder: simplify API and introduce ByteView similar to string_view
The `Append(str, buffer)` approach was not deemed worth it, just to save 1 extra string allocation in very uncommon use cases. The new API `std::string BytesToUtf8(ByteView bytes, ECI eci)` is much cleaner. I deliberately opted against std::basic_string_view<uint8_t> and std::span<const uint8_t> for "reasons".
1 parent 8d702ce commit a545a37

File tree

7 files changed

+158
-174
lines changed

7 files changed

+158
-174
lines changed

core/src/ByteArray.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#pragma once
88

9+
#include "Range.h"
10+
911
#include <cstdint>
1012
#include <cstdio>
1113
#include <string>
@@ -15,7 +17,7 @@
1517
namespace ZXing {
1618

1719
/**
18-
ByteArray is an extension of std::vector<unsigned char>.
20+
ByteArray is an extension of std::vector<uint8_t>.
1921
*/
2022
class ByteArray : public std::vector<uint8_t>
2123
{
@@ -25,15 +27,20 @@ class ByteArray : public std::vector<uint8_t>
2527
explicit ByteArray(int len) : std::vector<uint8_t>(len, 0) {}
2628
explicit ByteArray(const std::string& str) : std::vector<uint8_t>(str.begin(), str.end()) {}
2729

28-
void append(const ByteArray& other) { insert(end(), other.begin(), other.end()); }
30+
void append(ByteView other) { insert(end(), other.begin(), other.end()); }
2931

3032
std::string_view asString(size_t pos = 0, size_t len = std::string_view::npos) const
3133
{
3234
return std::string_view(reinterpret_cast<const char*>(data()), size()).substr(pos, len);
3335
}
36+
37+
ByteView asView(size_t pos = 0, size_t len = size_t(-1)) const
38+
{
39+
return ByteView(*this).subview(pos, len);
40+
}
3441
};
3542

36-
inline std::string ToHex(const ByteArray& bytes)
43+
inline std::string ToHex(ByteView bytes)
3744
{
3845
std::string res(bytes.size() * 3, ' ');
3946

core/src/Content.cpp

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -104,39 +104,36 @@ std::string Content::render(bool withECI) const
104104

105105
#ifdef ZXING_READERS
106106
std::string res;
107+
res.reserve(bytes.size() * 2);
107108
if (withECI)
108-
res = symbology.toString(true);
109+
res += symbology.toString(true);
109110
ECI lastECI = ECI::Unknown;
110111
auto fallbackCS = defaultCharset;
111112
if (!hasECI && fallbackCS == CharacterSet::Unknown)
112113
fallbackCS = guessEncoding();
113114

114115
ForEachECIBlock([&](ECI eci, int begin, int end) {
115-
// first determine how to decode the content (choose character set)
116-
// * eci == ECI::Unknown implies !hasECI and we guess
117-
// * if !IsText(eci) the ToCharcterSet(eci) will return Unknown and we decode as binary
118-
CharacterSet cs = eci == ECI::Unknown ? fallbackCS : ToCharacterSet(eci);
119-
116+
// basic idea: if IsText(eci), we transcode it to UTF8, otherwise we treat it as binary but
117+
// transcode it to valid UTF8 byte sequences representing the code points 0-255. The eci we report
118+
// back to the caller by inserting their "\XXXXXX" ECI designator is UTF8 for text and
119+
// the original ECI for everything else.
120+
// first determine how to decode the content (use fallback if unknown)
121+
auto inEci = IsText(eci) ? eci : eci == ECI::Unknown ? ToECI(fallbackCS) : ECI::Binary;
120122
if (withECI) {
121123
// then find the eci to report back in the ECI designator
122-
if (IsText(ToECI(cs))) // everything decoded as text is reported as utf8
123-
eci = ECI::UTF8;
124-
else if (eci == ECI::Unknown) // implies !hasECI and fallbackCS is Unknown or Binary
125-
eci = ECI::Binary;
126-
127-
if (lastECI != eci)
128-
res += ToString(eci);
129-
lastECI = eci;
130-
131-
std::string tmp;
132-
TextDecoder::Append(tmp, bytes.data() + begin, end - begin, cs);
133-
for (auto c : tmp) {
124+
auto outEci = IsText(inEci) ? ECI::UTF8 : eci;
125+
126+
if (lastECI != outEci)
127+
res += ToString(outEci);
128+
lastECI = outEci;
129+
130+
for (auto c : BytesToUtf8(bytes.asView(begin, end - begin), inEci)) {
134131
res += c;
135-
if (c == '\\') // in the ECI protocol a '\' has to be doubled
132+
if (c == '\\') // in the ECI protocol a '\' (0x5c) has to be doubled, works only because 0x5c can only mean `\`
136133
res += c;
137134
}
138135
} else {
139-
TextDecoder::Append(res, bytes.data() + begin, end - begin, cs);
136+
res += BytesToUtf8(bytes.asView(begin, end - begin), inEci);
140137
}
141138
});
142139

@@ -183,6 +180,7 @@ ByteArray Content::bytesECI() const
183180
return {};
184181

185182
std::string res = symbology.toString(true);
183+
res.reserve(res.size() + bytes.size() + encodings.size() * 8);
186184

187185
ForEachECIBlock([&](ECI eci, int begin, int end) {
188186
if (hasECI)
@@ -206,13 +204,13 @@ CharacterSet Content::guessEncoding() const
206204
ByteArray input;
207205
ForEachECIBlock([&](ECI eci, int begin, int end) {
208206
if (eci == ECI::Unknown)
209-
input.insert(input.end(), bytes.begin() + begin, bytes.begin() + end);
207+
input.append(bytes.asView(begin, end - begin));
210208
});
211209

212210
if (input.empty())
213211
return CharacterSet::Unknown;
214212

215-
return TextDecoder::GuessEncoding(input.data(), input.size(), CharacterSet::ISO8859_1);
213+
return GuessTextEncoding(input);
216214
#else
217215
return CharacterSet::ISO8859_1;
218216
#endif

core/src/Content.h

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,6 @@
1010
#include "ReaderOptions.h"
1111
#include "ZXAlgorithms.h"
1212

13-
#if __has_include(<span>) // c++20
14-
#include <span>
15-
#endif
1613
#include <string>
1714
#include <string_view>
1815
#include <vector>
@@ -70,12 +67,7 @@ class Content
7067
void push_back(uint8_t val) { bytes.push_back(val); }
7168
void push_back(int val) { bytes.push_back(narrow_cast<uint8_t>(val)); }
7269
void append(std::string_view str) { bytes.insert(bytes.end(), str.begin(), str.end()); }
73-
#ifdef __cpp_lib_span
74-
void append(std::span<const uint8_t> ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
75-
#else
76-
void append(const ByteArray& ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
77-
void append(std::basic_string_view<uint8_t> ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
78-
#endif
70+
void append(ByteView bv) { bytes.insert(bytes.end(), bv.begin(), bv.end()); }
7971
void append(const Content& other);
8072

8173
void erase(int pos, int n);

core/src/Range.h

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include "ZXAlgorithms.h"
99

10+
#include <cstdint>
1011
#include <iterator>
1112

1213
namespace ZXing {
@@ -56,4 +57,50 @@ struct Range
5657
template <typename C>
5758
Range(const C&) -> Range<typename C::const_iterator>;
5859

60+
/**
61+
* ArrayView is a lightweight, non-owning, non-mutable view over a contiguous sequence of elements.
62+
* Similar to std::span<const T>. See also Range template for general iterator use case.
63+
*/
64+
template <typename T>
65+
class ArrayView
66+
{
67+
const T* _data = nullptr;
68+
std::size_t _size = 0;
69+
70+
public:
71+
using value_type = T;
72+
using pointer = const value_type*;
73+
using const_pointer = const value_type*;
74+
using reference = const value_type&;
75+
using const_reference = const value_type&;
76+
using size_type = std::size_t;
77+
78+
constexpr ArrayView() noexcept = default;
79+
80+
constexpr ArrayView(pointer data, size_type size) noexcept : _data(data), _size(size) {}
81+
82+
template <typename Container,
83+
typename = std::enable_if_t<std::is_convertible_v<decltype(std::data(std::declval<Container&>())), const_pointer>>>
84+
constexpr ArrayView(const Container& c) noexcept : _data(std::data(c)), _size(std::size(c))
85+
{}
86+
87+
constexpr pointer data() const noexcept { return _data; }
88+
constexpr size_type size() const noexcept { return _size; }
89+
constexpr bool empty() const noexcept { return _size == 0; }
90+
91+
constexpr const_reference operator[](size_type index) const noexcept { return _data[index]; }
92+
93+
constexpr pointer begin() const noexcept { return _data; }
94+
constexpr pointer end() const noexcept { return _data + _size; }
95+
96+
constexpr ArrayView<T> subview(size_type pos, size_type len = size_type(-1)) const noexcept
97+
{
98+
if (pos > _size)
99+
return {};
100+
return {_data + pos, std::min(len, _size - pos)};
101+
}
102+
};
103+
104+
using ByteView = ArrayView<uint8_t>;
105+
59106
} // namespace ZXing

core/src/TextDecoder.cpp

Lines changed: 19 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
/*
22
* Copyright 2016 Nu-book Inc.
33
* Copyright 2022 gitlost
4+
* Copyright 2025 Axel Waggershauser
45
*/
56
// SPDX-License-Identifier: Apache-2.0
67

78
#include "TextDecoder.h"
89

9-
#include "CharacterSet.h"
10-
#include "ECI.h"
11-
#include "Utf.h"
1210
#include "ZXAlgorithms.h"
1311
#include "libzueci/zueci.h"
1412

@@ -17,38 +15,29 @@
1715

1816
namespace ZXing {
1917

20-
void TextDecoder::Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII)
18+
std::string BytesToUtf8(ByteView bytes, ECI eci)
2119
{
22-
int eci = ToInt(ToECI(charset));
23-
const size_t str_len = str.length();
24-
const int bytes_len = narrow_cast<int>(length);
2520
constexpr unsigned int replacement = 0xFFFD;
26-
const unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | (sjisASCII ? ZUECI_FLAG_SJIS_STRAIGHT_THRU : 0);
21+
constexpr unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | ZUECI_FLAG_SJIS_STRAIGHT_THRU;
2722
int utf8_len;
2823

29-
if (eci == -1)
30-
eci = 899; // Binary
24+
if (eci == ECI::Unknown)
25+
eci = ECI::Binary;
3126

32-
int error_number = zueci_dest_len_utf8(eci, bytes, bytes_len, replacement, flags, &utf8_len);
27+
int error_number = zueci_dest_len_utf8(ToInt(eci), bytes.data(), bytes.size(), replacement, flags, &utf8_len);
3328
if (error_number >= ZUECI_ERROR)
3429
throw std::runtime_error("zueci_dest_len_utf8 failed");
3530

36-
str.resize(str_len + utf8_len); // Precise length
37-
unsigned char *utf8_buf = reinterpret_cast<unsigned char *>(str.data()) + str_len;
31+
std::string utf8(utf8_len, 0);
3832

39-
error_number = zueci_eci_to_utf8(eci, bytes, bytes_len, replacement, flags, utf8_buf, &utf8_len);
40-
if (error_number >= ZUECI_ERROR) {
41-
str.resize(str_len);
33+
error_number = zueci_eci_to_utf8(ToInt(eci), bytes.data(), bytes.size(), replacement, flags,
34+
reinterpret_cast<uint8_t*>(utf8.data()), &utf8_len);
35+
if (error_number >= ZUECI_ERROR)
4236
throw std::runtime_error("zueci_eci_to_utf8 failed");
43-
}
44-
assert(str.length() == str_len + utf8_len);
45-
}
4637

47-
void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset)
48-
{
49-
std::string u8str;
50-
Append(u8str, bytes, length, charset);
51-
str.append(FromUtf8(u8str));
38+
assert(Size(utf8) == utf8_len);
39+
40+
return utf8;
5241
}
5342

5443
/**
@@ -57,8 +46,7 @@ void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length,
5746
* {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform
5847
* default encoding if none of these can possibly be correct
5948
*/
60-
CharacterSet
61-
TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback)
49+
CharacterSet GuessTextEncoding(ByteView bytes, CharacterSet fallback)
6250
{
6351
// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
6452
// which should be by far the most common encodings.
@@ -82,11 +70,12 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
8270
//int isoHighChars = 0;
8371
int isoHighOther = 0;
8472

85-
bool utf8bom = length > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
73+
bool utf8bom = bytes.size() > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
8674

87-
for (size_t i = 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); ++i)
75+
for (int value : bytes)
8876
{
89-
int value = bytes[i];
77+
if(!(canBeISO88591 || canBeShiftJIS || canBeUTF8))
78+
break;
9079

9180
// UTF-8 stuff
9281
if (canBeUTF8) {
@@ -208,7 +197,7 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
208197
// - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
209198
// - then we conclude Shift_JIS, else ISO-8859-1
210199
if (canBeISO88591 && canBeShiftJIS) {
211-
return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= (int)length
200+
return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= Size(bytes)
212201
? CharacterSet::Shift_JIS : CharacterSet::ISO8859_1;
213202
}
214203

@@ -226,10 +215,4 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
226215
return fallback;
227216
}
228217

229-
CharacterSet
230-
TextDecoder::DefaultEncoding()
231-
{
232-
return CharacterSet::ISO8859_1;
233-
}
234-
235218
} // ZXing

core/src/TextDecoder.h

Lines changed: 10 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,25 @@
11
/*
2-
* Copyright 2016 Nu-book Inc.
2+
* Copyright 2025 Axel Waggershauser
33
*/
44
// SPDX-License-Identifier: Apache-2.0
55

66
#pragma once
77

88
#include "CharacterSet.h"
9+
#include "ECI.h"
10+
#include "Range.h"
911

10-
#include <cstddef>
11-
#include <cstdint>
1212
#include <string>
1313

1414
namespace ZXing {
1515

16-
class TextDecoder
16+
std::string BytesToUtf8(ByteView bytes, ECI eci);
17+
18+
inline std::string BytesToUtf8(ByteView bytes, CharacterSet cs)
1719
{
18-
public:
19-
static CharacterSet DefaultEncoding();
20-
static CharacterSet GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback = DefaultEncoding());
21-
22-
// If `sjisASCII` set then for Shift_JIS maps ASCII directly (straight-thru), i.e. does not map ASCII backslash & tilde
23-
// to Yen sign & overline resp. (JIS X 0201 Roman)
24-
static void Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII = true);
25-
26-
static void Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset);
27-
28-
static void AppendLatin1(std::wstring& str, const std::string& latin1) {
29-
auto ptr = (const uint8_t*)latin1.data();
30-
str.append(ptr, ptr + latin1.length());
31-
}
32-
33-
static std::wstring FromLatin1(const std::string& latin1) {
34-
auto ptr = (const uint8_t*)latin1.data();
35-
return std::wstring(ptr, ptr + latin1.length());
36-
}
37-
};
20+
return BytesToUtf8(bytes, ToECI(cs));
21+
}
22+
23+
CharacterSet GuessTextEncoding(ByteView bytes, CharacterSet fallback = CharacterSet::ISO8859_1);
3824

3925
} // ZXing

0 commit comments

Comments
 (0)