TextDecoder: simplify API and introduce ByteView similar to string_view

axxel · axxel · commit a545a371c243 · 2025-04-16T18:52:16.000+02:00
The `Append(str, buffer)` approach was not deemed worth it, just to save
1 extra string allocation in very uncommon use cases. The new API
`std::string BytesToUtf8(ByteView bytes, ECI eci)` is much cleaner.

I deliberately opted against std::basic_string_view<uint8_t> and
std::span<const uint8_t> for "reasons".
diff --git a/core/src/ByteArray.h b/core/src/ByteArray.h
@@ -6,6 +6,8 @@
 
 #pragma once
 
+#include "Range.h"
+
 #include <cstdint>
 #include <cstdio>
 #include <string>
@@ -15,7 +17,7 @@
 namespace ZXing {
 
 /**
-	ByteArray is an extension of std::vector<unsigned char>.
+	ByteArray is an extension of std::vector<uint8_t>.
 */
 class ByteArray : public std::vector<uint8_t>
 {
@@ -25,15 +27,20 @@ class ByteArray : public std::vector<uint8_t>
 	explicit ByteArray(int len) : std::vector<uint8_t>(len, 0) {}
 	explicit ByteArray(const std::string& str) : std::vector<uint8_t>(str.begin(), str.end()) {}
 
-	void append(const ByteArray& other) { insert(end(), other.begin(), other.end()); }
+	void append(ByteView other) { insert(end(), other.begin(), other.end()); }
 
 	std::string_view asString(size_t pos = 0, size_t len = std::string_view::npos) const
 	{
 		return std::string_view(reinterpret_cast<const char*>(data()), size()).substr(pos, len);
 	}
+
+	ByteView asView(size_t pos = 0, size_t len = size_t(-1)) const
+	{
+		return ByteView(*this).subview(pos, len);
+	}
 };
 
-inline std::string ToHex(const ByteArray& bytes)
+inline std::string ToHex(ByteView bytes)
 {
 	std::string res(bytes.size() * 3, ' ');
 
diff --git a/core/src/Content.cpp b/core/src/Content.cpp
@@ -104,39 +104,36 @@ std::string Content::render(bool withECI) const
 
 #ifdef ZXING_READERS
 	std::string res;
+	res.reserve(bytes.size() * 2);
 	if (withECI)
-		res = symbology.toString(true);
+		res += symbology.toString(true);
 	ECI lastECI = ECI::Unknown;
 	auto fallbackCS = defaultCharset;
 	if (!hasECI && fallbackCS == CharacterSet::Unknown)
 		fallbackCS = guessEncoding();
 
 	ForEachECIBlock([&](ECI eci, int begin, int end) {
-		// first determine how to decode the content (choose character set)
-		//  * eci == ECI::Unknown implies !hasECI and we guess
-		//  * if !IsText(eci) the ToCharcterSet(eci) will return Unknown and we decode as binary
-		CharacterSet cs = eci == ECI::Unknown ? fallbackCS : ToCharacterSet(eci);
-
+		// basic idea: if IsText(eci), we transcode it to UTF8, otherwise we treat it as binary but
+		// transcoded it to valid UTF8 bytes seqences representing the code points 0-255. The eci we report
+		// back to the caller by inserting their "\XXXXXX" ECI designator is UTF8 for text and
+		// the original ECI for everything else.
+		// first determine how to decode the content (use fallback if unknown)
+		auto inEci = IsText(eci) ? eci : eci == ECI::Unknown ? ToECI(fallbackCS) : ECI::Binary;
 		if (withECI) {
 			// then find the eci to report back in the ECI designator
-			if (IsText(ToECI(cs))) // everything decoded as text is reported as utf8
-				eci = ECI::UTF8;
-			else if (eci == ECI::Unknown) // implies !hasECI and fallbackCS is Unknown or Binary
-				eci = ECI::Binary;
-
-			if (lastECI != eci)
-				res += ToString(eci);
-			lastECI = eci;
-
-			std::string tmp;
-			TextDecoder::Append(tmp, bytes.data() + begin, end - begin, cs);
-			for (auto c : tmp) {
+			auto outEci = IsText(inEci) ? ECI::UTF8 : eci;
+
+			if (lastECI != outEci)
+				res += ToString(outEci);
+			lastECI = outEci;
+
+			for (auto c : BytesToUtf8(bytes.asView(begin, end - begin), inEci)) {
 				res += c;
-				if (c == '\\') // in the ECI protocol a '\' has to be doubled
+				if (c == '\\') // in the ECI protocol a '\' (0x5c) has to be doubled, works only because 0x5c can only mean `\`
 					res += c;
 			}
 		} else {
-			TextDecoder::Append(res, bytes.data() + begin, end - begin, cs);
+			res += BytesToUtf8(bytes.asView(begin, end - begin), inEci);
 		}
 	});
 
@@ -183,6 +180,7 @@ ByteArray Content::bytesECI() const
 		return {};
 
 	std::string res = symbology.toString(true);
+	res.reserve(res.size() + bytes.size() + encodings.size() * 8);
 
 	ForEachECIBlock([&](ECI eci, int begin, int end) {
 		if (hasECI)
@@ -206,13 +204,13 @@ CharacterSet Content::guessEncoding() const
 	ByteArray input;
 	ForEachECIBlock([&](ECI eci, int begin, int end) {
 		if (eci == ECI::Unknown)
-			input.insert(input.end(), bytes.begin() + begin, bytes.begin() + end);
+			input.append(bytes.asView(begin, end - begin));
 	});
 
 	if (input.empty())
 		return CharacterSet::Unknown;
 
-	return TextDecoder::GuessEncoding(input.data(), input.size(), CharacterSet::ISO8859_1);
+	return GuessTextEncoding(input);
 #else
 	return CharacterSet::ISO8859_1;
 #endif
diff --git a/core/src/Content.h b/core/src/Content.h
@@ -10,9 +10,6 @@
 #include "ReaderOptions.h"
 #include "ZXAlgorithms.h"
 
-#if __has_include(<span>) // c++20
-#include <span>
-#endif
 #include <string>
 #include <string_view>
 #include <vector>
@@ -70,12 +67,7 @@ class Content
 	void push_back(uint8_t val) { bytes.push_back(val); }
 	void push_back(int val) { bytes.push_back(narrow_cast<uint8_t>(val)); }
 	void append(std::string_view str) { bytes.insert(bytes.end(), str.begin(), str.end()); }
-#ifdef __cpp_lib_span
-	void append(std::span<const uint8_t> ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
-#else
-	void append(const ByteArray& ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
-	void append(std::basic_string_view<uint8_t> ba) { bytes.insert(bytes.end(), ba.begin(), ba.end()); }
-#endif
+	void append(ByteView bv) { bytes.insert(bytes.end(), bv.begin(), bv.end()); }
 	void append(const Content& other);
 
 	void erase(int pos, int n);
diff --git a/core/src/Range.h b/core/src/Range.h
@@ -7,6 +7,7 @@
 
 #include "ZXAlgorithms.h"
 
+#include <cstdint>
 #include <iterator>
 
 namespace ZXing {
@@ -56,4 +57,50 @@ struct Range
 template <typename C>
 Range(const C&) -> Range<typename C::const_iterator>;
 
+/**
+ * ArrayView is a lightweight, non-owning, non-mutable view over a contiguous sequence of elements.
+ * Similar to std::span<const T>. See also Range template for general iterator use case.
+ */
+template <typename T>
+class ArrayView
+{
+	const T* _data = nullptr;
+	std::size_t _size = 0;
+
+public:
+	using value_type = T;
+	using pointer = const value_type*;
+	using const_pointer = const value_type*;
+	using reference = const value_type&;
+	using const_reference = const value_type&;
+	using size_type = std::size_t;
+
+	constexpr ArrayView() noexcept = default;
+
+	constexpr ArrayView(pointer data, size_type size) noexcept : _data(data), _size(size) {}
+
+	template <typename Container,
+			  typename = std::enable_if_t<std::is_convertible_v<decltype(std::data(std::declval<Container&>())), const_pointer>>>
+	constexpr ArrayView(const Container& c) noexcept : _data(std::data(c)), _size(std::size(c))
+	{}
+
+	constexpr pointer data() const noexcept { return _data; }
+	constexpr size_type size() const noexcept { return _size; }
+	constexpr bool empty() const noexcept { return _size == 0; }
+
+	constexpr const_reference operator[](size_type index) const noexcept { return _data[index]; }
+
+	constexpr pointer begin() const noexcept { return _data; }
+	constexpr pointer end() const noexcept { return _data + _size; }
+
+	constexpr ArrayView<T> subview(size_type pos, size_type len = size_type(-1)) const noexcept
+	{
+		if (pos > _size)
+			return {};
+		return {_data + pos, std::min(len, _size - pos)};
+	}
+};
+
+using ByteView = ArrayView<uint8_t>;
+
 } // namespace ZXing
diff --git a/core/src/TextDecoder.cpp b/core/src/TextDecoder.cpp
@@ -1,14 +1,12 @@
 /*
 * Copyright 2016 Nu-book Inc.
 * Copyright 2022 gitlost
+* Copyright 2025 Axel Waggershauser
 */
 // SPDX-License-Identifier: Apache-2.0
 
 #include "TextDecoder.h"
 
-#include "CharacterSet.h"
-#include "ECI.h"
-#include "Utf.h"
 #include "ZXAlgorithms.h"
 #include "libzueci/zueci.h"
 
@@ -17,38 +15,29 @@
 
 namespace ZXing {
 
-void TextDecoder::Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII)
+std::string BytesToUtf8(ByteView bytes, ECI eci)
 {
-	int eci = ToInt(ToECI(charset));
-	const size_t str_len = str.length();
-	const int bytes_len = narrow_cast<int>(length);
 	constexpr unsigned int replacement = 0xFFFD;
-	const unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | (sjisASCII ? ZUECI_FLAG_SJIS_STRAIGHT_THRU : 0);
+	constexpr unsigned int flags = ZUECI_FLAG_SB_STRAIGHT_THRU | ZUECI_FLAG_SJIS_STRAIGHT_THRU;
 	int utf8_len;
 
-	if (eci == -1)
-		eci = 899; // Binary
+	if (eci == ECI::Unknown)
+		eci = ECI::Binary;
 
-	int error_number = zueci_dest_len_utf8(eci, bytes, bytes_len, replacement, flags, &utf8_len);
+	int error_number = zueci_dest_len_utf8(ToInt(eci), bytes.data(), bytes.size(), replacement, flags, &utf8_len);
 	if (error_number >= ZUECI_ERROR)
 		throw std::runtime_error("zueci_dest_len_utf8 failed");
 
-	str.resize(str_len + utf8_len); // Precise length
-	unsigned char *utf8_buf = reinterpret_cast<unsigned char *>(str.data()) + str_len;
+	std::string utf8(utf8_len, 0);
 
-	error_number = zueci_eci_to_utf8(eci, bytes, bytes_len, replacement, flags, utf8_buf, &utf8_len);
-	if (error_number >= ZUECI_ERROR) {
-		str.resize(str_len);
+	error_number = zueci_eci_to_utf8(ToInt(eci), bytes.data(), bytes.size(), replacement, flags,
+									 reinterpret_cast<uint8_t*>(utf8.data()), &utf8_len);
+	if (error_number >= ZUECI_ERROR)
 		throw std::runtime_error("zueci_eci_to_utf8 failed");
-	}
-	assert(str.length() == str_len + utf8_len);
-}
 
-void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset)
-{
-	std::string u8str;
-	Append(u8str, bytes, length, charset);
-	str.append(FromUtf8(u8str));
+	assert(Size(utf8) == utf8_len);
+
+	return utf8;
 }
 
 /**
@@ -57,8 +46,7 @@ void TextDecoder::Append(std::wstring& str, const uint8_t* bytes, size_t length,
 *  {@link #SHIFT_JIS}, {@link #UTF8}, {@link #ISO88591}, or the platform
 *  default encoding if none of these can possibly be correct
 */
-CharacterSet
-TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback)
+CharacterSet GuessTextEncoding(ByteView bytes, CharacterSet fallback)
 {
 	// For now, merely tries to distinguish ISO-8859-1, UTF-8 and Shift_JIS,
 	// which should be by far the most common encodings.
@@ -82,11 +70,12 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
 	//int isoHighChars = 0;
 	int isoHighOther = 0;
 
-	bool utf8bom = length > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
+	bool utf8bom = bytes.size() > 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF;
 
-	for (size_t i = 0; i < length && (canBeISO88591 || canBeShiftJIS || canBeUTF8); ++i)
+	for (int value : bytes)
 	{
-		int value = bytes[i];
+		if(!(canBeISO88591 || canBeShiftJIS || canBeUTF8))
+			break;
 
 		// UTF-8 stuff
 		if (canBeUTF8) {
@@ -208,7 +197,7 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
 	//   - at least 10% of bytes that could be "upper" not-alphanumeric Latin1,
 	// - then we conclude Shift_JIS, else ISO-8859-1
 	if (canBeISO88591 && canBeShiftJIS) {
-		return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= (int)length
+		return (sjisMaxKatakanaWordLength == 2 && sjisKatakanaChars == 2) || isoHighOther * 10 >= Size(bytes)
 			? CharacterSet::Shift_JIS : CharacterSet::ISO8859_1;
 	}
 
@@ -226,10 +215,4 @@ TextDecoder::GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fal
 	return fallback;
 }
 
-CharacterSet
-TextDecoder::DefaultEncoding()
-{
-	return CharacterSet::ISO8859_1;
-}
-
 } // ZXing
diff --git a/core/src/TextDecoder.h b/core/src/TextDecoder.h
@@ -1,39 +1,25 @@
 /*
-* Copyright 2016 Nu-book Inc.
+* Copyright 2025 Axel Waggershauser
 */
 // SPDX-License-Identifier: Apache-2.0
 
 #pragma once
 
 #include "CharacterSet.h"
+#include "ECI.h"
+#include "Range.h"
 
-#include <cstddef>
-#include <cstdint>
 #include <string>
 
 namespace ZXing {
 
-class TextDecoder
+std::string BytesToUtf8(ByteView bytes, ECI eci);
+
+inline std::string BytesToUtf8(ByteView bytes, CharacterSet cs)
 {
-public:
-	static CharacterSet DefaultEncoding();
-	static CharacterSet GuessEncoding(const uint8_t* bytes, size_t length, CharacterSet fallback = DefaultEncoding());
-
-	// If `sjisASCII` set then for Shift_JIS maps ASCII directly (straight-thru), i.e. does not map ASCII backslash & tilde
-	// to Yen sign & overline resp. (JIS X 0201 Roman)
-	static void Append(std::string& str, const uint8_t* bytes, size_t length, CharacterSet charset, bool sjisASCII = true);
-
-	static void Append(std::wstring& str, const uint8_t* bytes, size_t length, CharacterSet charset);
-
-	static void AppendLatin1(std::wstring& str, const std::string& latin1) {
-		auto ptr = (const uint8_t*)latin1.data();
-		str.append(ptr, ptr + latin1.length());
-	}
-	
-	static std::wstring FromLatin1(const std::string& latin1) {
-		auto ptr = (const uint8_t*)latin1.data();
-		return std::wstring(ptr, ptr + latin1.length());
-	}
-};
+	return BytesToUtf8(bytes, ToECI(cs));
+}
+
+CharacterSet GuessTextEncoding(ByteView bytes, CharacterSet fallback = CharacterSet::ISO8859_1);
 
 } // ZXing
diff --git a/test/unit/TextDecoderTest.cpp b/test/unit/TextDecoderTest.cpp