Skip to content

Commit 6aa6268

Browse files
committed
wip: improve compression by splitting lines
- Keep newlines in the dictionary. We do this because it doesn't add much overhead and greatly simplifies the line splitting; it also means the recreated shader is identical to the original. - Fix a bug where we relied on the last line ending with a newline.
1 parent 05003ea commit 6aa6268

File tree

5 files changed

+299
-61
lines changed

5 files changed

+299
-61
lines changed

libs/filaflat/src/MaterialChunk.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,10 +140,9 @@ bool MaterialChunk::getTextShader(Unflattener unflattener,
140140
}
141141
const auto& content = dictionary[lineIndex];
142142

143-
// Replace null with newline.
143+
// remove the terminating null character.
144144
memcpy(&shaderContent[cursor], content.data(), content.size() - 1);
145145
cursor += content.size() - 1;
146-
shaderContent[cursor++] = '\n';
147146
}
148147

149148
// Write the terminating null character.

libs/filamat/src/eiff/DictionaryTextChunk.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,27 @@
1515
*/
1616

1717
#include "DictionaryTextChunk.h"
18+
#include "Chunk.h"
19+
#include "Flattener.h"
20+
#include "LineDictionary.h"
21+
22+
#include <filament/MaterialChunkType.h>
23+
24+
#include <cstddef>
25+
#include <utility>
1826

1927
namespace filamat {
2028

21-
DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType chunkType) :
29+
// Creates a chunk of type `chunkType` whose payload is `dictionary`; the dictionary is moved
// into the chunk.
DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType const chunkType)
        : Chunk(chunkType),
          mDictionary(std::move(dictionary)) {
}
2432

2533
// Serializes the dictionary into `f`: first the number of entries as a 32-bit value, then
// every entry, in index order, as a string.
void DictionaryTextChunk::flatten(Flattener& f) {
    LineDictionary::index_t const entryCount = mDictionary.getDictionaryLineCount();

    // NumStrings
    f.writeUint32(entryCount);

    // Strings
    for (LineDictionary::index_t entry = 0; entry < entryCount; ++entry) {
        auto const& text = mDictionary.getString(entry);
        f.writeString(text.data());
    }
}
Lines changed: 206 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2017 The Android Open Source Project
2+
* Copyright (C) 2025 The Android Open Source Project
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,50 +16,229 @@
1616

1717
#include "LineDictionary.h"
1818

19+
#include <utils/debug.h>
20+
#include <utils/Log.h>
21+
#include <utils/ostream.h>
22+
23+
#include <algorithm>
24+
#include <cctype>
25+
#include <cstddef>
26+
#include <cstdint>
27+
#include <iterator>
28+
#include <memory>
29+
#include <string>
30+
#include <string_view>
31+
#include <utility>
32+
#include <vector>
33+
1934
namespace filamat {
2035

21-
std::string_view LineDictionary::getString(size_t index) const noexcept {
36+
LineDictionary::LineDictionary() = default;

// Logs the dictionary statistics to the debug log when the dictionary is destroyed.
//
// Nothing is logged for an empty dictionary (e.g. one that never received any text, or,
// presumably, one that has been moved from): its report would carry no information and its
// ratios would be computed as 0 / 0.
LineDictionary::~LineDictionary() noexcept {
    if (!mStrings.empty()) {
        printStatistics(utils::slog.d);
    }
}
41+
42+
// Returns the dictionary entry stored at `index`.
// No bounds checking is performed: `index` must be smaller than getDictionaryLineCount().
std::string const& LineDictionary::getString(index_t const index) const noexcept {
    auto const& entry = mStrings[index];
    return *entry;
}
2445

25-
size_t LineDictionary::getLineCount() const {
46+
size_t LineDictionary::getDictionaryLineCount() const {
2647
return mStrings.size();
2748
}
49+
std::vector<LineDictionary::index_t> LineDictionary::getIndices(
50+
std::string_view const& line) const noexcept {
51+
std::vector<index_t> result;
52+
std::vector<std::string_view> const sublines = splitString(line);
53+
for (std::string_view const& subline : sublines) {
54+
if (auto iter = mLineIndices.find(subline); iter != mLineIndices.end()) {
55+
result.push_back(iter->second.index);
56+
}
57+
}
58+
return result;
59+
}
2860

29-
size_t LineDictionary::getIndex(std::string_view s) const noexcept {
30-
if (auto iter = mLineIndices.find(s); iter != mLineIndices.end()) {
31-
return iter->second;
61+
void LineDictionary::addText(std::string_view const text) noexcept {
62+
size_t cur = 0;
63+
size_t const len = text.length();
64+
const char* s = text.data();
65+
while (cur < len) {
66+
// Start of the current line
67+
size_t const pos = cur;
68+
// Find the end of the current line or end of text
69+
while (cur < len && s[cur] != '\n') {
70+
cur++;
71+
}
72+
// If we found a newline, advance past it for the next iteration, ensuring '\n' is included
73+
if (cur < len) {
74+
cur++;
75+
}
76+
addLine({ s + pos, cur - pos });
3277
}
33-
return SIZE_MAX;
3478
}
3579

36-
void LineDictionary::addText(const std::string& line) noexcept {
37-
const char* s = line.c_str();
80+
void LineDictionary::addLine(std::string_view const line) noexcept {
81+
auto const lines = splitString(line);
82+
for (std::string_view const& subline : lines) {
83+
// Never add a line twice.
84+
auto pos = mLineIndices.find(subline);
85+
if (pos != mLineIndices.end()) {
86+
pos->second.count++;
87+
continue;
88+
}
89+
mStrings.emplace_back(std::make_unique<std::string>(subline));
90+
mLineIndices.emplace(*mStrings.back(),
91+
LineInfo{
92+
.index = index_t(mStrings.size() - 1),
93+
.count = 1 });
94+
}
95+
}
3896

39-
size_t cur = 0;
40-
size_t pos = 0;
41-
size_t len = 0;
97+
// Returns `s` without its leading whitespace characters (as classified by std::isspace).
std::string_view LineDictionary::ltrim(std::string_view s) {
    size_t leading = 0;
    // The cast keeps std::isspace well-defined for bytes with the high bit set.
    while (leading < s.size() && std::isspace(static_cast<unsigned char>(s[leading]))) {
        ++leading;
    }
    s.remove_prefix(leading);
    return s;
}
42102

43-
while (s[cur] != '\0') {
44-
pos = cur;
45-
len = 0;
46-
while (s[cur] != '\n') {
47-
cur++;
48-
len++;
103+
// Searches `line`, starting at `offset`, for the first occurrence of a known prefix
// ("hp_copy_", "mp_copy_" or "_") that is immediately followed by one to six decimal digits
// and is not preceded by an alphanumeric character.
//
// Returns { position, length } of the match (prefix plus digits), or
// { std::string_view::npos, 0 } when there is none.
std::pair<size_t, size_t> LineDictionary::findPattern(
        std::string_view const line, size_t const offset) {
    // Patterns are ordered from longest to shortest to ensure correct prefix matching.
    static constexpr std::string_view kPatterns[] = { "hp_copy_", "mp_copy_", "_" };
    // Maximum number of digits consumed after a prefix.
    constexpr size_t kMaxDigits = 6;

    // The <cctype> classifiers have undefined behavior when given a negative value other than
    // EOF, which is what a plain `char` holding a non-ASCII byte converts to on platforms where
    // char is signed. Always go through unsigned char.
    auto const isAlnum = [](char const c) noexcept {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };
    auto const isDigit = [](char const c) noexcept {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    };

    size_t const lineLen = line.length();
    for (size_t i = offset; i < lineLen; ++i) {
        // A pattern must be a whole word (or at the start of the string).
        if (i > 0 && isAlnum(line[i - 1])) {
            continue;
        }

        for (auto const& prefix : kPatterns) {
            if (line.substr(i, prefix.size()) != prefix) {
                continue;
            }

            // A known prefix has been matched. Now, check for a sequence of digits.
            size_t const startOfDigits = i + prefix.size();
            if (startOfDigits < lineLen && isDigit(line[startOfDigits])) {
                size_t j = startOfDigits;
                while (j < lineLen && j < startOfDigits + kMaxDigits && isDigit(line[j])) {
                    ++j;
                }
                // We have a full pattern match (prefix + digits).
                return { i, j - i };
            }

            // If a prefix is matched but not followed by digits, it's not a valid pattern.
            // Stop trying prefixes at this position and continue searching from the next
            // character: the known prefixes start with different characters, so no other one
            // can match here.
            break;
        }
    }
    return { std::string_view::npos, 0 }; // No pattern found
}
55136

56-
void LineDictionary::addLine(const std::string&& line) noexcept {
57-
// Never add a line twice.
58-
if (mLineIndices.find(line) != mLineIndices.end()) {
59-
return;
137+
std::vector<std::string_view> LineDictionary::splitString(std::string_view const line) {
138+
std::vector<std::string_view> result;
139+
size_t current_pos = 0;
140+
141+
if (line.empty()) {
142+
result.push_back({});
143+
return result;
60144
}
61-
mStrings.emplace_back(std::make_unique<std::string>(line));
62-
mLineIndices.emplace(*mStrings.back(), mStrings.size() - 1);
145+
146+
while (current_pos < line.length()) {
147+
auto const [match_pos, match_len] = findPattern(line, current_pos);
148+
149+
if (match_pos == std::string_view::npos) {
150+
// No more patterns found, add the rest of the string.
151+
result.push_back(line.substr(current_pos));
152+
break;
153+
}
154+
155+
// Add the part before the match.
156+
if (match_pos > current_pos) {
157+
result.push_back(line.substr(current_pos, match_pos - current_pos));
158+
}
159+
160+
// Add the match itself.
161+
result.push_back(line.substr(match_pos, match_len));
162+
163+
// Move cursor past the match.
164+
current_pos = match_pos + match_len;
165+
}
166+
167+
return result;
168+
}
169+
170+
// Writes a human-readable report about the dictionary to `stream`: every entry with its usage
// count (most used first, ties broken by index), followed by aggregate sizes, ratios and
// estimates of the space the indices would take with different encodings.
void LineDictionary::printStatistics(utils::io::ostream& stream) const noexcept {
    // Number of indices assumed to fit in a single byte by the variable-length estimates.
    constexpr size_t kOneByteIndexCount = 128;

    std::vector<std::pair<std::string_view, LineInfo>> info;
    info.reserve(mLineIndices.size());
    for (auto const& pair : mLineIndices) {
        info.push_back(pair);
    }

    // Sort by count, then by index.
    std::sort(info.begin(), info.end(),
            [](auto const& lhs, auto const& rhs) {
                if (lhs.second.count != rhs.second.count) {
                    return lhs.second.count > rhs.second.count;
                }
                return lhs.second.index < rhs.second.index;
            });

    // Avoids dividing by zero when the dictionary is empty.
    auto const ratio = [](size_t const numerator, size_t const denominator) {
        return denominator != 0 ? double(numerator) / double(denominator) : 0.0;
    };

    size_t total_size = 0;
    size_t compressed_size = 0;
    size_t total_lines = 0;
    size_t indices_size = 0;
    size_t indices_size_if_varlen = 0;
    size_t indices_size_if_varlen_sorted = 0;
    size_t rank = 0; // position of the entry once sorted by decreasing usage
    using namespace utils;
    // Print the dictionary.
    stream << "Line dictionary:" << io::endl;
    for (auto const& pair : info) {
        compressed_size += pair.first.length();
        total_size += pair.first.length() * pair.second.count;
        total_lines += pair.second.count;
        indices_size += sizeof(uint16_t) * pair.second.count;
        // Estimate with the current indices: the first 128 indices take one byte.
        if (pair.second.index < kOneByteIndexCount) {
            indices_size_if_varlen += sizeof(uint8_t) * pair.second.count;
        } else {
            indices_size_if_varlen += sizeof(uint16_t) * pair.second.count;
        }
        // Estimate if indices were assigned by decreasing usage: the 128 most used entries
        // take one byte (same threshold as above).
        if (rank < kOneByteIndexCount) {
            indices_size_if_varlen_sorted += sizeof(uint8_t) * pair.second.count;
        } else {
            indices_size_if_varlen_sorted += sizeof(uint16_t) * pair.second.count;
        }
        rank++;
        stream << " " << pair.second.count << ": " << pair.first << io::endl;
    }
    stream << "Total size: " << total_size << ", compressed size: " << compressed_size << io::endl;
    stream << "Saved size: " << total_size - compressed_size << io::endl;
    stream << "Unique lines: " << mLineIndices.size() << io::endl;
    stream << "Total lines: " << total_lines << io::endl;
    stream << "Compression ratio: " << ratio(total_size, compressed_size) << io::endl;
    stream << "Average line length (total): " << ratio(total_size, total_lines) << io::endl;
    stream << "Average line length (compressed): " << ratio(compressed_size, mLineIndices.size()) << io::endl;
    stream << "Indices size: " << indices_size << io::endl;
    stream << "Indices size (if varlen): " << indices_size_if_varlen << io::endl;
    stream << "Indices size (if varlen, sorted): " << indices_size_if_varlen_sorted << io::endl;

    // Some data we gathered (two runs over the same 751161 bytes of text; presumably the
    // first one is with sub-line splitting and the second one without -- TODO confirm):

    // Total size: 751161, compressed size: 59818
    // Saved size: 691343
    // Unique lines: 3659
    // Total lines: 61686
    // Compression ratio: 12.557440904075696
    // Average line length (total): 12.177171481373406
    // Average line length (compressed): 16.34818256354195


    // Total size: 751161, compressed size: 263215
    // Saved size: 487946
    // Unique lines: 4672
    // Total lines: 23258
    // Compression ratio: 2.8537925270216364
    // Average line length (total): 32.296887092613296
    // Average line length (compressed): 56.338827054794521
}
64243

65244
} // namespace filamat

libs/filamat/src/eiff/LineDictionary.h

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,38 +17,70 @@
1717
#ifndef TNT_FILAMAT_LINEDICTIONARY_H
1818
#define TNT_FILAMAT_LINEDICTIONARY_H
1919

20+
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>
2527

28+
namespace utils::io {
29+
class ostream;
30+
}
31+
2632
namespace filamat {
2733

28-
// Establish a line <-> id mapping. Use for shader compression when each shader is sliced in lines
29-
// and each line encoded into a 16 bit id.
3034
class LineDictionary {
3135
public:
32-
LineDictionary() = default;
36+
using index_t = uint32_t;
37+
38+
LineDictionary();
39+
~LineDictionary() noexcept;
3340

3441
// Due to the presence of unique_ptr, disallow copy construction but allow move construction.
3542
LineDictionary(LineDictionary const&) = delete;
3643
LineDictionary(LineDictionary&&) = default;
3744

38-
void addText(const std::string& text) noexcept;
39-
size_t getLineCount() const;
45+
// Adds text to the dictionary, parsing it into lines.
46+
void addText(std::string_view text) noexcept;
47+
48+
// Returns the total number of unique lines stored in the dictionary.
49+
size_t getDictionaryLineCount() const;
4050

51+
// Checks if the dictionary is empty.
4152
bool isEmpty() const noexcept {
4253
return mStrings.empty();
4354
}
4455

45-
std::string_view getString(size_t index) const noexcept;
46-
size_t getIndex(std::string_view s) const noexcept;
56+
// Retrieves a string by its index.
57+
std::string const& getString(index_t index) const noexcept;
58+
59+
// Retrieves the indices of lines that match the given string view.
60+
std::vector<index_t> getIndices(std::string_view const& line) const noexcept;
61+
62+
// Prints statistics about the dictionary to the given output stream.
63+
void printStatistics(utils::io::ostream& stream) const noexcept;
4764

4865
private:
49-
void addLine(const std::string&& line) noexcept;
66+
// Adds a single line to the dictionary.
67+
void addLine(std::string_view line) noexcept;
68+
69+
// Trims leading whitespace from a string view.
70+
static std::string_view ltrim(std::string_view s);
71+
72+
// Splits a string view into a vector of string views based on delimiters.
73+
static std::vector<std::string_view> splitString(std::string_view line);
74+
75+
// Finds a pattern within a string view starting from an offset.
76+
static std::pair<size_t, size_t> findPattern(std::string_view line, size_t offset);
77+
78+
struct LineInfo {
79+
index_t index;
80+
uint32_t count;
81+
};
5082

51-
std::unordered_map<std::string_view, size_t> mLineIndices;
83+
std::unordered_map<std::string_view, LineInfo> mLineIndices;
5284
std::vector<std::unique_ptr<std::string>> mStrings;
5385
};
5486

0 commit comments

Comments
 (0)