Skip to content

Commit 53bc802

Browse files
committed
matc: improvements to LineDictionary compression
We can significantly improve the effectiveness of the line dictionary by splitting lines strategically. SPIR-V optimization tends to create new variable names (e.g. _1234) that are not always stable between variants, thereby preventing otherwise identical lines from being deduplicated. By simply splitting lines around these patterns, we improve the effectiveness of the compression. The decompression is almost unchanged. To simplify the code, we now keep the '\n' at the end of lines in the dictionary. That's the only change to the decompression side. On certain materials like aiDefault.mat the compression ratio is improved from 2.8x to 15x. This also fixes a small bug where the line dictionary relied on the input ending with a newline.
1 parent f91ba08 commit 53bc802

File tree

6 files changed

+300
-61
lines changed

6 files changed

+300
-61
lines changed

NEW_RELEASE_NOTES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@ appropriate header in [RELEASE_NOTES.md](./RELEASE_NOTES.md).
1414
depth-only passes. [**Requires recompiling materials**]
1515
- material: fix specularFactor in `LOW_QUALITY` mode. [**Requires recompiling materials**] to take effect.
1616
- material: Add CRC32 validation for material packages [⚠️ **New Material Version**]
17+
- material: Improve LineDictionary compression [⚠️ **New Material Version**]

libs/filaflat/src/MaterialChunk.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,9 @@ bool MaterialChunk::getTextShader(Unflattener unflattener,
144144
}
145145
const auto& content = dictionary[lineIndex];
146146

147-
// Replace null with newline.
147+
// remove the terminating null character.
148148
memcpy(&shaderContent[cursor], content.data(), content.size() - 1);
149149
cursor += content.size() - 1;
150-
shaderContent[cursor++] = '\n';
151150
}
152151

153152
// Write the terminating null character.

libs/filamat/src/eiff/DictionaryTextChunk.cpp

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,27 @@
1515
*/
1616

1717
#include "DictionaryTextChunk.h"
18+
#include "Chunk.h"
19+
#include "Flattener.h"
20+
#include "LineDictionary.h"
21+
22+
#include <filament/MaterialChunkType.h>
23+
24+
#include <cstddef>
25+
#include <utility>
1826

1927
namespace filamat {
2028

21-
DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType chunkType) :
29+
// Creates a chunk of type `chunkType` and moves `dictionary` into it.
DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType const chunkType)
        : Chunk(chunkType),
          mDictionary(std::move(dictionary)) {
}
2432

2533
// Writes the dictionary to `f`: first the number of entries as a 32-bit value, then every
// entry as a string, in increasing index order.
void DictionaryTextChunk::flatten(Flattener& f) {
    // NumStrings
    size_t const lineCount = mDictionary.getDictionaryLineCount();
    f.writeUint32(lineCount);

    // Strings
    LineDictionary::index_t const end = lineCount;
    LineDictionary::index_t current = 0;
    while (current < end) {
        f.writeString(mDictionary.getString(current).c_str());
        current++;
    }
}
Lines changed: 206 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2017 The Android Open Source Project
2+
* Copyright (C) 2025 The Android Open Source Project
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,50 +16,229 @@
1616

1717
#include "LineDictionary.h"
1818

19+
#include <utils/debug.h>
20+
#include <utils/Log.h>
21+
#include <utils/ostream.h>
22+
23+
#include <algorithm>
24+
#include <cctype>
25+
#include <cstddef>
26+
#include <cstdint>
27+
#include <iterator>
28+
#include <memory>
29+
#include <string>
30+
#include <string_view>
31+
#include <utility>
32+
#include <vector>
33+
1934
namespace filamat {
2035

21-
std::string_view LineDictionary::getString(size_t index) const noexcept {
36+
// Default construction: both containers start out empty.
LineDictionary::LineDictionary() = default;

// The destructor does nothing beyond destroying the members.
LineDictionary::~LineDictionary() noexcept {
    // Debugging aid: uncomment to dump the dictionary statistics on destruction.
    //printStatistics(utils::slog.d);
}
41+
42+
// Returns the dictionary entry stored at `index`.
// `index` is not range-checked; it must be smaller than getDictionaryLineCount().
std::string const& LineDictionary::getString(index_t const index) const noexcept {
    auto const& entry = mStrings[index];
    return *entry;
}
2445

25-
size_t LineDictionary::getLineCount() const {
46+
size_t LineDictionary::getDictionaryLineCount() const {
2647
return mStrings.size();
2748
}
49+
std::vector<LineDictionary::index_t> LineDictionary::getIndices(
50+
std::string_view const& line) const noexcept {
51+
std::vector<index_t> result;
52+
std::vector<std::string_view> const sublines = splitString(line);
53+
for (std::string_view const& subline : sublines) {
54+
if (auto iter = mLineIndices.find(subline); iter != mLineIndices.end()) {
55+
result.push_back(iter->second.index);
56+
}
57+
}
58+
return result;
59+
}
2860

29-
size_t LineDictionary::getIndex(std::string_view s) const noexcept {
30-
if (auto iter = mLineIndices.find(s); iter != mLineIndices.end()) {
31-
return iter->second;
61+
void LineDictionary::addText(std::string_view const text) noexcept {
62+
size_t cur = 0;
63+
size_t const len = text.length();
64+
const char* s = text.data();
65+
while (cur < len) {
66+
// Start of the current line
67+
size_t const pos = cur;
68+
// Find the end of the current line or end of text
69+
while (cur < len && s[cur] != '\n') {
70+
cur++;
71+
}
72+
// If we found a newline, advance past it for the next iteration, ensuring '\n' is included
73+
if (cur < len) {
74+
cur++;
75+
}
76+
addLine({ s + pos, cur - pos });
3277
}
33-
return SIZE_MAX;
3478
}
3579

36-
void LineDictionary::addText(const std::string& line) noexcept {
37-
const char* s = line.c_str();
80+
void LineDictionary::addLine(std::string_view const line) noexcept {
81+
auto const lines = splitString(line);
82+
for (std::string_view const& subline : lines) {
83+
// Never add a line twice.
84+
auto pos = mLineIndices.find(subline);
85+
if (pos != mLineIndices.end()) {
86+
pos->second.count++;
87+
continue;
88+
}
89+
mStrings.emplace_back(std::make_unique<std::string>(subline));
90+
mLineIndices.emplace(*mStrings.back(),
91+
LineInfo{
92+
.index = index_t(mStrings.size() - 1),
93+
.count = 1 });
94+
}
95+
}
3896

39-
size_t cur = 0;
40-
size_t pos = 0;
41-
size_t len = 0;
97+
// Returns `s` without its leading whitespace, as classified by std::isspace.
// The result views the same underlying characters as the argument.
std::string_view LineDictionary::ltrim(std::string_view s) {
    size_t skipped = 0;
    while (skipped < s.size() && std::isspace(static_cast<unsigned char>(s[skipped]))) {
        ++skipped;
    }
    s.remove_prefix(skipped);
    return s;
}
42102

43-
while (s[cur] != '\0') {
44-
pos = cur;
45-
len = 0;
46-
while (s[cur] != '\n') {
47-
cur++;
48-
len++;
103+
// Searches `line`, starting at `offset`, for the first generated-identifier pattern: one of
// the prefixes "hp_copy_", "mp_copy_" or "_", immediately followed by one to six decimal
// digits (e.g. "_1234").
//
// A match may not directly follow an alphanumeric character, so the '_' in "foo_12" is not a
// match. The character before `offset` is taken into account for this check.
//
// Returns { position, length } of the match, where length covers the prefix and at most six
// digits, or { std::string_view::npos, 0 } when there is no match.
std::pair<size_t, size_t> LineDictionary::findPattern(
        std::string_view const line, size_t const offset) {
    // Patterns are ordered from longest to shortest to ensure correct prefix matching.
    static constexpr std::string_view kPatterns[] = { "hp_copy_", "mp_copy_", "_" };
    // A match never spans more than this many digits; extra digits are left to the caller.
    constexpr size_t kMaxDigits = 6;

    // The <cctype> classifiers have undefined behavior when given a negative value other than
    // EOF, which happens with non-ASCII bytes where `char` is signed. Always go through
    // unsigned char.
    auto const isAlnum = [](char const c) {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    };
    auto const isDigit = [](char const c) {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    };

    size_t const lineLen = line.length();
    for (size_t i = offset; i < lineLen; ++i) {
        // A pattern must be a whole word (or at the start of the string).
        if (i > 0 && isAlnum(line[i - 1])) {
            continue;
        }

        for (auto const& prefix : kPatterns) {
            // substr() clamps at the end of the line, so a truncated prefix never compares equal.
            if (line.substr(i, prefix.size()) != prefix) {
                continue;
            }

            // A known prefix has been matched. Now, check for a sequence of digits.
            size_t const startOfDigits = i + prefix.size();
            if (startOfDigits < lineLen && isDigit(line[startOfDigits])) {
                size_t j = startOfDigits;
                while (j < lineLen && j < startOfDigits + kMaxDigits && isDigit(line[j])) {
                    ++j;
                }
                // We have a full pattern match (prefix + digits).
                return { i, j - i };
            }

            // A prefix without digits is not a valid pattern. No other prefix can match at
            // this position either, so resume the search from the next character.
            break;
        }
    }
    return { std::string_view::npos, 0 }; // No pattern found
}
55136

56-
void LineDictionary::addLine(const std::string&& line) noexcept {
57-
// Never add a line twice.
58-
if (mLineIndices.find(line) != mLineIndices.end()) {
59-
return;
137+
std::vector<std::string_view> LineDictionary::splitString(std::string_view const line) {
138+
std::vector<std::string_view> result;
139+
size_t current_pos = 0;
140+
141+
if (line.empty()) {
142+
result.push_back({});
143+
return result;
60144
}
61-
mStrings.emplace_back(std::make_unique<std::string>(line));
62-
mLineIndices.emplace(*mStrings.back(), mStrings.size() - 1);
145+
146+
while (current_pos < line.length()) {
147+
auto const [match_pos, match_len] = findPattern(line, current_pos);
148+
149+
if (match_pos == std::string_view::npos) {
150+
// No more patterns found, add the rest of the string.
151+
result.push_back(line.substr(current_pos));
152+
break;
153+
}
154+
155+
// Add the part before the match.
156+
if (match_pos > current_pos) {
157+
result.push_back(line.substr(current_pos, match_pos - current_pos));
158+
}
159+
160+
// Add the match itself.
161+
result.push_back(line.substr(match_pos, match_len));
162+
163+
// Move cursor past the match.
164+
current_pos = match_pos + match_len;
165+
}
166+
167+
return result;
168+
}
169+
170+
// Writes a human-readable report about the dictionary to `stream`: every entry with its use
// count (most used first, ties broken by index), followed by size and compression figures
// and three estimates of the space taken by the indices.
//
// The index estimates assume 16-bit indices for the fixed-size case, and 8-bit indices for the
// first 128 values (0..127) with 16-bit indices otherwise for the variable-length cases.
//
// With an empty dictionary the ratio and average figures are divisions by zero on doubles.
void LineDictionary::printStatistics(utils::io::ostream& stream) const noexcept {
    using namespace utils;

    // Number of index values assumed to fit in a single byte (0..127).
    constexpr size_t kOneByteIndexCount = 128;

    std::vector<std::pair<std::string_view, LineInfo>> info(
            mLineIndices.begin(), mLineIndices.end());

    // Sort by count, then by index.
    std::sort(info.begin(), info.end(),
            [](auto const& lhs, auto const& rhs) {
                if (lhs.second.count != rhs.second.count) {
                    return lhs.second.count > rhs.second.count;
                }
                return lhs.second.index < rhs.second.index;
            });

    size_t total_size = 0;
    size_t compressed_size = 0;
    size_t total_lines = 0;
    size_t indices_size = 0;
    size_t indices_size_if_varlen = 0;
    size_t indices_size_if_varlen_sorted = 0;

    // Position of the entry in the sorted list, i.e. the index it would get if indices were
    // assigned by decreasing use count.
    size_t rank = 0;

    // Print the dictionary.
    stream << "Line dictionary:" << io::endl;
    for (auto const& [text, lineInfo] : info) {
        compressed_size += text.length();
        total_size += text.length() * lineInfo.count;
        total_lines += lineInfo.count;
        indices_size += sizeof(uint16_t) * lineInfo.count;

        // Estimate with the indices as currently assigned.
        if (lineInfo.index < kOneByteIndexCount) {
            indices_size_if_varlen += sizeof(uint8_t) * lineInfo.count;
        } else {
            indices_size_if_varlen += sizeof(uint16_t) * lineInfo.count;
        }

        // Estimate with the indices re-assigned by rank; same one-byte limit as above.
        if (rank < kOneByteIndexCount) {
            indices_size_if_varlen_sorted += sizeof(uint8_t) * lineInfo.count;
        } else {
            indices_size_if_varlen_sorted += sizeof(uint16_t) * lineInfo.count;
        }
        rank++;

        stream << " " << lineInfo.count << ": " << text << io::endl;
    }

    stream << "Total size: " << total_size << ", compressed size: " << compressed_size << io::endl;
    stream << "Saved size: " << total_size - compressed_size << io::endl;
    stream << "Unique lines: " << mLineIndices.size() << io::endl;
    stream << "Total lines: " << total_lines << io::endl;
    stream << "Compression ratio: " << double(total_size) / compressed_size << io::endl;
    stream << "Average line length (total): " << double(total_size) / total_lines << io::endl;
    stream << "Average line length (compressed): " << double(compressed_size) / mLineIndices.size() << io::endl;
    stream << "Indices size: " << indices_size << io::endl;
    stream << "Indices size (if varlen): " << indices_size_if_varlen << io::endl;
    stream << "Indices size (if varlen, sorted): " << indices_size_if_varlen_sorted << io::endl;

    // some data we gathered

    // Total size: 751161, compressed size: 59818
    // Saved size: 691343
    // Unique lines: 3659
    // Total lines: 61686
    // Compression ratio: 12.557440904075696
    // Average line length (total): 12.177171481373406
    // Average line length (compressed): 16.34818256354195


    // Total size: 751161, compressed size: 263215
    // Saved size: 487946
    // Unique lines: 4672
    // Total lines: 23258
    // Compression ratio: 2.8537925270216364
    // Average line length (total): 32.296887092613296
    // Average line length (compressed): 56.338827054794521
}
64243

65244
} // namespace filamat

libs/filamat/src/eiff/LineDictionary.h

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,38 +17,70 @@
1717
#ifndef TNT_FILAMAT_LINEDICTIONARY_H
1818
#define TNT_FILAMAT_LINEDICTIONARY_H
1919

20+
#include <cstddef>
2021
#include <memory>
2122
#include <string>
2223
#include <string_view>
2324
#include <unordered_map>
25+
#include <utility>
2426
#include <vector>
2527

28+
namespace utils::io {
29+
class ostream;
30+
}
31+
2632
namespace filamat {
2733

28-
// Establish a line <-> id mapping. Use for shader compression when each shader is sliced in lines
29-
// and each line encoded into a 16 bit id.
3034
class LineDictionary {
3135
public:
32-
LineDictionary() = default;
36+
using index_t = uint32_t;
37+
38+
LineDictionary();
39+
~LineDictionary() noexcept;
3340

3441
// Due to the presence of unique_ptr, disallow copy construction but allow move construction.
3542
LineDictionary(LineDictionary const&) = delete;
3643
LineDictionary(LineDictionary&&) = default;
3744

38-
void addText(const std::string& text) noexcept;
39-
size_t getLineCount() const;
45+
// Adds text to the dictionary, parsing it into lines.
46+
void addText(std::string_view text) noexcept;
47+
48+
// Returns the total number of unique lines stored in the dictionary.
49+
size_t getDictionaryLineCount() const;
4050

51+
// Checks if the dictionary is empty.
4152
bool isEmpty() const noexcept {
4253
return mStrings.empty();
4354
}
4455

45-
std::string_view getString(size_t index) const noexcept;
46-
size_t getIndex(std::string_view s) const noexcept;
56+
// Retrieves a string by its index.
57+
std::string const& getString(index_t index) const noexcept;
58+
59+
// Retrieves the indices of lines that match the given string view.
60+
std::vector<index_t> getIndices(std::string_view const& line) const noexcept;
61+
62+
// Prints statistics about the dictionary to the given output stream.
63+
void printStatistics(utils::io::ostream& stream) const noexcept;
4764

4865
private:
49-
void addLine(const std::string&& line) noexcept;
66+
// Adds a single line to the dictionary.
67+
void addLine(std::string_view line) noexcept;
68+
69+
// Trims leading whitespace from a string view.
70+
static std::string_view ltrim(std::string_view s);
71+
72+
// Splits a string view into a vector of string views based on delimiters.
73+
static std::vector<std::string_view> splitString(std::string_view line);
74+
75+
// Finds a pattern within a string view starting from an offset.
76+
static std::pair<size_t, size_t> findPattern(std::string_view line, size_t offset);
77+
78+
struct LineInfo {
79+
index_t index;
80+
uint32_t count;
81+
};
5082

51-
std::unordered_map<std::string_view, size_t> mLineIndices;
83+
std::unordered_map<std::string_view, LineInfo> mLineIndices;
5284
std::vector<std::unique_ptr<std::string>> mStrings;
5385
};
5486

0 commit comments

Comments
 (0)