matc: improvements to LineDictionary compression

pixelflinger · pixelflinger · commit 53bc80274d9d · 2025-08-11T15:51:18.000-07:00
We can significantly improve the effectiveness of the line dictionary
by splitting lines strategically. SPIR-V optimization tends to create
new variable names (e.g. _1234) that are not always stable between
variants, thereby preventing otherwise identical lines from being
shared in the dictionary. By simply splitting lines around these
patterns, we improve the effectiveness of the compression.

The decompression is almost unchanged. To keep the code simple, we
now keep the '\n' at the end of lines in the dictionary; that is the
only change on the decompression side.

On certain materials like aiDefault.mat the compression ratio is
improved from 2.8x to 15x.


This also fixes a small bug where the line dictionary relied on the
input ending with a newline.
diff --git a/NEW_RELEASE_NOTES.md b/NEW_RELEASE_NOTES.md
@@ -14,3 +14,4 @@ appropriate header in [RELEASE_NOTES.md](./RELEASE_NOTES.md).
   depth-only passes. [**Requires recompiling materials**]
 - material: fix specularFactor in `LOW_QUALITY` mode. [**Requires recompiling materials**] to take effect.
 - material: Add CRC32 validation for material packages [⚠️ **New Material Version**]
+- material: Improve LineDictionary compression [⚠️ **New Material Version**]
diff --git a/libs/filaflat/src/MaterialChunk.cpp b/libs/filaflat/src/MaterialChunk.cpp
@@ -144,10 +144,9 @@ bool MaterialChunk::getTextShader(Unflattener unflattener,
         }
         const auto& content = dictionary[lineIndex];
 
-        // Replace null with newline.
+        // remove the terminating null character.
         memcpy(&shaderContent[cursor], content.data(), content.size() - 1);
         cursor += content.size() - 1;
-        shaderContent[cursor++] = '\n';
     }
 
     // Write the terminating null character.
diff --git a/libs/filamat/src/eiff/DictionaryTextChunk.cpp b/libs/filamat/src/eiff/DictionaryTextChunk.cpp
@@ -15,19 +15,27 @@
  */
 
 #include "DictionaryTextChunk.h"
+#include "Chunk.h"
+#include "Flattener.h"
+#include "LineDictionary.h"
+
+#include <filament/MaterialChunkType.h>
+
+#include <cstddef>
+#include <utility>
 
 namespace filamat {
 
-DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType chunkType) :
+DictionaryTextChunk::DictionaryTextChunk(LineDictionary&& dictionary, ChunkType const chunkType) :
         Chunk(chunkType), mDictionary(std::move(dictionary)) {
 }
 
 void DictionaryTextChunk::flatten(Flattener& f) {
     // NumStrings
-    f.writeUint32(mDictionary.getLineCount());
+    f.writeUint32(mDictionary.getDictionaryLineCount());
 
     // Strings
-    for (size_t i = 0 ; i < mDictionary.getLineCount() ; i++) {
+    for (LineDictionary::index_t i = 0, c = mDictionary.getDictionaryLineCount() ; i < c ; i++) {
         f.writeString(mDictionary.getString(i).data());
     }
 }
diff --git a/libs/filamat/src/eiff/LineDictionary.cpp b/libs/filamat/src/eiff/LineDictionary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017 The Android Open Source Project
+ * Copyright (C) 2025 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,50 +16,229 @@
 
 #include "LineDictionary.h"
 
+#include <utils/debug.h>
+#include <utils/Log.h>
+#include <utils/ostream.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
 namespace filamat {
 
-std::string_view LineDictionary::getString(size_t index) const noexcept {
+LineDictionary::LineDictionary() = default;
+
+LineDictionary::~LineDictionary() noexcept {
+    //printStatistics(utils::slog.d);
+}
+
+std::string const& LineDictionary::getString(index_t const index) const noexcept {
     return *mStrings[index];
 }
 
-size_t LineDictionary::getLineCount() const {
+size_t LineDictionary::getDictionaryLineCount() const {
     return mStrings.size();
 }
+std::vector<LineDictionary::index_t> LineDictionary::getIndices(
+        std::string_view const& line) const noexcept {
+    std::vector<index_t> result;
+    std::vector<std::string_view> const sublines = splitString(line);
+    for (std::string_view const& subline : sublines) {
+        if (auto iter = mLineIndices.find(subline); iter != mLineIndices.end()) {
+            result.push_back(iter->second.index);
+        }
+    }
+    return result;
+}
 
-size_t LineDictionary::getIndex(std::string_view s) const noexcept {
-    if (auto iter = mLineIndices.find(s); iter != mLineIndices.end()) {
-        return iter->second;
+void LineDictionary::addText(std::string_view const text) noexcept {
+    size_t cur = 0;
+    size_t const len = text.length();
+    const char* s = text.data();
+    while (cur < len) {
+        // Start of the current line
+        size_t const pos = cur;
+        // Find the end of the current line or end of text
+        while (cur < len && s[cur] != '\n') {
+            cur++;
+        }
+        // If we found a newline, advance past it for the next iteration, ensuring '\n' is included
+        if (cur < len) {
+            cur++;
+        }
+        addLine({ s + pos, cur - pos });
     }
-    return SIZE_MAX;
 }
 
-void LineDictionary::addText(const std::string& line) noexcept {
-    const char* s = line.c_str();
+void LineDictionary::addLine(std::string_view const line) noexcept {
+    auto const lines = splitString(line);
+    for (std::string_view const& subline : lines) {
+        // Never add a line twice.
+        auto pos = mLineIndices.find(subline);
+        if (pos != mLineIndices.end()) {
+            pos->second.count++;
+            continue;
+        }
+        mStrings.emplace_back(std::make_unique<std::string>(subline));
+        mLineIndices.emplace(*mStrings.back(),
+                LineInfo{
+                    .index = index_t(mStrings.size() - 1),
+                    .count = 1 });
+    }
+}
 
-    size_t cur = 0;
-    size_t pos = 0;
-    size_t len = 0;
+std::string_view LineDictionary::ltrim(std::string_view s) {
+    s.remove_prefix(std::distance(s.begin(), std::find_if(s.begin(), s.end(),
+            [](unsigned char const c) { return !std::isspace(c); })));
+    return { s.data(), s.size() };
+}
 
-    while (s[cur] != '\0') {
-        pos = cur;
-        len = 0;
-        while (s[cur] != '\n') {
-            cur++;
-            len++;
+std::pair<size_t, size_t> LineDictionary::findPattern(
+        std::string_view const line, size_t const offset) {
+    // Patterns are ordered from longest to shortest to ensure correct prefix matching.
+    static constexpr std::string_view kPatterns[] = { "hp_copy_", "mp_copy_", "_" };
+
+    const size_t line_len = line.length();
+    for (size_t i = offset; i < line_len; ++i) {
+        // A pattern must be a whole word (or at the start of the string).
+        if (i > 0 && std::isalnum(line[i - 1])) {
+            continue;
+        }
+
+        for (const auto& prefix : kPatterns) {
+            if (line.size() - i >= prefix.size() && line.substr(i, prefix.size()) == prefix) {
+                // A known prefix has been matched. Now, check for a sequence of digits.
+                size_t const startOfDigits = i + prefix.size();
+                if (startOfDigits < line_len && std::isdigit(line[startOfDigits])) {
+                    size_t j = startOfDigits;
+                    while (j < line_len && (j < startOfDigits + 6) && std::isdigit(line[j])) {
+                        j++;
+                    }
+                    // We have a full pattern match (prefix + digits).
+                    return { i, j - i };
+                }
+                // If a prefix is matched but not followed by digits, it's not a valid pattern.
+                // We break to the outer loop to continue searching from the next character,
+                // because we've already checked the longest possible prefix at this position.
+                break;
+            }
         }
-        std::string newLine(s + pos, len);
-        addLine(std::move(newLine));
-        cur++;
     }
+    return { std::string_view::npos, 0 }; // No pattern found
 }
 
-void LineDictionary::addLine(const std::string&& line) noexcept {
-    // Never add a line twice.
-    if (mLineIndices.find(line) != mLineIndices.end()) {
-        return;
+std::vector<std::string_view> LineDictionary::splitString(std::string_view const line) {
+    std::vector<std::string_view> result;
+    size_t current_pos = 0;
+
+    if (line.empty()) {
+        result.push_back({});
+        return result;
     }
-    mStrings.emplace_back(std::make_unique<std::string>(line));
-    mLineIndices.emplace(*mStrings.back(), mStrings.size() - 1);
+
+    while (current_pos < line.length()) {
+        auto const [match_pos, match_len] = findPattern(line, current_pos);
+
+        if (match_pos == std::string_view::npos) {
+            // No more patterns found, add the rest of the string.
+            result.push_back(line.substr(current_pos));
+            break;
+        }
+
+        // Add the part before the match.
+        if (match_pos > current_pos) {
+            result.push_back(line.substr(current_pos, match_pos - current_pos));
+        }
+
+        // Add the match itself.
+        result.push_back(line.substr(match_pos, match_len));
+
+        // Move cursor past the match.
+        current_pos = match_pos + match_len;
+    }
+
+    return result;
+}
+
+void LineDictionary::printStatistics(utils::io::ostream& stream) const noexcept {
+    std::vector<std::pair<std::string_view, LineInfo>> info;
+    for (auto const& pair : mLineIndices) {
+        info.push_back(pair);
+    }
+
+    // Sort by count, then by index.
+    std::sort(info.begin(), info.end(),
+            [](auto const& lhs, auto const& rhs) {
+        if (lhs.second.count != rhs.second.count) {
+            return lhs.second.count > rhs.second.count;
+        }
+        return lhs.second.index < rhs.second.index;
+    });
+
+    size_t total_size = 0;
+    size_t compressed_size = 0;
+    size_t total_lines = 0;
+    size_t indices_size = 0;
+    size_t indices_size_if_varlen = 0;
+    size_t indices_size_if_varlen_sorted = 0;
+    size_t i = 0;
+    using namespace utils;
+    // Print the dictionary.
+    stream << "Line dictionary:" << io::endl;
+    for (auto const& pair : info) {
+        compressed_size += pair.first.length();
+        total_size += pair.first.length() * pair.second.count;
+        total_lines += pair.second.count;
+        indices_size += sizeof(uint16_t) * pair.second.count;
+        if (pair.second.index <= 127) {
+            indices_size_if_varlen += sizeof(uint8_t) * pair.second.count;
+        } else {
+            indices_size_if_varlen += sizeof(uint16_t) * pair.second.count;
+        }
+        if (i <= 128) {
+            indices_size_if_varlen_sorted += sizeof(uint8_t) * pair.second.count;
+        } else {
+            indices_size_if_varlen_sorted += sizeof(uint16_t) * pair.second.count;
+        }
+        i++;
+        stream << "  " << pair.second.count << ": " << pair.first << io::endl;
+    }
+    stream << "Total size: " << total_size << ", compressed size: " << compressed_size << io::endl;
+    stream << "Saved size: " << total_size - compressed_size << io::endl;
+    stream << "Unique lines: " << mLineIndices.size() << io::endl;
+    stream << "Total lines: " << total_lines << io::endl;
+    stream << "Compression ratio: " << double(total_size) / compressed_size << io::endl;
+    stream << "Average line length (total): " << double(total_size) / total_lines << io::endl;
+    stream << "Average line length (compressed): " << double(compressed_size) / mLineIndices.size() << io::endl;
+    stream << "Indices size: " << indices_size << io::endl;
+    stream << "Indices size (if varlen): " << indices_size_if_varlen << io::endl;
+    stream << "Indices size (if varlen, sorted): " << indices_size_if_varlen_sorted << io::endl;
+
+    // some data we gathered
+
+    // Total size: 751161, compressed size: 59818
+    // Saved size: 691343
+    // Unique lines: 3659
+    // Total lines: 61686
+    // Compression ratio: 12.557440904075696
+    // Average line length (total): 12.177171481373406
+    // Average line length (compressed): 16.34818256354195
+
+
+    // Total size: 751161, compressed size: 263215
+    // Saved size: 487946
+    // Unique lines: 4672
+    // Total lines: 23258
+    // Compression ratio: 2.8537925270216364
+    // Average line length (total): 32.296887092613296
+    // Average line length (compressed): 56.338827054794521
 }
 
 } // namespace filamat
diff --git a/libs/filamat/src/eiff/LineDictionary.h b/libs/filamat/src/eiff/LineDictionary.h
@@ -17,38 +17,70 @@
 #ifndef TNT_FILAMAT_LINEDICTIONARY_H
 #define TNT_FILAMAT_LINEDICTIONARY_H
 
+#include <cstddef>
 #include <memory>
 #include <string>
 #include <string_view>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
+namespace utils::io {
+class ostream;
+}
+
 namespace filamat {
 
-// Establish a line <-> id mapping. Use for shader compression when each shader is sliced in lines
-// and each line encoded into a 16 bit id.
 class LineDictionary {
 public:
-    LineDictionary() = default;
+    using index_t = uint32_t;
+
+    LineDictionary();
+    ~LineDictionary() noexcept;
 
     // Due to the presence of unique_ptr, disallow copy construction but allow move construction.
     LineDictionary(LineDictionary const&) = delete;
     LineDictionary(LineDictionary&&) = default;
 
-    void addText(const std::string& text) noexcept;
-    size_t getLineCount() const;
+    // Adds text to the dictionary, parsing it into lines.
+    void addText(std::string_view text) noexcept;
+
+    // Returns the total number of unique lines stored in the dictionary.
+    size_t getDictionaryLineCount() const;
 
+    // Checks if the dictionary is empty.
     bool isEmpty() const noexcept {
         return mStrings.empty();
     }
 
-    std::string_view getString(size_t index) const noexcept;
-    size_t getIndex(std::string_view s) const noexcept;
+    // Retrieves a string by its index.
+    std::string const& getString(index_t index) const noexcept;
+
+    // Retrieves the indices of lines that match the given string view.
+    std::vector<index_t> getIndices(std::string_view const& line) const noexcept;
+
+    // Prints statistics about the dictionary to the given output stream.
+    void printStatistics(utils::io::ostream& stream) const noexcept;
 
 private:
-    void addLine(const std::string&& line) noexcept;
+    // Adds a single line to the dictionary.
+    void addLine(std::string_view line) noexcept;
+
+    // Trims leading whitespace from a string view.
+    static std::string_view ltrim(std::string_view s);
+
+    // Splits a string view into a vector of string views based on delimiters.
+    static std::vector<std::string_view> splitString(std::string_view line);
+
+    // Finds a pattern within a string view starting from an offset.
+    static std::pair<size_t, size_t> findPattern(std::string_view line, size_t offset);
+
+    struct LineInfo {
+        index_t index;
+        uint32_t count;
+    };
 
-    std::unordered_map<std::string_view, size_t> mLineIndices;
+    std::unordered_map<std::string_view, LineInfo> mLineIndices;
     std::vector<std::unique_ptr<std::string>> mStrings;
 };
 
diff --git a/libs/filamat/src/eiff/MaterialTextChunk.cpp b/libs/filamat/src/eiff/MaterialTextChunk.cpp

Original file line number	Diff line number	Diff line change
`@@ -144,10 +144,9 @@ bool MaterialChunk::getTextShader(Unflattener unflattener,`
`144`	`144`	`}`
`145`	`145`	`const auto& content = dictionary[lineIndex];`
`146`	`146`
`147`		`- // Replace null with newline.`
	`147`	`+ // remove the terminating null character.`
`148`	`148`	`memcpy(&shaderContent[cursor], content.data(), content.size() - 1);`
`149`	`149`	`cursor += content.size() - 1;`
`150`		`- shaderContent[cursor++] = '\n';`
`151`	`150`	`}`
`152`	`151`
`153`	`152`	`// Write the terminating null character.`