|
1 | 1 | /*
|
2 |
| - * Copyright (C) 2017 The Android Open Source Project |
| 2 | + * Copyright (C) 2025 The Android Open Source Project |
3 | 3 | *
|
4 | 4 | * Licensed under the Apache License, Version 2.0 (the "License");
|
5 | 5 | * you may not use this file except in compliance with the License.
|
|
16 | 16 |
|
17 | 17 | #include "LineDictionary.h"
|
18 | 18 |
|
| 19 | +#include <utils/debug.h> |
| 20 | +#include <utils/Log.h> |
| 21 | +#include <utils/ostream.h> |
| 22 | + |
| 23 | +#include <algorithm> |
| 24 | +#include <cctype> |
| 25 | +#include <cstddef> |
| 26 | +#include <cstdint> |
| 27 | +#include <iterator> |
| 28 | +#include <memory> |
| 29 | +#include <string> |
| 30 | +#include <string_view> |
| 31 | +#include <utility> |
| 32 | +#include <vector> |
| 33 | + |
19 | 34 | namespace filamat {
|
20 | 35 |
|
21 |
| -std::string_view LineDictionary::getString(size_t index) const noexcept { |
// Defaulted constructor.
LineDictionary::LineDictionary() = default;

// Destructor. The statistics dump below is disabled; uncomment it to print the
// dictionary statistics when the object is destroyed.
LineDictionary::~LineDictionary() noexcept {
    //printStatistics(utils::slog.d);
}
| 41 | + |
| 42 | +std::string const& LineDictionary::getString(index_t const index) const noexcept { |
22 | 43 | return *mStrings[index];
|
23 | 44 | }
|
24 | 45 |
|
25 |
| -size_t LineDictionary::getLineCount() const { |
| 46 | +size_t LineDictionary::getDictionaryLineCount() const { |
26 | 47 | return mStrings.size();
|
27 | 48 | }
|
| 49 | +std::vector<LineDictionary::index_t> LineDictionary::getIndices( |
| 50 | + std::string_view const& line) const noexcept { |
| 51 | + std::vector<index_t> result; |
| 52 | + std::vector<std::string_view> const sublines = splitString(line); |
| 53 | + for (std::string_view const& subline : sublines) { |
| 54 | + if (auto iter = mLineIndices.find(subline); iter != mLineIndices.end()) { |
| 55 | + result.push_back(iter->second.index); |
| 56 | + } |
| 57 | + } |
| 58 | + return result; |
| 59 | +} |
28 | 60 |
|
29 |
| -size_t LineDictionary::getIndex(std::string_view s) const noexcept { |
30 |
| - if (auto iter = mLineIndices.find(s); iter != mLineIndices.end()) { |
31 |
| - return iter->second; |
| 61 | +void LineDictionary::addText(std::string_view const text) noexcept { |
| 62 | + size_t cur = 0; |
| 63 | + size_t const len = text.length(); |
| 64 | + const char* s = text.data(); |
| 65 | + while (cur < len) { |
| 66 | + // Start of the current line |
| 67 | + size_t const pos = cur; |
| 68 | + // Find the end of the current line or end of text |
| 69 | + while (cur < len && s[cur] != '\n') { |
| 70 | + cur++; |
| 71 | + } |
| 72 | + // If we found a newline, advance past it for the next iteration, ensuring '\n' is included |
| 73 | + if (cur < len) { |
| 74 | + cur++; |
| 75 | + } |
| 76 | + addLine({ s + pos, cur - pos }); |
32 | 77 | }
|
33 |
| - return SIZE_MAX; |
34 | 78 | }
|
35 | 79 |
|
36 |
| -void LineDictionary::addText(const std::string& line) noexcept { |
37 |
| - const char* s = line.c_str(); |
| 80 | +void LineDictionary::addLine(std::string_view const line) noexcept { |
| 81 | + auto const lines = splitString(line); |
| 82 | + for (std::string_view const& subline : lines) { |
| 83 | + // Never add a line twice. |
| 84 | + auto pos = mLineIndices.find(subline); |
| 85 | + if (pos != mLineIndices.end()) { |
| 86 | + pos->second.count++; |
| 87 | + continue; |
| 88 | + } |
| 89 | + mStrings.emplace_back(std::make_unique<std::string>(subline)); |
| 90 | + mLineIndices.emplace(*mStrings.back(), |
| 91 | + LineInfo{ |
| 92 | + .index = index_t(mStrings.size() - 1), |
| 93 | + .count = 1 }); |
| 94 | + } |
| 95 | +} |
38 | 96 |
|
39 |
| - size_t cur = 0; |
40 |
| - size_t pos = 0; |
41 |
| - size_t len = 0; |
| 97 | +std::string_view LineDictionary::ltrim(std::string_view s) { |
| 98 | + s.remove_prefix(std::distance(s.begin(), std::find_if(s.begin(), s.end(), |
| 99 | + [](unsigned char const c) { return !std::isspace(c); }))); |
| 100 | + return { s.data(), s.size() }; |
| 101 | +} |
42 | 102 |
|
43 |
| - while (s[cur] != '\0') { |
44 |
| - pos = cur; |
45 |
| - len = 0; |
46 |
| - while (s[cur] != '\n') { |
47 |
| - cur++; |
48 |
| - len++; |
| 103 | +std::pair<size_t, size_t> LineDictionary::findPattern( |
| 104 | + std::string_view const line, size_t const offset) { |
| 105 | + // Patterns are ordered from longest to shortest to ensure correct prefix matching. |
| 106 | + static constexpr std::string_view kPatterns[] = { "hp_copy_", "mp_copy_", "_" }; |
| 107 | + |
| 108 | + const size_t line_len = line.length(); |
| 109 | + for (size_t i = offset; i < line_len; ++i) { |
| 110 | + // A pattern must be a whole word (or at the start of the string). |
| 111 | + if (i > 0 && std::isalnum(line[i - 1])) { |
| 112 | + continue; |
| 113 | + } |
| 114 | + |
| 115 | + for (const auto& prefix : kPatterns) { |
| 116 | + if (line.size() - i >= prefix.size() && line.substr(i, prefix.size()) == prefix) { |
| 117 | + // A known prefix has been matched. Now, check for a sequence of digits. |
| 118 | + size_t const startOfDigits = i + prefix.size(); |
| 119 | + if (startOfDigits < line_len && std::isdigit(line[startOfDigits])) { |
| 120 | + size_t j = startOfDigits; |
| 121 | + while (j < line_len && (j < startOfDigits + 6) && std::isdigit(line[j])) { |
| 122 | + j++; |
| 123 | + } |
| 124 | + // We have a full pattern match (prefix + digits). |
| 125 | + return { i, j - i }; |
| 126 | + } |
| 127 | + // If a prefix is matched but not followed by digits, it's not a valid pattern. |
| 128 | + // We break to the outer loop to continue searching from the next character, |
| 129 | + // because we've already checked the longest possible prefix at this position. |
| 130 | + break; |
| 131 | + } |
49 | 132 | }
|
50 |
| - std::string newLine(s + pos, len); |
51 |
| - addLine(std::move(newLine)); |
52 |
| - cur++; |
53 | 133 | }
|
| 134 | + return { std::string_view::npos, 0 }; // No pattern found |
54 | 135 | }
|
55 | 136 |
|
56 |
| -void LineDictionary::addLine(const std::string&& line) noexcept { |
57 |
| - // Never add a line twice. |
58 |
| - if (mLineIndices.find(line) != mLineIndices.end()) { |
59 |
| - return; |
| 137 | +std::vector<std::string_view> LineDictionary::splitString(std::string_view const line) { |
| 138 | + std::vector<std::string_view> result; |
| 139 | + size_t current_pos = 0; |
| 140 | + |
| 141 | + if (line.empty()) { |
| 142 | + result.push_back({}); |
| 143 | + return result; |
60 | 144 | }
|
61 |
| - mStrings.emplace_back(std::make_unique<std::string>(line)); |
62 |
| - mLineIndices.emplace(*mStrings.back(), mStrings.size() - 1); |
| 145 | + |
| 146 | + while (current_pos < line.length()) { |
| 147 | + auto const [match_pos, match_len] = findPattern(line, current_pos); |
| 148 | + |
| 149 | + if (match_pos == std::string_view::npos) { |
| 150 | + // No more patterns found, add the rest of the string. |
| 151 | + result.push_back(line.substr(current_pos)); |
| 152 | + break; |
| 153 | + } |
| 154 | + |
| 155 | + // Add the part before the match. |
| 156 | + if (match_pos > current_pos) { |
| 157 | + result.push_back(line.substr(current_pos, match_pos - current_pos)); |
| 158 | + } |
| 159 | + |
| 160 | + // Add the match itself. |
| 161 | + result.push_back(line.substr(match_pos, match_len)); |
| 162 | + |
| 163 | + // Move cursor past the match. |
| 164 | + current_pos = match_pos + match_len; |
| 165 | + } |
| 166 | + |
| 167 | + return result; |
| 168 | +} |
| 169 | + |
| 170 | +void LineDictionary::printStatistics(utils::io::ostream& stream) const noexcept { |
| 171 | + std::vector<std::pair<std::string_view, LineInfo>> info; |
| 172 | + for (auto const& pair : mLineIndices) { |
| 173 | + info.push_back(pair); |
| 174 | + } |
| 175 | + |
| 176 | + // Sort by count, then by index. |
| 177 | + std::sort(info.begin(), info.end(), |
| 178 | + [](auto const& lhs, auto const& rhs) { |
| 179 | + if (lhs.second.count != rhs.second.count) { |
| 180 | + return lhs.second.count > rhs.second.count; |
| 181 | + } |
| 182 | + return lhs.second.index < rhs.second.index; |
| 183 | + }); |
| 184 | + |
| 185 | + size_t total_size = 0; |
| 186 | + size_t compressed_size = 0; |
| 187 | + size_t total_lines = 0; |
| 188 | + size_t indices_size = 0; |
| 189 | + size_t indices_size_if_varlen = 0; |
| 190 | + size_t indices_size_if_varlen_sorted = 0; |
| 191 | + size_t i = 0; |
| 192 | + using namespace utils; |
| 193 | + // Print the dictionary. |
| 194 | + stream << "Line dictionary:" << io::endl; |
| 195 | + for (auto const& pair : info) { |
| 196 | + compressed_size += pair.first.length(); |
| 197 | + total_size += pair.first.length() * pair.second.count; |
| 198 | + total_lines += pair.second.count; |
| 199 | + indices_size += sizeof(uint16_t) * pair.second.count; |
| 200 | + if (pair.second.index <= 127) { |
| 201 | + indices_size_if_varlen += sizeof(uint8_t) * pair.second.count; |
| 202 | + } else { |
| 203 | + indices_size_if_varlen += sizeof(uint16_t) * pair.second.count; |
| 204 | + } |
| 205 | + if (i <= 128) { |
| 206 | + indices_size_if_varlen_sorted += sizeof(uint8_t) * pair.second.count; |
| 207 | + } else { |
| 208 | + indices_size_if_varlen_sorted += sizeof(uint16_t) * pair.second.count; |
| 209 | + } |
| 210 | + i++; |
| 211 | + stream << " " << pair.second.count << ": " << pair.first << io::endl; |
| 212 | + } |
| 213 | + stream << "Total size: " << total_size << ", compressed size: " << compressed_size << io::endl; |
| 214 | + stream << "Saved size: " << total_size - compressed_size << io::endl; |
| 215 | + stream << "Unique lines: " << mLineIndices.size() << io::endl; |
| 216 | + stream << "Total lines: " << total_lines << io::endl; |
| 217 | + stream << "Compression ratio: " << double(total_size) / compressed_size << io::endl; |
| 218 | + stream << "Average line length (total): " << double(total_size) / total_lines << io::endl; |
| 219 | + stream << "Average line length (compressed): " << double(compressed_size) / mLineIndices.size() << io::endl; |
| 220 | + stream << "Indices size: " << indices_size << io::endl; |
| 221 | + stream << "Indices size (if varlen): " << indices_size_if_varlen << io::endl; |
| 222 | + stream << "Indices size (if varlen, sorted): " << indices_size_if_varlen_sorted << io::endl; |
| 223 | + |
| 224 | + // some data we gathered |
| 225 | + |
| 226 | + // Total size: 751161, compressed size: 59818 |
| 227 | + // Saved size: 691343 |
| 228 | + // Unique lines: 3659 |
| 229 | + // Total lines: 61686 |
| 230 | + // Compression ratio: 12.557440904075696 |
| 231 | + // Average line length (total): 12.177171481373406 |
| 232 | + // Average line length (compressed): 16.34818256354195 |
| 233 | + |
| 234 | + |
| 235 | + // Total size: 751161, compressed size: 263215 |
| 236 | + // Saved size: 487946 |
| 237 | + // Unique lines: 4672 |
| 238 | + // Total lines: 23258 |
| 239 | + // Compression ratio: 2.8537925270216364 |
| 240 | + // Average line length (total): 32.296887092613296 |
| 241 | + // Average line length (compressed): 56.338827054794521 |
63 | 242 | }
|
64 | 243 |
|
65 | 244 | } // namespace filamat
|
0 commit comments