Skip to content

Commit 05a6198

Browse files
committed
froxelization optimizations
- by limiting the maximum number of point lights to 255 instead of 256, we can simplify some loops. - hint the compiler that certain loop counters are multiples of 16 and non-zero, which helps vectorization.
1 parent 00bd30c commit 05a6198

File tree

3 files changed

+33
-19
lines changed

3 files changed

+33
-19
lines changed

filament/src/Froxelizer.cpp

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,8 @@ Froxelizer::Froxelizer(FEngine& engine)
159159

160160
size_t const froxelBufferByteCount = getFroxelBufferByteCount(driverApi);
161161
mFroxelBufferEntryCount = froxelBufferByteCount / sizeof(FroxelEntry);
162+
mFroxelBufferEntryCount &= ~0xF; // make sure it's a multiple of 16 (helps vectorizing)
163+
assert_invariant(mFroxelBufferEntryCount >= 16); // that's also needed elsewhere
162164

163165
size_t const froxelRecordBufferByteCount = getFroxelRecordBufferByteCount(driverApi);
164166
mFroxelRecordBufferEntryCount = froxelRecordBufferByteCount / sizeof(uint8_t);
@@ -710,11 +712,13 @@ void Froxelizer::froxelizeAssignRecordsCompress() noexcept {
710712
// this gets very well vectorized...
711713

712714
Slice records(mLightRecords);
713-
for (size_t j = 0, jc = getFroxelBufferEntryCount(); j < jc; j++) {
715+
for (size_t j = 0, jc = getFroxelBufferEntryCount() ; j < jc; j++) {
716+
using container_type = LightRecord::bitset::container_type;
717+
constexpr size_t r = sizeof(container_type) / sizeof(LightGroupType);
718+
UTILS_UNROLL
714719
for (size_t i = 0; i < LightRecord::bitset::WORLD_COUNT; i++) {
715-
using container_type = LightRecord::bitset::container_type;
716-
constexpr size_t r = sizeof(container_type) / sizeof(LightGroupType);
717720
container_type b = froxelThreadData[i * r][j];
721+
UTILS_UNROLL
718722
for (size_t k = 0; k < r; k++) {
719723
b |= (container_type(froxelThreadData[i * r + k][j]) << (LIGHT_PER_GROUP * k));
720724
}
@@ -733,19 +737,21 @@ void Froxelizer::froxelizeAssignRecordsCompress() noexcept {
733737
const size_t froxelCountX = mFroxelCountX;
734738
RecordBufferType* const UTILS_RESTRICT froxelRecords = mRecordBufferUser.data();
735739

736-
// initialize the first record with all lights in the scene -- this will be used only if
740+
// Initialize the first record with all lights in the scene -- this will be used only if
737741
// we run out of record space.
738-
const uint8_t allLightsCount = uint8_t(std::min(size_t(255), allLights.count()));
742+
743+
// Our light count cannot be larger than 255 because it's stored in a uint8_t. This should
744+
// be guaranteed by CONFIG_MAX_LIGHT_COUNT
745+
assert_invariant(allLights.count() <= std::numeric_limits<uint8_t>::max());
746+
747+
const uint8_t allLightsCount = allLights.count();
739748
offset += allLightsCount;
740-
allLights.forEachSetBit([point = froxelRecords, froxelRecords](size_t l) mutable {
749+
allLights.forEachSetBit([p = froxelRecords](size_t l) mutable {
741750
// make sure to keep this code branch-less
742751
const size_t word = l / LIGHT_PER_GROUP;
743752
const size_t bit = l % LIGHT_PER_GROUP;
744753
l = (bit * GROUP_COUNT) | (word % GROUP_COUNT);
745-
*point = RecordBufferType(l);
746-
// we need to "cancel" the write operation if we have more than 255 spot or point lights
747-
// (this is a limitation of the data type used to store the light counts per froxel)
748-
point += (point - froxelRecords < 255) ? 1 : 0;
754+
*p++ = RecordBufferType(l);
749755
});
750756

751757
// how many froxel record entries were reused (for debugging)
@@ -759,8 +765,10 @@ void Froxelizer::froxelizeAssignRecordsCompress() noexcept {
759765
}
760766

761767
// We have a limitation of 255 spot + 255 point lights per froxel.
768+
assert_invariant(b.lights.count() <= std::numeric_limits<uint8_t>::max());
769+
762770
// note: initializer list for union cannot have more than one element
763-
FroxelEntry entry{ offset, uint8_t(std::min(size_t(255), b.lights.count())) };
771+
FroxelEntry entry{ offset, uint8_t(b.lights.count()) };
764772
const size_t lightCount = entry.count();
765773

766774
if (UTILS_UNLIKELY(offset + lightCount >= mFroxelRecordBufferEntryCount)) {
@@ -778,15 +786,12 @@ void Froxelizer::froxelizeAssignRecordsCompress() noexcept {
778786

779787
// iterate the bitfield
780788
auto * const beginPoint = froxelRecords + offset;
781-
b.lights.forEachSetBit([point = beginPoint, beginPoint](size_t l) mutable {
789+
b.lights.forEachSetBit([p = beginPoint](size_t l) mutable {
782790
// make sure to keep this code branch-less
783791
const size_t word = l / LIGHT_PER_GROUP;
784792
const size_t bit = l % LIGHT_PER_GROUP;
785793
l = (bit * GROUP_COUNT) | (word % GROUP_COUNT);
786-
*point = RecordBufferType(l);
787-
// we need to "cancel" the write operation if we have more than 255 spot or point lights
788-
// (this is a limitation of the data type used to store the light counts per froxel)
789-
point += (point - beginPoint < 255) ? 1 : 0;
794+
*p++ = RecordBufferType(l);
790795
});
791796

792797
offset += lightCount;

filament/src/Froxelizer.h

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <math/vec4.h>
3737

3838
#include <utils/compiler.h>
39+
#include <utils/debug.h>
3940
#include <utils/bitset.h>
4041
#include <utils/Slice.h>
4142

@@ -172,6 +173,12 @@ class Froxelizer {
172173

173174
private:
174175
size_t getFroxelBufferEntryCount() const noexcept {
176+
// We guarantee that mFroxelBufferEntryCount is a multiple of 16. With this knowledge
177+
// the compiler can do a much better job at vectorizing. For similar reasons, it's
178+
// important to keep mFroxelBufferEntryCount a uint32_t (as opposed to a size_t).
179+
assert_invariant((mFroxelBufferEntryCount & 0xF) == 0);
180+
UTILS_ASSUME((mFroxelBufferEntryCount & 0xF) == 0);
181+
UTILS_ASSUME(mFroxelBufferEntryCount >= 16);
175182
return mFroxelBufferEntryCount;
176183
}
177184

@@ -248,10 +255,10 @@ class Froxelizer {
248255
LinearAllocatorArena mArena; // ~256 KiB
249256

250257
// 4096 froxels fits in a 16KiB buffer, the minimum guaranteed in GLES 3.x and Vulkan 1.1
251-
size_t mFroxelBufferEntryCount = 4096;
258+
uint32_t mFroxelBufferEntryCount = 4096;
252259

253260
// 16384 entries is our minimum with a 16KiB buffer
254-
size_t mFroxelRecordBufferEntryCount = 16384;
261+
uint32_t mFroxelRecordBufferEntryCount = 16384;
255262

256263
// allocations in the private froxel arena
257264
float* mDistancesZ = nullptr;

libs/filabridge/include/private/filament/EngineEnums.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,9 @@ enum class PushConstantIds : uint8_t {
9494

9595
// This value is limited by UBO size, ES3.0 only guarantees 16 KiB.
9696
// It's also limited by the Froxelizer's record buffer data type (uint8_t).
97-
constexpr size_t CONFIG_MAX_LIGHT_COUNT = 256;
97+
// And it's limited by the Froxelizer's Froxel data structure, which stores
98+
// a light count in a uint8_t (so the count is limited to 255)
99+
constexpr size_t CONFIG_MAX_LIGHT_COUNT = 255;
98100
constexpr size_t CONFIG_MAX_LIGHT_INDEX = CONFIG_MAX_LIGHT_COUNT - 1;
99101

100102
// The number of specialization constants that Filament reserves for its own use. These are always

0 commit comments

Comments
 (0)