Skip to content

Commit 6116595

Browse files
committed
common: Fix false sharing in thread_rwlock_*() wrapper
I only fully grokked false sharing fairly recently, and I now have access to better hardware, which also made this issue more prominent. I wrote the initial *nix + Windows rwlock wrapper last year when I was working on making sure scene updates work while the renderer is running in interactive mode (10d965d). Moving the bvh_lock rwlock out of the shared scene struct into a separate heap allocation has a significant effect on performance on a 32c/64t AMD Epyc 9374F: Before: vkoskiv@Triton:~/c-ray$ hyperfine 'bin/c-ray input/hdr.json -s 128 --no-sdl -j 64' Benchmark 1: bin/c-ray input/hdr.json -s 128 --no-sdl -j 64 Time (mean ± σ): 16.680 s ± 0.036 s [User: 972.341 s, System: 0.199 s] Range (min … max): 16.621 s … 16.751 s 10 runs After: vkoskiv@Triton:~/c-ray$ hyperfine 'bin/c-ray input/hdr.json -s 128 --no-sdl -j 64' Benchmark 1: bin/c-ray input/hdr.json -s 128 --no-sdl -j 64 Time (mean ± σ): 9.278 s ± 0.028 s [User: 516.252 s, System: 0.189 s] Range (min … max): 9.244 s … 9.336 s 10 runs I didn't notice this issue when I tested it last year, since the performance degradation is much less pronounced on my 4c/8t CPU at home: Before: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 32' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 32 Time (mean ± σ): 7.363 s ± 0.074 s [User: 49.317 s, System: 0.146 s] Range (min … max): 7.271 s … 7.515 s 10 runs After: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 32' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 32 Time (mean ± σ): 7.359 s ± 0.126 s [User: 49.181 s, System: 0.130 s] Range (min … max): 7.220 s … 7.598 s 10 runs
1 parent 128027b commit 6116595

File tree

7 files changed

+77
-51
lines changed

7 files changed

+77
-51
lines changed

src/common/platform/thread.c

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -155,50 +155,73 @@ int thread_cond_broadcast(struct cr_cond *cond) {
155155
#endif
156156
}
157157

158-
int thread_rwlock_init(struct cr_rwlock *lock) {
159-
if (!lock) return -1;
160158
#ifdef WINDOWS
159+
/*
160+
Windows will presumably still suffer from false sharing, but
161+
their API is what it is :/
162+
*/
163+
struct cr_rwlock {
164+
SRWLOCK lock;
165+
bool exclusive;
166+
}
167+
#endif
168+
169+
struct cr_rwlock *thread_rwlock_init(void) {
170+
#ifdef WINDOWS
171+
struct cr_rwlock *lock = malloc(sizeof(*lock));
161172
InitializeSRWLock(&lock->lock);
162173
lock->exclusive = false;
163-
return 0;
174+
return lock;
164175
#else
165-
return pthread_rwlock_init(&lock->lock, NULL);
176+
pthread_rwlock_t *lock = malloc(sizeof(*lock));
177+
int ret = pthread_rwlock_init(lock, NULL);
178+
if (ret) {
179+
free(lock);
180+
return NULL;
181+
}
182+
return (struct cr_rwlock *)lock;
166183
#endif
167184
}
168185

169186
int thread_rwlock_destroy(struct cr_rwlock *lock) {
170-
if (!lock) return -1;
187+
if (!lock)
188+
return -1;
171189
#ifdef WINDOWS
172-
(void)lock;
190+
free(lock);
173191
return 0;
174192
#else
175-
return pthread_rwlock_destroy(&lock->lock);
193+
pthread_rwlock_destroy((pthread_rwlock_t *)lock);
194+
free(lock);
195+
return 0;
176196
#endif
177197
}
178198

179199
int thread_rwlock_rdlock(struct cr_rwlock *lock) {
180-
if (!lock) return -1;
200+
if (!lock)
201+
return -1;
181202
#ifdef WINDOWS
182203
AcquireSRWLockShared(&lock->lock);
183204
return 0;
184205
#else
185-
return pthread_rwlock_rdlock(&lock->lock);
206+
return pthread_rwlock_rdlock((pthread_rwlock_t *)lock);
186207
#endif
187208
}
188209

189210
int thread_rwlock_wrlock(struct cr_rwlock *lock) {
190-
if (!lock) return -1;
211+
if (!lock)
212+
return -1;
191213
#ifdef WINDOWS
192214
AcquireSRWLockExclusive(&lock->lock);
193215
lock->exclusive = true;
194216
return 0;
195217
#else
196-
return pthread_rwlock_wrlock(&lock->lock);
218+
return pthread_rwlock_wrlock((pthread_rwlock_t *)lock);
197219
#endif
198220
}
199221

200222
int thread_rwlock_unlock(struct cr_rwlock *lock) {
201-
if (!lock) return -1;
223+
if (!lock)
224+
return -1;
202225
#ifdef WINDOWS
203226
if (lock->exclusive) {
204227
lock->exclusive = false;
@@ -208,7 +231,7 @@ int thread_rwlock_unlock(struct cr_rwlock *lock) {
208231
}
209232
return 0;
210233
#else
211-
return pthread_rwlock_unlock(&lock->lock);
234+
return pthread_rwlock_unlock((pthread_rwlock_t *)lock);
212235
#endif
213236
}
214237

src/common/platform/thread.h

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,6 @@ struct cr_cond {
4444
#endif
4545
};
4646

47-
struct cr_rwlock {
48-
#ifdef WINDOWS
49-
SRWLOCK lock;
50-
bool exclusive;
51-
#else
52-
pthread_rwlock_t lock;
53-
#endif
54-
};
55-
5647
typedef struct cr_thread cr_thread;
5748
dyn_array_def(cr_thread)
5849

@@ -80,7 +71,9 @@ int thread_cond_signal(struct cr_cond *cond);
8071

8172
int thread_cond_broadcast(struct cr_cond *cond);
8273

83-
int thread_rwlock_init(struct cr_rwlock *lock);
74+
struct cr_rwlock;
75+
76+
struct cr_rwlock *thread_rwlock_init(void);
8477
int thread_rwlock_destroy(struct cr_rwlock *lock);
8578
int thread_rwlock_rdlock(struct cr_rwlock *lock);
8679
int thread_rwlock_wrlock(struct cr_rwlock *lock);

src/lib/api/c-ray.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,10 @@ void bvh_build_task(void *arg) {
259259
return;
260260
}
261261
//!//!//!//!//!//!//!//!//!//!//!//!
262-
thread_rwlock_wrlock(&bt->scene->bvh_lock);
262+
thread_rwlock_wrlock(bt->scene->bvh_lock);
263263
struct bvh *old_bvh = bt->scene->meshes.items[bt->mesh_idx].bvh;
264264
bt->scene->meshes.items[bt->mesh_idx].bvh = bvh;
265-
thread_rwlock_unlock(&bt->scene->bvh_lock);
265+
thread_rwlock_unlock(bt->scene->bvh_lock);
266266
//!//!//!//!//!//!//!//!//!//!//!//!
267267
logr(debug, "BVH %s for %s (%lums)\n", old_bvh ? "updated" : "built", bt->mesh.name, ms);
268268
destroy_bvh(old_bvh);
@@ -312,9 +312,9 @@ cr_mesh cr_scene_mesh_new(struct cr_scene *s_ext, const char *name) {
312312
struct world *scene = (struct world *)s_ext;
313313
struct mesh new = { 0 };
314314
if (name) new.name = stringCopy(name);
315-
thread_rwlock_wrlock(&scene->bvh_lock);
315+
thread_rwlock_wrlock(scene->bvh_lock);
316316
cr_mesh idx = mesh_arr_add(&scene->meshes, new);
317-
thread_rwlock_unlock(&scene->bvh_lock);
317+
thread_rwlock_unlock(scene->bvh_lock);
318318
return idx;
319319
}
320320

src/lib/datatypes/scene.c

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,13 @@
1010
#include "scene.h"
1111

1212
#include <accelerators/bvh.h>
13+
#include <common/cr_string.h>
1314
#include <common/hashtable.h>
1415
#include <common/textbuffer.h>
1516
#include <common/dyn_array.h>
1617
#include <common/node_parse.h>
18+
#include <common/platform/capabilities.h>
19+
#include <common/platform/thread_pool.h>
1720
#include <common/texture.h>
1821
#include "camera.h"
1922
#include "tile.h"
@@ -25,6 +28,16 @@ void tex_asset_free(struct texture_asset *a) {
2528
if (a->t) tex_destroy(a->t);
2629
}
2730

31+
struct world *scene_new(void) {
32+
struct world *s = calloc(1, sizeof(*s));
33+
s->asset_path = stringCopy("./");
34+
s->storage.node_pool = newBlock(NULL, 1024);
35+
s->storage.node_table = newHashtable(compareNodes, &s->storage.node_pool);
36+
s->bvh_lock = thread_rwlock_init();
37+
s->bg_worker = thread_pool_create(sys_get_cores());
38+
return s;
39+
}
40+
2841
void scene_destroy(struct world *scene) {
2942
if (scene) {
3043
scene->textures.elem_free = tex_asset_free;
@@ -33,9 +46,12 @@ void scene_destroy(struct world *scene) {
3346
scene->meshes.elem_free = mesh_free;
3447
mesh_arr_free(&scene->meshes);
3548

36-
thread_rwlock_wrlock(&scene->bvh_lock);
49+
thread_rwlock_wrlock(scene->bvh_lock);
3750
destroy_bvh(scene->topLevel);
38-
thread_rwlock_unlock(&scene->bvh_lock);
51+
thread_rwlock_unlock(scene->bvh_lock);
52+
53+
thread_pool_destroy(scene->bg_worker);
54+
thread_rwlock_destroy(scene->bvh_lock);
3955

4056
destroyHashtable(scene->storage.node_table);
4157
destroyBlocks(scene->storage.node_pool);

src/lib/datatypes/scene.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ struct world {
3737
bool instances_dirty; // Recompute top-level BVH?
3838
// Top-level bounding volume hierarchy,
3939
// contains all 3D assets in the scene.
40-
struct cr_rwlock bvh_lock;
40+
struct cr_rwlock *bvh_lock;
4141
struct bvh *topLevel; // FIXME: Move to state?
4242
bool top_level_dirty;
4343
struct cr_thread_pool *bg_worker;
@@ -53,4 +53,5 @@ struct world {
5353
char *asset_path;
5454
};
5555

56+
struct world *scene_new(void);
5657
void scene_destroy(struct world *scene);

src/lib/protocol/protocol.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ struct world *deserialize_scene(const cJSON *in) {
771771
out->asset_path = stringCopy("./");
772772
out->storage.node_pool = newBlock(NULL, 1024);
773773
out->storage.node_table = newHashtable(compareNodes, &out->storage.node_pool);
774-
thread_rwlock_init(&out->bvh_lock);
774+
out->bvh_lock = thread_rwlock_init();
775775
out->bg_worker = thread_pool_create(sys_get_cores());
776776

777777
cJSON *asset_path = cJSON_GetObjectItem(in, "asset_path");
@@ -818,10 +818,10 @@ struct world *deserialize_scene(const cJSON *in) {
818818
if (cJSON_IsArray(meshes)) {
819819
cJSON *mesh = NULL;
820820
cJSON_ArrayForEach(mesh, meshes) {
821-
thread_rwlock_wrlock(&out->bvh_lock);
821+
thread_rwlock_wrlock(out->bvh_lock);
822822
cr_mesh idx = mesh_arr_add(&out->meshes, deserialize_mesh(mesh));
823823
cr_mesh_finalize((struct cr_scene *)out, idx);
824-
thread_rwlock_unlock(&out->bvh_lock);
824+
thread_rwlock_unlock(out->bvh_lock);
825825
}
826826
}
827827

src/lib/renderer/renderer.c

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,10 @@ void update_toplevel_bvh(struct world *s) {
130130
if (!s->top_level_dirty && s->topLevel) return;
131131
struct bvh *new = build_top_level_bvh(s->instances);
132132
//!//!//!//!//!//!//!//!//!//!//!//!
133-
thread_rwlock_wrlock(&s->bvh_lock);
133+
thread_rwlock_wrlock(s->bvh_lock);
134134
struct bvh *old = s->topLevel;
135135
s->topLevel = new;
136-
thread_rwlock_unlock(&s->bvh_lock);
136+
thread_rwlock_unlock(s->bvh_lock);
137137
//!//!//!//!//!//!//!//!//!//!//!//!
138138
destroy_bvh(old);
139139
// Bind shader buffers to instances
@@ -373,9 +373,9 @@ void *render_thread_interactive(void *arg) {
373373
sampler_init(sampler, SAMPLING_STRATEGY, r->state.finishedPasses, r->prefs.sampleCount, pixIdx);
374374

375375
struct color output = tex_get_px(*buf, x, y, false);
376-
thread_rwlock_rdlock(&r->scene->bvh_lock);
376+
thread_rwlock_rdlock(r->scene->bvh_lock);
377377
struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
378-
thread_rwlock_unlock(&r->scene->bvh_lock);
378+
thread_rwlock_unlock(r->scene->bvh_lock);
379379

380380
nan_clamp(&sample, &output);
381381

@@ -453,9 +453,9 @@ void *render_thread(void *arg) {
453453
sampler_init(sampler, SAMPLING_STRATEGY, samples - 1, r->prefs.sampleCount, pixIdx);
454454

455455
struct color output = tex_get_px(*buf, x, y, false);
456-
thread_rwlock_rdlock(&r->scene->bvh_lock);
456+
thread_rwlock_rdlock(r->scene->bvh_lock);
457457
struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
458-
thread_rwlock_unlock(&r->scene->bvh_lock);
458+
thread_rwlock_unlock(r->scene->bvh_lock);
459459

460460
// Clamp out fireflies - This is probably not a good way to do that.
461461
nan_clamp(&sample, &output);
@@ -527,9 +527,9 @@ void *render_single_iteration(void *arg) {
527527
sampler_init(sampler, SAMPLING_STRATEGY, samples - 1, r->prefs.sampleCount, pixIdx);
528528

529529
struct color output = tex_get_px(*buf, x, y, false);
530-
thread_rwlock_rdlock(&r->scene->bvh_lock);
530+
thread_rwlock_rdlock(r->scene->bvh_lock);
531531
struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
532-
thread_rwlock_unlock(&r->scene->bvh_lock);
532+
thread_rwlock_unlock(r->scene->bvh_lock);
533533

534534
// Clamp out fireflies - This is probably not a good way to do that.
535535
nan_clamp(&sample, &output);
@@ -578,20 +578,13 @@ struct renderer *renderer_new(void) {
578578
struct renderer *r = calloc(1, sizeof(*r));
579579
r->prefs = default_prefs();
580580
r->state.finishedPasses = 1;
581-
582-
// Move these elsewhere
583-
r->scene = calloc(1, sizeof(*r->scene));
584-
r->scene->asset_path = stringCopy("./");
585-
r->scene->storage.node_pool = newBlock(NULL, 1024);
586-
r->scene->storage.node_table = newHashtable(compareNodes, &r->scene->storage.node_pool);
587-
thread_rwlock_init(&r->scene->bvh_lock);
588-
r->scene->bg_worker = thread_pool_create(sys_get_cores());
581+
r->scene = scene_new();
589582
return r;
590583
}
591584

592585
void renderer_destroy(struct renderer *r) {
593-
if (!r) return;
594-
thread_pool_destroy(r->scene->bg_worker);
586+
if (!r)
587+
return;
595588
scene_destroy(r->scene);
596589
worker_arr_free(&r->state.workers);
597590
render_client_arr_free(&r->state.clients);

0 commit comments

Comments
 (0)