common: Fix false sharing in thread_rwlock_*() wrapper

vkoskiv · vkoskiv · commit 6116595ca1e0 · 2025-08-14T02:02:34.000+03:00
I only fully grokked false sharing fairly recently, and I now have access to better hardware, which also made this issue more prominent. I wrote the initial *nix + Windows rwlock wrapper last year when I was working on making sure scene updates work while the renderer is running in interactive mode (10d965d). Moving the bvh_lock rwlock out of the shared scene struct into a separate heap allocation has a significant effect on performance on a 32c/64t AMD Epyc 9374F: Before: vkoskiv@Triton:~/c-ray$ hyperfine 'bin/c-ray input/hdr.json -s 128 --no-sdl -j 64' Benchmark 1: bin/c-ray input/hdr.json -s 128 --no-sdl -j 64 Time (mean ± σ): 16.680 s ± 0.036 s [User: 972.341 s, System: 0.199 s] Range (min … max): 16.621 s … 16.751 s 10 runs After: vkoskiv@Triton:~/c-ray$ hyperfine 'bin/c-ray input/hdr.json -s 128 --no-sdl -j 64' Benchmark 1: bin/c-ray input/hdr.json -s 128 --no-sdl -j 64 Time (mean ± σ): 9.278 s ± 0.028 s [User: 516.252 s, System: 0.189 s] Range (min … max): 9.244 s … 9.336 s 10 runs I didn't notice this issue when I tested it last year, since the performance degradation is much less pronounced on my 4c/8t CPU at home: Before: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 32' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 32 Time (mean ± σ): 7.363 s ± 0.074 s [User: 49.317 s, System: 0.146 s] Range (min … max): 7.271 s … 7.515 s 10 runs After: > hyperfine 'bin/c-ray input/hdr.json --no-sdl -s 32' Benchmark 1: bin/c-ray input/hdr.json --no-sdl -s 32 Time (mean ± σ): 7.359 s ± 0.126 s [User: 49.181 s, System: 0.130 s] Range (min … max): 7.220 s … 7.598 s 10 runs
diff --git a/src/common/platform/thread.c b/src/common/platform/thread.c
@@ -155,50 +155,73 @@ int thread_cond_broadcast(struct cr_cond *cond) {
 #endif
 }
 
-int thread_rwlock_init(struct cr_rwlock *lock) {
-	if (!lock) return -1;
 #ifdef WINDOWS
+/*
+	Windows will presumably still suffer from false sharing, but
+	their API is what it is :/
+*/
+struct cr_rwlock {
+	SRWLOCK lock;
+	bool exclusive; // set by thread_rwlock_wrlock(), checked and cleared by thread_rwlock_unlock()
+};
+#endif
+
+struct cr_rwlock *thread_rwlock_init(void) {
+#ifdef WINDOWS
+	struct cr_rwlock *lock = malloc(sizeof(*lock));
+	if (!lock) return NULL; // out of memory: report it the same way as the POSIX failure path below
 	InitializeSRWLock(&lock->lock);
 	lock->exclusive = false;
-	return 0;
+	return lock;
 #else
-	return pthread_rwlock_init(&lock->lock, NULL);
+	pthread_rwlock_t *lock = malloc(sizeof(*lock));
+	if (!lock || pthread_rwlock_init(lock, NULL)) { // allocation or initialization failed
+		free(lock); // free(NULL) is a no-op, so this covers both failures
+		return NULL;
+	}
+	return (struct cr_rwlock *)lock; // callers only ever see the opaque handle type
 #endif
 }
 
 int thread_rwlock_destroy(struct cr_rwlock *lock) {
-	if (!lock) return -1;
+	if (!lock)
+		return -1;
 #ifdef WINDOWS
-	(void)lock;
+	free(lock);
 	return 0;
 #else
-	return pthread_rwlock_destroy(&lock->lock);
+	int ret = pthread_rwlock_destroy((pthread_rwlock_t *)lock);
+	free(lock); // the handle is heap-allocated by thread_rwlock_init(), so release it either way
+	return ret; // as before this change, report what pthread_rwlock_destroy() returned
 #endif
 }
 
 int thread_rwlock_rdlock(struct cr_rwlock *lock) {
-	if (!lock) return -1;
+	if (!lock)
+		return -1; // no lock to take, e.g. thread_rwlock_init() returned NULL
 #ifdef WINDOWS
 	AcquireSRWLockShared(&lock->lock);
 	return 0;
 #else
-	return pthread_rwlock_rdlock(&lock->lock);
+	return pthread_rwlock_rdlock((pthread_rwlock_t *)lock); // on this path the handle is the pthread_rwlock_t allocated in thread_rwlock_init()
 #endif
 }
 
 int thread_rwlock_wrlock(struct cr_rwlock *lock) {
-	if (!lock) return -1;
+	if (!lock)
+		return -1; // no lock to take, e.g. thread_rwlock_init() returned NULL
 #ifdef WINDOWS
 	AcquireSRWLockExclusive(&lock->lock);
 	lock->exclusive = true;
 	return 0;
 #else
-	return pthread_rwlock_wrlock(&lock->lock);
+	return pthread_rwlock_wrlock((pthread_rwlock_t *)lock); // on this path the handle is the pthread_rwlock_t allocated in thread_rwlock_init()
 #endif
 }
 
 int thread_rwlock_unlock(struct cr_rwlock *lock) {
-	if (!lock) return -1;
+	if (!lock)
+		return -1;
 #ifdef WINDOWS
 	if (lock->exclusive) {
 		lock->exclusive = false;
@@ -208,7 +231,7 @@ int thread_rwlock_unlock(struct cr_rwlock *lock) {
 	}
 	return 0;
 #else
-	return pthread_rwlock_unlock(&lock->lock);
+	return pthread_rwlock_unlock((pthread_rwlock_t *)lock);
 #endif
 }
 
diff --git a/src/common/platform/thread.h b/src/common/platform/thread.h
@@ -44,15 +44,6 @@ struct cr_cond {
 #endif
 };
 
-struct cr_rwlock {
-#ifdef WINDOWS
-	SRWLOCK lock;
-	bool exclusive;
-#else
-	pthread_rwlock_t lock;
-#endif
-};
-
 typedef struct cr_thread cr_thread;
 dyn_array_def(cr_thread)
 
@@ -80,7 +71,9 @@ int thread_cond_signal(struct cr_cond *cond);
 
 int thread_cond_broadcast(struct cr_cond *cond);
 
-int thread_rwlock_init(struct cr_rwlock *lock);
+struct cr_rwlock;
+
+struct cr_rwlock *thread_rwlock_init(void); // returns a heap-allocated lock (NULL if initialization fails); release it with thread_rwlock_destroy()
 int thread_rwlock_destroy(struct cr_rwlock *lock);
 int thread_rwlock_rdlock(struct cr_rwlock *lock);
 int thread_rwlock_wrlock(struct cr_rwlock *lock);
diff --git a/src/lib/api/c-ray.c b/src/lib/api/c-ray.c
@@ -259,10 +259,10 @@ void bvh_build_task(void *arg) {
 		return;
 	}
 	//!//!//!//!//!//!//!//!//!//!//!//!
-	thread_rwlock_wrlock(&bt->scene->bvh_lock);
+	thread_rwlock_wrlock(bt->scene->bvh_lock);
 	struct bvh *old_bvh = bt->scene->meshes.items[bt->mesh_idx].bvh;
 	bt->scene->meshes.items[bt->mesh_idx].bvh = bvh;
-	thread_rwlock_unlock(&bt->scene->bvh_lock);
+	thread_rwlock_unlock(bt->scene->bvh_lock);
 	//!//!//!//!//!//!//!//!//!//!//!//!
 	logr(debug, "BVH %s for %s (%lums)\n", old_bvh ? "updated" : "built", bt->mesh.name, ms);
 	destroy_bvh(old_bvh);
@@ -312,9 +312,9 @@ cr_mesh cr_scene_mesh_new(struct cr_scene *s_ext, const char *name) {
 	struct world *scene = (struct world *)s_ext;
 	struct mesh new = { 0 };
 	if (name) new.name = stringCopy(name);
-	thread_rwlock_wrlock(&scene->bvh_lock);
+	thread_rwlock_wrlock(scene->bvh_lock);
 	cr_mesh idx = mesh_arr_add(&scene->meshes, new);
-	thread_rwlock_unlock(&scene->bvh_lock);
+	thread_rwlock_unlock(scene->bvh_lock);
 	return idx;
 }
 
diff --git a/src/lib/datatypes/scene.c b/src/lib/datatypes/scene.c
@@ -10,10 +10,13 @@
 #include "scene.h"
 
 #include <accelerators/bvh.h>
+#include <common/cr_string.h>
 #include <common/hashtable.h>
 #include <common/textbuffer.h>
 #include <common/dyn_array.h>
 #include <common/node_parse.h>
+#include <common/platform/capabilities.h>
+#include <common/platform/thread_pool.h>
 #include <common/texture.h>
 #include "camera.h"
 #include "tile.h"
@@ -25,6 +28,16 @@ void tex_asset_free(struct texture_asset *a) {
 	if (a->t) tex_destroy(a->t);
 }
 
+struct world *scene_new(void) {
+	struct world *scene = calloc(1, sizeof *scene); // zero-filled: fields not assigned below stay 0/NULL
+	scene->asset_path = stringCopy("./");
+	scene->storage.node_pool = newBlock(NULL, 1024);
+	scene->storage.node_table = newHashtable(compareNodes, &scene->storage.node_pool);
+	scene->bvh_lock = thread_rwlock_init(); // heap-allocated; released by thread_rwlock_destroy() in scene_destroy()
+	scene->bg_worker = thread_pool_create(sys_get_cores()); // torn down by thread_pool_destroy() in scene_destroy()
+	return scene;
+}
+
 void scene_destroy(struct world *scene) {
 	if (scene) {
 		scene->textures.elem_free = tex_asset_free;
@@ -33,9 +46,12 @@ void scene_destroy(struct world *scene) {
 		scene->meshes.elem_free = mesh_free;
 		mesh_arr_free(&scene->meshes);
 
-		thread_rwlock_wrlock(&scene->bvh_lock);
+		thread_rwlock_wrlock(scene->bvh_lock);
 		destroy_bvh(scene->topLevel);
-		thread_rwlock_unlock(&scene->bvh_lock);
+		thread_rwlock_unlock(scene->bvh_lock);
+
+		thread_pool_destroy(scene->bg_worker);
+		thread_rwlock_destroy(scene->bvh_lock);
 
 		destroyHashtable(scene->storage.node_table);
 		destroyBlocks(scene->storage.node_pool);
diff --git a/src/lib/datatypes/scene.h b/src/lib/datatypes/scene.h
@@ -37,7 +37,7 @@ struct world {
 	bool instances_dirty; // Recompute top-level BVH?
 	// Top-level bounding volume hierarchy,
 	// contains all 3D assets in the scene.
-	struct cr_rwlock bvh_lock;
+	struct cr_rwlock *bvh_lock;
 	struct bvh *topLevel; // FIXME: Move to state?
 	bool top_level_dirty;
 	struct cr_thread_pool *bg_worker;
@@ -53,4 +53,5 @@ struct world {
 	char *asset_path;
 };
 
+struct world *scene_new(void);
 void scene_destroy(struct world *scene);
diff --git a/src/lib/protocol/protocol.c b/src/lib/protocol/protocol.c
@@ -771,7 +771,7 @@ struct world *deserialize_scene(const cJSON *in) {
 	out->asset_path = stringCopy("./");
 	out->storage.node_pool = newBlock(NULL, 1024);
 	out->storage.node_table = newHashtable(compareNodes, &out->storage.node_pool);
-	thread_rwlock_init(&out->bvh_lock);
+	out->bvh_lock = thread_rwlock_init();
 	out->bg_worker = thread_pool_create(sys_get_cores());
 
 	cJSON *asset_path = cJSON_GetObjectItem(in, "asset_path");
@@ -818,10 +818,10 @@ struct world *deserialize_scene(const cJSON *in) {
 	if (cJSON_IsArray(meshes)) {
 		cJSON *mesh = NULL;
 		cJSON_ArrayForEach(mesh, meshes) {
-			thread_rwlock_wrlock(&out->bvh_lock);
+			thread_rwlock_wrlock(out->bvh_lock);
 			cr_mesh idx = mesh_arr_add(&out->meshes, deserialize_mesh(mesh));
 			cr_mesh_finalize((struct cr_scene *)out, idx);
-			thread_rwlock_unlock(&out->bvh_lock);
+			thread_rwlock_unlock(out->bvh_lock);
 		}
 	}
 
diff --git a/src/lib/renderer/renderer.c b/src/lib/renderer/renderer.c
@@ -130,10 +130,10 @@ void update_toplevel_bvh(struct world *s) {
 	if (!s->top_level_dirty && s->topLevel) return;
 	struct bvh *new = build_top_level_bvh(s->instances);
 	//!//!//!//!//!//!//!//!//!//!//!//!
-	thread_rwlock_wrlock(&s->bvh_lock);
+	thread_rwlock_wrlock(s->bvh_lock);
 	struct bvh *old = s->topLevel;
 	s->topLevel = new;
-	thread_rwlock_unlock(&s->bvh_lock);
+	thread_rwlock_unlock(s->bvh_lock);
 	//!//!//!//!//!//!//!//!//!//!//!//!
 	destroy_bvh(old);
 	// Bind shader buffers to instances
@@ -373,9 +373,9 @@ void *render_thread_interactive(void *arg) {
 				sampler_init(sampler, SAMPLING_STRATEGY, r->state.finishedPasses, r->prefs.sampleCount, pixIdx);
 				
 				struct color output = tex_get_px(*buf, x, y, false);
-				thread_rwlock_rdlock(&r->scene->bvh_lock);
+				thread_rwlock_rdlock(r->scene->bvh_lock);
 				struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
-				thread_rwlock_unlock(&r->scene->bvh_lock);
+				thread_rwlock_unlock(r->scene->bvh_lock);
 
 				nan_clamp(&sample, &output);
 				
@@ -453,9 +453,9 @@ void *render_thread(void *arg) {
 					sampler_init(sampler, SAMPLING_STRATEGY, samples - 1, r->prefs.sampleCount, pixIdx);
 					
 					struct color output = tex_get_px(*buf, x, y, false);
-					thread_rwlock_rdlock(&r->scene->bvh_lock);
+					thread_rwlock_rdlock(r->scene->bvh_lock);
 					struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
-					thread_rwlock_unlock(&r->scene->bvh_lock);
+					thread_rwlock_unlock(r->scene->bvh_lock);
 					
 					// Clamp out fireflies - This is probably not a good way to do that.
 					nan_clamp(&sample, &output);
@@ -527,9 +527,9 @@ void *render_single_iteration(void *arg) {
 				sampler_init(sampler, SAMPLING_STRATEGY, samples - 1, r->prefs.sampleCount, pixIdx);
 
 				struct color output = tex_get_px(*buf, x, y, false);
-				thread_rwlock_rdlock(&r->scene->bvh_lock);
+				thread_rwlock_rdlock(r->scene->bvh_lock);
 				struct color sample = path_trace(cam_get_ray(cam, x, y, sampler), r->scene, r->prefs.bounces, sampler);
-				thread_rwlock_unlock(&r->scene->bvh_lock);
+				thread_rwlock_unlock(r->scene->bvh_lock);
 
 				// Clamp out fireflies - This is probably not a good way to do that.
 				nan_clamp(&sample, &output);
@@ -578,20 +578,13 @@ struct renderer *renderer_new(void) {
 	struct renderer *r = calloc(1, sizeof(*r));
 	r->prefs = default_prefs();
 	r->state.finishedPasses = 1;
-	
-	// Move these elsewhere
-	r->scene = calloc(1, sizeof(*r->scene));
-	r->scene->asset_path = stringCopy("./");
-	r->scene->storage.node_pool = newBlock(NULL, 1024);
-	r->scene->storage.node_table = newHashtable(compareNodes, &r->scene->storage.node_pool);
-	thread_rwlock_init(&r->scene->bvh_lock);
-	r->scene->bg_worker = thread_pool_create(sys_get_cores());
+	r->scene = scene_new();
 	return r;
 }
 
 void renderer_destroy(struct renderer *r) {
-	if (!r) return;
-	thread_pool_destroy(r->scene->bg_worker);
+	if (!r)
+		return;
 	scene_destroy(r->scene);
 	worker_arr_free(&r->state.workers);
 	render_client_arr_free(&r->state.clients);