Skip to content

Commit f524baf

Browse files
committed
use BufferSubData for uploading rf data on NVIDIA/AMD
It may seem counter-intuitive, and it goes against all of NVIDIA's presentations and other documentation, but this is the fastest way to upload the data. Unfortunately this doesn't work on Intel for some reason, so it gets a separate path. The separate path code is necessary anyway because we want to add CUDA support for NVIDIA.
1 parent c79b836 commit f524baf

File tree

3 files changed

+62
-25
lines changed

3 files changed

+62
-25
lines changed

beamformer.c

Lines changed: 36 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -69,12 +69,17 @@ alloc_shader_storage(BeamformerCtx *ctx, Arena a)
6969
glDeleteBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
7070
glGenBuffers(ARRAY_COUNT(cs->rf_data_ssbos), cs->rf_data_ssbos);
7171

72+
i32 storage_flags = GL_DYNAMIC_STORAGE_BIT;
73+
if (ctx->gl_vendor_id == GL_VENDOR_INTEL)
74+
storage_flags |= GL_MAP_WRITE_BIT;
7275
glDeleteBuffers(1, &cs->raw_data_ssbo);
73-
glGenBuffers(1, &cs->raw_data_ssbo);
76+
glCreateBuffers(1, &cs->raw_data_ssbo);
77+
glNamedBufferStorage(cs->raw_data_ssbo, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size, 0,
78+
storage_flags);
7479

75-
glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->raw_data_ssbo);
76-
glBufferStorage(GL_SHADER_STORAGE_BUFFER, ARRAY_COUNT(cs->raw_data_fences) * rf_raw_size,
77-
0, GL_MAP_WRITE_BIT);
80+
/* TODO: allow this to grow if the raw data has been resized */
81+
if (cs->raw_data_arena.beg == 0)
82+
cs->raw_data_arena = os_new_arena(rf_raw_size);
7883

7984
for (u32 i = 0; i < ARRAY_COUNT(cs->rf_data_ssbos); i++) {
8085
glBindBuffer(GL_SHADER_STORAGE_BUFFER, cs->rf_data_ssbos[i]);
@@ -538,7 +543,8 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
538543
BeamformerParameters *bp = &ctx->params->raw;
539544
/* NOTE: Check for and Load RF Data into GPU */
540545
if (os_poll_pipe(ctx->data_pipe)) {
541-
if (!uv4_equal(ctx->csctx.dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
546+
ComputeShaderCtx *cs = &ctx->csctx;
547+
if (!uv4_equal(cs->dec_data_dim, bp->dec_data_dim) || ctx->flags & ALLOC_SSBOS)
542548
alloc_shader_storage(ctx, arena);
543549
if (!uv4_equal(ctx->out_data_dim, bp->output_points) || ctx->flags & ALLOC_OUT_TEX)
544550
alloc_output_image(ctx);
@@ -547,31 +553,39 @@ do_beamformer(BeamformerCtx *ctx, Arena arena)
547553
/* NOTE: if this times out it means the command queue is more than 3 frames behind.
548554
* In that case we need to re-evaluate the buffer size */
549555
if (ctx->csctx.raw_data_fences[raw_index]) {
550-
i32 result = glClientWaitSync(ctx->csctx.raw_data_fences[raw_index], 0, 10000);
556+
i32 result = glClientWaitSync(cs->raw_data_fences[raw_index], 0, 10000);
551557
if (result == GL_TIMEOUT_EXPIRED) {
552558
//ASSERT(0);
553559
}
554-
glDeleteSync(ctx->csctx.raw_data_fences[raw_index]);
555-
ctx->csctx.raw_data_fences[raw_index] = NULL;
560+
glDeleteSync(cs->raw_data_fences[raw_index]);
561+
cs->raw_data_fences[raw_index] = NULL;
556562
}
557563

558-
uv2 rf_raw_dim = ctx->csctx.rf_raw_dim;
564+
uv2 rf_raw_dim = cs->rf_raw_dim;
559565
size rf_raw_size = rf_raw_dim.x * rf_raw_dim.y * sizeof(i16);
560566

561-
glBindBuffer(GL_SHADER_STORAGE_BUFFER, ctx->csctx.raw_data_ssbo);
562-
void *rf_data_buf = glMapBufferRange(GL_SHADER_STORAGE_BUFFER,
563-
raw_index * rf_raw_size, rf_raw_size,
564-
GL_MAP_UNSYNCHRONIZED_BIT | GL_MAP_WRITE_BIT);
565-
if (!rf_data_buf) {
566-
rlCheckErrors();
567-
ASSERT(0);
567+
if (ctx->gl_vendor_id == GL_VENDOR_INTEL) {
568+
/* TODO: intel complains about this buffer being busy even with
569+
* MAP_UNSYNCHRONIZED_BIT */
570+
void *rf_data_buf = glMapNamedBufferRange(cs->raw_data_ssbo,
571+
raw_index * rf_raw_size,
572+
rf_raw_size,
573+
GL_MAP_WRITE_BIT);
574+
size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
575+
glUnmapNamedBuffer(cs->raw_data_ssbo);
576+
if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
577+
else ctx->partial_transfer_count++;
578+
} else {
579+
void *rf_data_buf = cs->raw_data_arena.beg + raw_index * rf_raw_size;
580+
size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
581+
if (rlen == rf_raw_size) {
582+
ctx->flags |= DO_COMPUTE;
583+
glNamedBufferSubData(cs->raw_data_ssbo, raw_index * rf_raw_size,
584+
rf_raw_size, rf_data_buf);
585+
} else {
586+
ctx->partial_transfer_count++;
587+
}
568588
}
569-
size rlen = os_read_pipe_data(ctx->data_pipe, rf_data_buf, rf_raw_size);
570-
571-
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
572-
573-
if (rlen == rf_raw_size) ctx->flags |= DO_COMPUTE;
574-
else ctx->partial_transfer_count++;
575589
}
576590

577591
if (ctx->flags & UPLOAD_FILTER)

beamformer.h

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,12 @@ enum program_flags {
6060
DO_COMPUTE = 1 << 30,
6161
};
6262

63+
enum gl_vendor_ids {
64+
GL_VENDOR_AMD,
65+
GL_VENDOR_INTEL,
66+
GL_VENDOR_NVIDIA,
67+
};
68+
6369
typedef struct {
6470
char buf[64];
6571
i32 buf_len;
@@ -98,13 +104,17 @@ typedef struct {
98104
GLsync timer_fence;
99105
f32 last_frame_time[CS_LAST];
100106

101-
/* NOTE: multiple raw data SSBOs for unsynchronized mapping.
102-
* Decoded data is only relavent in the context of a single frame, two are
103-
* used so that they can be swapped when chaining multiple compute stages */
107+
/* NOTE: the raw_data_ssbo is allocated at 3x the required size to allow for tiled
108+
* transfers when the GPU is running behind the CPU. It is not mapped because NVIDIA's
109+
* drivers _will_ store the buffer in the system memory in that case (this doesn't happen
110+
* for Intel or AMD). Instead BufferSubData is used to update the correct subrange */
104111
GLsync raw_data_fences[3];
112+
Arena raw_data_arena;
105113
u32 raw_data_ssbo;
106114
u32 raw_data_index;
107115

116+
/* NOTE: Decoded data is only relevant in the context of a single frame. We use two
117+
* buffers so that they can be swapped when chaining multiple compute stages */
108118
u32 rf_data_ssbos[2];
109119
u32 last_output_ssbo_index;
110120
u32 hadamard_ssbo;
@@ -134,6 +144,7 @@ typedef struct {
134144
typedef struct {
135145
uv2 window_size;
136146
u32 flags;
147+
enum gl_vendor_ids gl_vendor_id;
137148

138149
f32 dt;
139150

main.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,18 @@ main(void)
185185

186186
ctx.params->raw.output_points = ctx.out_data_dim;
187187

188+
/* NOTE: Determine which graphics vendor we are running on */
189+
{
190+
const u8 *vendor = glGetString(GL_VENDOR);
191+
if (!vendor)
192+
die("Failed to determine GL Vendor\n");
193+
switch (vendor[0]) {
194+
case 'A': ctx.gl_vendor_id = GL_VENDOR_AMD; break;
195+
case 'I': ctx.gl_vendor_id = GL_VENDOR_INTEL; break;
196+
case 'N': ctx.gl_vendor_id = GL_VENDOR_NVIDIA; break;
197+
default: die("Unknown GL Vendor: %s\n", vendor); break;
198+
}
199+
}
188200

189201
/* NOTE: set up OpenGL debug logging */
190202
glDebugMessageCallback(gl_debug_logger, NULL);

0 commit comments

Comments
 (0)