@@ -332,6 +332,23 @@ void VkCompute::record_write_timestamp(uint32_t query)
332
332
}
333
333
#endif // NCNN_BENCHMARK
334
334
335
+ void VkCompute::record_queue_transfer_acquire (const VkMat& m, uint32_t src_queue_family_index)
336
+ {
337
+ if (queue_family_index == src_queue_family_index)
338
+ return ;
339
+
340
+ if (vkdev->info .support_VK_KHR_push_descriptor )
341
+ return queue_transfer_acquire_barrier (m.buffer (), m.buffer_offset (), m.total () * m.elemsize , src_queue_family_index);
342
+
343
+ record_type r;
344
+ r.type = 16 ;
345
+ r.queue_transfer_acquire_barrier .buffer = m.buffer ();
346
+ r.queue_transfer_acquire_barrier .offset = m.buffer_offset ();
347
+ r.queue_transfer_acquire_barrier .size = m.total () * m.elemsize ;
348
+ r.queue_transfer_acquire_barrier .src_queue_family_index = src_queue_family_index;
349
+ delayed_records.push_back (r);
350
+ }
351
+
335
352
#if __ANDROID_API__ >= 26
336
353
void VkCompute::record_import_android_hardware_buffer (const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m)
337
354
{
@@ -870,6 +887,9 @@ int VkCompute::submit_and_wait()
870
887
case 15 :
871
888
compute_host_barrier (r.compute_host_barrier .buffer , r.compute_host_barrier .offset , r.compute_host_barrier .size );
872
889
break ;
890
+ case 16 :
891
+ queue_transfer_acquire_barrier (r.queue_transfer_acquire_barrier .buffer , r.queue_transfer_acquire_barrier .offset , r.queue_transfer_acquire_barrier .size , r.queue_transfer_acquire_barrier .src_queue_family_index );
892
+ break ;
873
893
}
874
894
}
875
895
@@ -1180,6 +1200,27 @@ void VkCompute::compute_host_barrier(VkBuffer buffer, size_t offset, size_t size
1180
1200
vkCmdPipelineBarrier (command_buffer, srcStageMask, dstStageMask, 0 , 0 , 0 , 1 , &bufferBarrier, 0 , 0 );
1181
1201
}
1182
1202
1203
+ void VkCompute::queue_transfer_acquire_barrier (VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index)
1204
+ {
1205
+ // fprintf(stderr, "cmd queue_transfer_acquire_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, src_queue_family_index, queue_family_index);
1206
+
1207
+ VkBufferMemoryBarrier bufferBarrier;
1208
+ bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1209
+ bufferBarrier.pNext = 0 ;
1210
+ bufferBarrier.srcAccessMask = 0 ;
1211
+ bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
1212
+ bufferBarrier.srcQueueFamilyIndex = src_queue_family_index;
1213
+ bufferBarrier.dstQueueFamilyIndex = queue_family_index;
1214
+ bufferBarrier.buffer = buffer;
1215
+ bufferBarrier.offset = offset;
1216
+ bufferBarrier.size = size;
1217
+
1218
+ VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1219
+ VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
1220
+
1221
+ vkCmdPipelineBarrier (command_buffer, srcStageMask, dstStageMask, 0 , 0 , 0 , 1 , &bufferBarrier, 0 , 0 );
1222
+ }
1223
+
1183
1224
void VkCompute::initial_image_compute_barrier (VkImage image)
1184
1225
{
1185
1226
// fprintf(stderr, "cmd initial_image_compute_barrier %p %lu %lu\n", image, oldlayout, newlayout);
@@ -1275,9 +1316,13 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt)
1275
1316
// set weight blob as readonly
1276
1317
dst.data ->state = 4 ;
1277
1318
1278
- if (dst.allocator ->mappable )
1319
+ // we can skip queue transfer and staging buffer allocation
1320
+ // only on unified memory architecture and unified compute/transfer queue
1321
+ // which is usually the case on integrated gpu / cpu
1322
+ if (dst.allocator ->mappable && queue_family_index == vkdev->info .compute_queue_family_index )
1279
1323
{
1280
1324
dst.upload (src_flattened);
1325
+
1281
1326
return ;
1282
1327
}
1283
1328
@@ -1317,6 +1362,8 @@ int VkTransfer::submit_and_wait()
1317
1362
mapped_ptr_offset += alignSize (r.size , buffer_offset_alignment);
1318
1363
}
1319
1364
1365
+ staging_vkallocator->flush (staging_data);
1366
+
1320
1367
begin_command_buffer ();
1321
1368
1322
1369
// fprintf(stderr, "cmd transfer %p %lu\n", staging_data->buffer, staging_buffer_size);
@@ -1332,32 +1379,32 @@ int VkTransfer::submit_and_wait()
1332
1379
staging_buffer_offset += alignSize (r.size , buffer_offset_alignment);
1333
1380
}
1334
1381
1335
- // // finish TODO queue owner transfer release
1336
- // std::vector<VkBufferMemoryBarrier> bufferBarriers(transfer_count);
1337
- // for (int i=0; i<transfer_count; i++)
1338
- // {
1339
- // const record_type& r = delayed_records[i];
1340
- //
1341
- // bufferBarriers[i].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1342
- // bufferBarriers[i].pNext = 0;
1343
- // bufferBarriers[i].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
1344
- // bufferBarriers[i].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
1345
- // bufferBarriers[i].srcQueueFamilyIndex = queue_family_index;
1346
- // bufferBarriers[i].dstQueueFamilyIndex = vkdev->info.compute_queue_family_index;
1347
- // bufferBarriers[i].buffer = r.vkmat.buffer();
1348
- // bufferBarriers[i].offset = r.vkmat.buffer_offset();
1349
- // bufferBarriers[i].size = r.size;
1350
- // }
1351
- //
1352
- // VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
1353
- // VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
1354
- //
1355
- // vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, transfer_count, bufferBarriers.data(), 0, 0);
1382
+ // owner transfer release
1383
+ for (int i=0 ; i<transfer_count; i++)
1384
+ {
1385
+ const record_type& r = delayed_records[i];
1386
+
1387
+ queue_transfer_release_barrier (r.vkmat .buffer (), r.vkmat .buffer_offset (), r.size , vkdev->info .compute_queue_family_index );
1388
+ }
1356
1389
1357
1390
end_command_buffer ();
1358
1391
1359
1392
int ret = queue_submit_and_wait_fence ();
1360
1393
1394
+ // compute queue owner transfer acquire
1395
+ {
1396
+ VkCompute cmd (vkdev);
1397
+
1398
+ for (int i=0 ; i<transfer_count; i++)
1399
+ {
1400
+ const record_type& r = delayed_records[i];
1401
+
1402
+ cmd.record_queue_transfer_acquire (r.vkmat , queue_family_index);
1403
+ }
1404
+
1405
+ cmd.submit_and_wait ();
1406
+ }
1407
+
1361
1408
// deallocate staging buffer
1362
1409
staging_vkallocator->fastFree (staging_data);
1363
1410
staging_data = 0 ;
@@ -1386,6 +1433,27 @@ void VkTransfer::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vect
1386
1433
vkCmdCopyBuffer (command_buffer, src, dst, regions.size (), regions.data ());
1387
1434
}
1388
1435
1436
+ void VkTransfer::queue_transfer_release_barrier (VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index)
1437
+ {
1438
+ // fprintf(stderr, "cmd queue_transfer_release_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, queue_family_index, dst_queue_family_index);
1439
+
1440
+ VkBufferMemoryBarrier bufferBarrier;
1441
+ bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1442
+ bufferBarrier.pNext = 0 ;
1443
+ bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
1444
+ bufferBarrier.dstAccessMask = 0 ;
1445
+ bufferBarrier.srcQueueFamilyIndex = queue_family_index;
1446
+ bufferBarrier.dstQueueFamilyIndex = dst_queue_family_index;
1447
+ bufferBarrier.buffer = buffer;
1448
+ bufferBarrier.offset = offset;
1449
+ bufferBarrier.size = size;
1450
+
1451
+ VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
1452
+ VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
1453
+
1454
+ vkCmdPipelineBarrier (command_buffer, srcStageMask, dstStageMask, 0 , 0 , 0 , 1 , &bufferBarrier, 0 , 0 );
1455
+ }
1456
+
1389
1457
} // namespace ncnn
1390
1458
1391
1459
#endif // NCNN_VULKAN
0 commit comments