Skip to content

Commit 928d1c7

Browse files
authored
Merge pull request #7 from Tencent/master
merge ncnn
2 parents fa5f02c + 6f2ef19 commit 928d1c7

39 files changed

+4273
-2811
lines changed

benchmark/benchncnn.cpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,7 @@ int main(int argc, char** argv)
221221
if (!use_vulkan_compute)
222222
#endif // NCNN_VULKAN
223223
{
224-
opt.use_packing_layout = false;
225224
benchmark("squeezenet_int8", ncnn::Mat(227, 227, 3), opt);
226-
opt.use_packing_layout = true;
227225
}
228226

229227
benchmark("mobilenet", ncnn::Mat(224, 224, 3), opt);
@@ -232,9 +230,7 @@ int main(int argc, char** argv)
232230
if (!use_vulkan_compute)
233231
#endif // NCNN_VULKAN
234232
{
235-
opt.use_packing_layout = false;
236233
benchmark("mobilenet_int8", ncnn::Mat(224, 224, 3), opt);
237-
opt.use_packing_layout = true;
238234
}
239235

240236
benchmark("mobilenet_v2", ncnn::Mat(224, 224, 3), opt);
@@ -260,9 +256,7 @@ int main(int argc, char** argv)
260256
if (!use_vulkan_compute)
261257
#endif // NCNN_VULKAN
262258
{
263-
opt.use_packing_layout = false;
264259
benchmark("googlenet_int8", ncnn::Mat(224, 224, 3), opt);
265-
opt.use_packing_layout = true;
266260
}
267261

268262
benchmark("resnet18", ncnn::Mat(224, 224, 3), opt);
@@ -271,9 +265,7 @@ int main(int argc, char** argv)
271265
if (!use_vulkan_compute)
272266
#endif // NCNN_VULKAN
273267
{
274-
opt.use_packing_layout = false;
275268
benchmark("resnet18_int8", ncnn::Mat(224, 224, 3), opt);
276-
opt.use_packing_layout = true;
277269
}
278270

279271
benchmark("alexnet", ncnn::Mat(227, 227, 3), opt);
@@ -284,9 +276,7 @@ int main(int argc, char** argv)
284276
if (!use_vulkan_compute)
285277
#endif // NCNN_VULKAN
286278
{
287-
opt.use_packing_layout = false;
288279
benchmark("vgg16_int8", ncnn::Mat(224, 224, 3), opt);
289-
opt.use_packing_layout = true;
290280
}
291281

292282
benchmark("resnet50", ncnn::Mat(224, 224, 3), opt);
@@ -295,9 +285,7 @@ int main(int argc, char** argv)
295285
if (!use_vulkan_compute)
296286
#endif // NCNN_VULKAN
297287
{
298-
opt.use_packing_layout = false;
299288
benchmark("resnet50_int8", ncnn::Mat(224, 224, 3), opt);
300-
opt.use_packing_layout = true;
301289
}
302290

303291
benchmark("squeezenet_ssd", ncnn::Mat(300, 300, 3), opt);
@@ -306,9 +294,7 @@ int main(int argc, char** argv)
306294
if (!use_vulkan_compute)
307295
#endif // NCNN_VULKAN
308296
{
309-
opt.use_packing_layout = false;
310297
benchmark("squeezenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
311-
opt.use_packing_layout = true;
312298
}
313299

314300
benchmark("mobilenet_ssd", ncnn::Mat(300, 300, 3), opt);
@@ -317,9 +303,7 @@ int main(int argc, char** argv)
317303
if (!use_vulkan_compute)
318304
#endif // NCNN_VULKAN
319305
{
320-
opt.use_packing_layout = false;
321306
benchmark("mobilenet_ssd_int8", ncnn::Mat(300, 300, 3), opt);
322-
opt.use_packing_layout = true;
323307
}
324308

325309
benchmark("mobilenet_yolo", ncnn::Mat(416, 416, 3), opt);

src/allocator.cpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -508,7 +508,7 @@ VkBufferMemory* VkBlobBufferAllocator::fastMalloc(size_t size)
508508
if (vkdev->info.type == 1)
509509
{
510510
// integrated gpu, prefer unified memory
511-
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
511+
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
512512
}
513513
else
514514
{
@@ -758,7 +758,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
758758
if (vkdev->info.type == 1)
759759
{
760760
// integrated gpu, prefer unified memory
761-
memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
761+
memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
762762
}
763763
else
764764
{
@@ -805,7 +805,7 @@ VkBufferMemory* VkWeightBufferAllocator::fastMalloc(size_t size)
805805
if (vkdev->info.type == 1)
806806
{
807807
// integrated gpu, prefer unified memory
808-
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, 0);
808+
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0);
809809
}
810810
else
811811
{
@@ -929,9 +929,7 @@ VkBufferMemory* VkStagingBufferAllocator::fastMalloc(size_t size)
929929
// setup memory type
930930
if (memory_type_index == (uint32_t)-1)
931931
{
932-
// integrated gpu, prefer unified memory
933-
// discrete gpu, prefer the small pcie mappable memory, or fallback to host visible only anyway otherwise
934-
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
932+
memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
935933
}
936934

937935
ptr->memory = allocate_memory(memoryRequirements.size);
@@ -1113,7 +1111,7 @@ VkImageMemory* VkSimpleImageAllocator::fastMalloc(int width, int height, VkForma
11131111
{
11141112
VkImageMemory* ptr = new VkImageMemory;
11151113

1116-
ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT);
1114+
ptr->image = create_image(width, height, format, VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT);
11171115

11181116
VkMemoryRequirements memoryRequirements;
11191117
vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements);

src/command.cpp

Lines changed: 90 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,23 @@ void VkCompute::record_write_timestamp(uint32_t query)
332332
}
333333
#endif // NCNN_BENCHMARK
334334

335+
// Record a queue-family ownership acquire for the buffer range backing m.
//
// m                       matrix whose buffer(), buffer_offset() and
//                         total() * elemsize describe the range to acquire
// src_queue_family_index  queue family the range is being acquired from
//
// Nothing is recorded when src_queue_family_index equals this command's
// queue_family_index.
void VkCompute::record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index)
336+
{
337+
// same queue family on both sides, nothing to acquire
if (queue_family_index == src_queue_family_index)
338+
return;
339+
340+
// when VK_KHR_push_descriptor is supported the barrier is issued right away
// instead of being stored as a delayed record
// NOTE(review): presumably commands are recorded immediately in that mode - confirm
if (vkdev->info.support_VK_KHR_push_descriptor)
341+
return queue_transfer_acquire_barrier(m.buffer(), m.buffer_offset(), m.total() * m.elemsize, src_queue_family_index);
342+
343+
// otherwise store a delayed record; submit_and_wait() dispatches type 16
// to queue_transfer_acquire_barrier() with the fields filled in below
record_type r;
344+
r.type = 16;
345+
r.queue_transfer_acquire_barrier.buffer = m.buffer();
346+
r.queue_transfer_acquire_barrier.offset = m.buffer_offset();
347+
r.queue_transfer_acquire_barrier.size = m.total() * m.elemsize;
348+
r.queue_transfer_acquire_barrier.src_queue_family_index = src_queue_family_index;
349+
delayed_records.push_back(r);
350+
}
351+
335352
#if __ANDROID_API__ >= 26
336353
void VkCompute::record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m)
337354
{
@@ -870,6 +887,9 @@ int VkCompute::submit_and_wait()
870887
case 15:
871888
compute_host_barrier(r.compute_host_barrier.buffer, r.compute_host_barrier.offset, r.compute_host_barrier.size);
872889
break;
890+
case 16:
891+
queue_transfer_acquire_barrier(r.queue_transfer_acquire_barrier.buffer, r.queue_transfer_acquire_barrier.offset, r.queue_transfer_acquire_barrier.size, r.queue_transfer_acquire_barrier.src_queue_family_index);
892+
break;
873893
}
874894
}
875895

@@ -1180,6 +1200,27 @@ void VkCompute::compute_host_barrier(VkBuffer buffer, size_t offset, size_t size
11801200
vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
11811201
}
11821202

1203+
// Issue the acquire half of a queue-family ownership transfer for a buffer range.
//
// buffer                  buffer the barrier applies to
// offset, size            byte range inside buffer
// src_queue_family_index  queue family the range is acquired from; the
//                         destination is this command's queue_family_index
//
// Records a single VkBufferMemoryBarrier into command_buffer via vkCmdPipelineBarrier.
void VkCompute::queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index)
1204+
{
1205+
// fprintf(stderr, "cmd queue_transfer_acquire_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, src_queue_family_index, queue_family_index);
1206+
1207+
VkBufferMemoryBarrier bufferBarrier;
1208+
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1209+
bufferBarrier.pNext = 0;
1210+
// no source access is declared on the acquire side; the destination access is shader read
bufferBarrier.srcAccessMask = 0;
1211+
bufferBarrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
1212+
// ownership moves from the given source family to this command's family
bufferBarrier.srcQueueFamilyIndex = src_queue_family_index;
1213+
bufferBarrier.dstQueueFamilyIndex = queue_family_index;
1214+
bufferBarrier.buffer = buffer;
1215+
bufferBarrier.offset = offset;
1216+
bufferBarrier.size = size;
1217+
1218+
// top-of-pipe -> compute shader stage
VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
1219+
VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
1220+
1221+
// one buffer barrier, no global memory barriers and no image barriers
vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
1222+
}
1223+
11831224
void VkCompute::initial_image_compute_barrier(VkImage image)
11841225
{
11851226
// fprintf(stderr, "cmd initial_image_compute_barrier %p %lu %lu\n", image, oldlayout, newlayout);
@@ -1275,9 +1316,13 @@ void VkTransfer::record_upload(const Mat& src, VkMat& dst, const Option& opt)
12751316
// set weight blob as readonly
12761317
dst.data->state = 4;
12771318

1278-
if (dst.allocator->mappable)
1319+
// we can skip queue transfer and staging buffer allocation
1320+
// only on unified memory architecture and unified compute/transfer queue
1321+
// which is usually the case on integrated gpu / cpu
1322+
if (dst.allocator->mappable && queue_family_index == vkdev->info.compute_queue_family_index)
12791323
{
12801324
dst.upload(src_flattened);
1325+
12811326
return;
12821327
}
12831328

@@ -1317,6 +1362,8 @@ int VkTransfer::submit_and_wait()
13171362
mapped_ptr_offset += alignSize(r.size, buffer_offset_alignment);
13181363
}
13191364

1365+
staging_vkallocator->flush(staging_data);
1366+
13201367
begin_command_buffer();
13211368

13221369
// fprintf(stderr, "cmd transfer %p %lu\n", staging_data->buffer, staging_buffer_size);
@@ -1332,32 +1379,32 @@ int VkTransfer::submit_and_wait()
13321379
staging_buffer_offset += alignSize(r.size, buffer_offset_alignment);
13331380
}
13341381

1335-
// // finish TODO queue owner transfer release
1336-
// std::vector<VkBufferMemoryBarrier> bufferBarriers(transfer_count);
1337-
// for (int i=0; i<transfer_count; i++)
1338-
// {
1339-
// const record_type& r = delayed_records[i];
1340-
//
1341-
// bufferBarriers[i].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1342-
// bufferBarriers[i].pNext = 0;
1343-
// bufferBarriers[i].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
1344-
// bufferBarriers[i].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
1345-
// bufferBarriers[i].srcQueueFamilyIndex = queue_family_index;
1346-
// bufferBarriers[i].dstQueueFamilyIndex = vkdev->info.compute_queue_family_index;
1347-
// bufferBarriers[i].buffer = r.vkmat.buffer();
1348-
// bufferBarriers[i].offset = r.vkmat.buffer_offset();
1349-
// bufferBarriers[i].size = r.size;
1350-
// }
1351-
//
1352-
// VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
1353-
// VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
1354-
//
1355-
// vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, transfer_count, bufferBarriers.data(), 0, 0);
1382+
// owner transfer release
1383+
for (int i=0; i<transfer_count; i++)
1384+
{
1385+
const record_type& r = delayed_records[i];
1386+
1387+
queue_transfer_release_barrier(r.vkmat.buffer(), r.vkmat.buffer_offset(), r.size, vkdev->info.compute_queue_family_index);
1388+
}
13561389

13571390
end_command_buffer();
13581391

13591392
int ret = queue_submit_and_wait_fence();
13601393

1394+
// compute queue owner transfer acquire
1395+
{
1396+
VkCompute cmd(vkdev);
1397+
1398+
for (int i=0; i<transfer_count; i++)
1399+
{
1400+
const record_type& r = delayed_records[i];
1401+
1402+
cmd.record_queue_transfer_acquire(r.vkmat, queue_family_index);
1403+
}
1404+
1405+
cmd.submit_and_wait();
1406+
}
1407+
13611408
// deallocate staging buffer
13621409
staging_vkallocator->fastFree(staging_data);
13631410
staging_data = 0;
@@ -1386,6 +1433,27 @@ void VkTransfer::copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vect
13861433
vkCmdCopyBuffer(command_buffer, src, dst, regions.size(), regions.data());
13871434
}
13881435

1436+
// Issue the release half of a queue-family ownership transfer for a buffer range.
//
// buffer                  buffer the barrier applies to
// offset, size            byte range inside buffer
// dst_queue_family_index  queue family the range is released to; the source
//                         is this command's queue_family_index
//
// Records a single VkBufferMemoryBarrier into command_buffer via vkCmdPipelineBarrier.
void VkTransfer::queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t dst_queue_family_index)
1437+
{
1438+
// fprintf(stderr, "cmd queue_transfer_release_barrier %p[+%lu] %lu %lu -> %lu\n", buffer, offset, size, queue_family_index, dst_queue_family_index);
1439+
1440+
VkBufferMemoryBarrier bufferBarrier;
1441+
bufferBarrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
1442+
bufferBarrier.pNext = 0;
1443+
// the source access is transfer write; no destination access is declared on the release side
bufferBarrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
1444+
bufferBarrier.dstAccessMask = 0;
1445+
// ownership moves from this command's family to the given destination family
bufferBarrier.srcQueueFamilyIndex = queue_family_index;
1446+
bufferBarrier.dstQueueFamilyIndex = dst_queue_family_index;
1447+
bufferBarrier.buffer = buffer;
1448+
bufferBarrier.offset = offset;
1449+
bufferBarrier.size = size;
1450+
1451+
// transfer stage -> bottom-of-pipe
VkPipelineStageFlags srcStageMask = VK_PIPELINE_STAGE_TRANSFER_BIT;
1452+
VkPipelineStageFlags dstStageMask = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT;
1453+
1454+
// one buffer barrier, no global memory barriers and no image barriers
vkCmdPipelineBarrier(command_buffer, srcStageMask, dstStageMask, 0, 0, 0, 1, &bufferBarrier, 0, 0);
1455+
}
1456+
13891457
} // namespace ncnn
13901458

13911459
#endif // NCNN_VULKAN

src/command.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,11 @@ class VkCompute : public Command
6969

7070
void record_pipeline(const Pipeline* pipeline, const std::vector<VkMat>& bindings, const std::vector<vk_constant_type>& constants, const VkMat& m);
7171

72+
#if NCNN_BENCHMARK
7273
void record_write_timestamp(uint32_t query);
74+
#endif // NCNN_BENCHMARK
75+
76+
void record_queue_transfer_acquire(const VkMat& m, uint32_t src_queue_family_index);
7377

7478
#if __ANDROID_API__ >= 26
7579
void record_import_android_hardware_buffer(const ImportAndroidHardwareBufferPipeline* pipeline, const VkImageMat& im, const VkMat& m);
@@ -134,6 +138,7 @@ class VkCompute : public Command
134138
void transfer_host_barrier(VkBuffer buffer, size_t offset, size_t size);
135139
void host_compute_barrier(VkBuffer buffer, size_t offset, size_t size);
136140
void compute_host_barrier(VkBuffer buffer, size_t offset, size_t size);
141+
void queue_transfer_acquire_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t src_queue_family_index);
137142
void initial_image_compute_barrier(VkImage image);
138143
#if __ANDROID_API__ >= 26
139144
void update_import_android_hardware_buffer_bindings(VkPipelineLayout pipeline_layout, VkDescriptorUpdateTemplateKHR descriptor_update_template, const VkDescriptorImageInfo& descriptorImageInfo, const VkDescriptorBufferInfo& descriptorBufferInfo);
@@ -165,6 +170,7 @@ class VkCompute : public Command
165170
// 13=transfer-host barrier
166171
// 14=host-compute barrier
167172
// 15=compute-host barrier
173+
// 16=queue-transfer-acquire barrier
168174
int type;
169175

170176
union
@@ -187,6 +193,7 @@ class VkCompute : public Command
187193
struct { VkBuffer buffer; size_t offset; size_t size; } transfer_host_barrier;
188194
struct { VkBuffer buffer; size_t offset; size_t size; } host_compute_barrier;
189195
struct { VkBuffer buffer; size_t offset; size_t size; } compute_host_barrier;
196+
struct { VkBuffer buffer; size_t offset; size_t size; size_t src_queue_family_index; } queue_transfer_acquire_barrier;
190197
};
191198

192199
std::vector<VkBufferCopy> regions;
@@ -218,6 +225,7 @@ class VkTransfer : public Command
218225
// recording issue
219226
void copy_buffer(VkBuffer src, size_t src_offset, VkBuffer dst, size_t dst_offset, size_t size);
220227
void copy_buffer_regions(VkBuffer src, VkBuffer dst, const std::vector<VkBufferCopy>& regions);
228+
void queue_transfer_release_barrier(VkBuffer buffer, size_t offset, size_t size, uint32_t target_queue_family_index);
221229

222230
protected:
223231
size_t buffer_offset_alignment;

src/gpu.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
#if __ANDROID__
3232
#define ENABLE_VALIDATION_LAYER 0
3333
#else
34-
#define ENABLE_VALIDATION_LAYER 1
34+
#define ENABLE_VALIDATION_LAYER 0
3535
#endif
3636

3737
namespace ncnn {

0 commit comments

Comments
 (0)