Skip to content

Commit 5120888

Browse files
committed
[d3d11,util] Add GPU execution cost heuristic for submissions
May fix GPU time-outs on tilers.
1 parent e2394e8 commit 5120888

File tree

10 files changed

+106
-16
lines changed

10 files changed

+106
-16
lines changed

src/d3d11/d3d11_cmdlist.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,8 @@ namespace dxvk {
5454
}
5555

5656

57-
uint64_t D3D11CommandList::AddChunk(DxvkCsChunkRef&& Chunk) {
58-
m_chunks.push_back(std::move(Chunk));
57+
uint64_t D3D11CommandList::AddChunk(DxvkCsChunkRef&& Chunk, uint64_t Cost) {
58+
m_chunks.emplace_back(std::move(Chunk), Cost);
5959
return m_chunks.size() - 1;
6060
}
6161

@@ -99,7 +99,7 @@ namespace dxvk {
9999
flushType = GpuFlushType::ImplicitStrongHint;
100100

101101
// Dispatch the chunk and capture its sequence number
102-
uint64_t seq = DispatchProc(DxvkCsChunkRef(m_chunks[i]), flushType);
102+
uint64_t seq = DispatchProc(DxvkCsChunkRef(m_chunks[i].chunk), m_chunks[i].cost, flushType);
103103

104104
// Track resource sequence numbers for the added chunk
105105
while (j < m_resources.size() && m_resources[j].chunkId == i)

src/d3d11/d3d11_cmdlist.h

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
namespace dxvk {
88

9-
using D3D11ChunkDispatchProc = std::function<uint64_t (DxvkCsChunkRef&&, GpuFlushType)>;
9+
using D3D11ChunkDispatchProc = std::function<uint64_t (DxvkCsChunkRef&&, uint64_t, GpuFlushType)>;
1010

1111
class D3D11CommandList : public D3D11DeviceChild<ID3D11CommandList> {
1212

@@ -28,7 +28,8 @@ namespace dxvk {
2828
D3D11Query* pQuery);
2929

3030
uint64_t AddChunk(
31-
DxvkCsChunkRef&& Chunk);
31+
DxvkCsChunkRef&& Chunk,
32+
uint64_t Cost);
3233

3334
uint64_t AddCommandList(
3435
D3D11CommandList* pCommandList);
@@ -44,14 +45,22 @@ namespace dxvk {
4445

4546
private:
4647

48+
struct ChunkEntry {
49+
ChunkEntry() = default;
50+
ChunkEntry(DxvkCsChunkRef&& c, uint64_t v)
51+
: chunk(std::move(c)), cost(v) { }
52+
DxvkCsChunkRef chunk = { };
53+
uint64_t cost = 0u;
54+
};
55+
4756
struct TrackedResource {
4857
D3D11ResourceRef ref;
4958
uint64_t chunkId;
5059
};
5160

5261
UINT m_contextFlags = 0u;
5362

54-
std::vector<DxvkCsChunkRef> m_chunks;
63+
std::vector<ChunkEntry> m_chunks;
5564
std::vector<Com<D3D11Query, false>> m_queries;
5665
std::vector<TrackedResource> m_resources;
5766

src/d3d11/d3d11_context.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,8 @@ namespace dxvk {
395395
if (counterView == nullptr)
396396
return;
397397

398+
AddCost(GpuCostEstimate::Transfer);
399+
398400
EmitCs([
399401
cDstSlice = buf->GetBufferSlice(DstAlignedByteOffset),
400402
cSrcSlice = DxvkBufferSlice(counterView)
@@ -423,6 +425,8 @@ namespace dxvk {
423425
if (!rtv)
424426
return;
425427

428+
AddCost(GpuCostEstimate::Transfer);
429+
426430
auto view = rtv->GetImageView();
427431
auto color = ConvertColorValue(ColorRGBA, view->formatInfo());
428432

@@ -451,6 +455,8 @@ namespace dxvk {
451455
if (FAILED(pUnorderedAccessView->QueryInterface(IID_PPV_ARGS(&qiUav))))
452456
return;
453457

458+
AddCost(GpuCostEstimate::Transfer);
459+
454460
auto uav = static_cast<D3D11UnorderedAccessView*>(qiUav.ptr());
455461

456462
// Gather UAV format info. We'll use this to determine
@@ -598,6 +604,8 @@ namespace dxvk {
598604
if (!info || info->flags.any(DxvkFormatFlag::SampledSInt, DxvkFormatFlag::SampledUInt))
599605
return;
600606

607+
AddCost(GpuCostEstimate::Transfer);
608+
601609
VkClearValue clearValue;
602610
clearValue.color.float32[0] = Values[0];
603611
clearValue.color.float32[1] = Values[1];
@@ -657,6 +665,8 @@ namespace dxvk {
657665
if (!aspectMask)
658666
return;
659667

668+
AddCost(GpuCostEstimate::Transfer);
669+
660670
VkClearValue clearValue;
661671
clearValue.depthStencil.depth = Depth;
662672
clearValue.depthStencil.stencil = Stencil;
@@ -683,6 +693,8 @@ namespace dxvk {
683693
if (NumRects && !pRect)
684694
return;
685695

696+
AddCost(GpuCostEstimate::Transfer);
697+
686698
// ID3D11View has no methods to query the exact type of
687699
// the view, so we'll have to check each possible class
688700
auto dsv = dynamic_cast<D3D11DepthStencilView*>(pView);
@@ -768,6 +780,8 @@ namespace dxvk {
768780
if (!(resourceDesc.MiscFlags & D3D11_RESOURCE_MISC_GENERATE_MIPS))
769781
return;
770782

783+
AddCost(GpuCostEstimate::Transfer);
784+
771785
EmitCs([cDstImageView = view->GetImageView()]
772786
(DxvkContext* ctx) {
773787
ctx->generateMipmaps(cDstImageView, VK_FILTER_LINEAR);
@@ -825,6 +839,8 @@ namespace dxvk {
825839
|| SrcSubresource >= srcTextureInfo->CountSubresources())
826840
return;
827841

842+
AddCost(GpuCostEstimate::Transfer);
843+
828844
const VkImageSubresource dstSubresource =
829845
dstTextureInfo->GetSubresourceFromIndex(
830846
dstVulkanFormatInfo->aspectMask, DstSubresource);
@@ -1148,6 +1164,8 @@ namespace dxvk {
11481164
if (unlikely(!ThreadGroupCountX || !ThreadGroupCountY || !ThreadGroupCountZ))
11491165
return;
11501166

1167+
AddCost(GpuCostEstimate::Dispatch);
1168+
11511169
if (unlikely(HasDirtyComputeBindings()))
11521170
ApplyDirtyComputeBindings();
11531171

@@ -1170,6 +1188,8 @@ namespace dxvk {
11701188
if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDispatchIndirectCommand)))
11711189
return;
11721190

1191+
AddCost(GpuCostEstimate::DispatchIndirect);
1192+
11731193
if (unlikely(HasDirtyComputeBindings()))
11741194
ApplyDirtyComputeBindings();
11751195

@@ -4151,6 +4171,8 @@ namespace dxvk {
41514171
if (SrcOffset >= srcLength || DstOffset >= dstLength || !ByteCount)
41524172
return;
41534173

4174+
AddCost(GpuCostEstimate::Transfer);
4175+
41544176
ByteCount = std::min(dstLength - DstOffset, ByteCount);
41554177
ByteCount = std::min(srcLength - SrcOffset, ByteCount);
41564178

@@ -4234,6 +4256,8 @@ namespace dxvk {
42344256
if (!SrcExtent.width || !SrcExtent.height || !SrcExtent.depth)
42354257
return;
42364258

4259+
AddCost(GpuCostEstimate::Transfer);
4260+
42374261
// While copying between 2D and 3D images is allowed in CopySubresourceRegion,
42384262
// copying more than one slice at a time is not supported. Layer counts are 1.
42394263
if ((pDstTexture->GetVkImageType() == VK_IMAGE_TYPE_3D)
@@ -4461,6 +4485,8 @@ namespace dxvk {
44614485
tiles[i] = tile;
44624486
}
44634487

4488+
AddCost(GpuCostEstimate::Transfer);
4489+
44644490
// If D3D12 is anything to go by, not passing this flag will trigger
44654491
// the other code path, regardless of whether TO_LINEAR_BUFFER is set.
44664492
if (Flags & D3D11_TILE_COPY_LINEAR_BUFFER_TO_SWIZZLED_TILED_RESOURCE) {
@@ -5372,6 +5398,7 @@ namespace dxvk {
53725398
}
53735399

53745400
if (needsUpdate) {
5401+
AddCost(GpuCostEstimate::RenderPass);
53755402
BindFramebuffer();
53765403

53775404
if constexpr (!IsDeferred)

src/d3d11/d3d11_context.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,6 @@ namespace dxvk {
7979
protected:
8080
// Compile-time debug flag to force lazy binding on (True) or off (False)
8181
constexpr static Tristate DebugLazyBinding = Tristate::Auto;
82-
8382
public:
8483

8584
D3D11CommonContext(
@@ -799,6 +798,8 @@ namespace dxvk {
799798
DxvkCsChunkRef m_csChunk;
800799
DxvkCsDataBlock* m_csData = nullptr;
801800

801+
uint64_t m_estimatedCost = 0u;
802+
802803
DxvkLocalAllocationCache m_allocationCache;
803804

804805
D3D11ShaderStageState<Rc<DxvkBuffer>> m_instanceData;
@@ -1181,6 +1182,10 @@ namespace dxvk {
11811182
Rc<DxvkBuffer> AllocInstanceDataBuffer(
11821183
D3D11ShaderType ShaderStage);
11831184

1185+
force_inline void AddCost(uint64_t Value) {
1186+
m_estimatedCost += Value;
1187+
}
1188+
11841189
static DxvkInputAssemblyState InitDefaultPrimitiveTopology();
11851190

11861191
static DxvkRasterizerState InitDefaultRasterizerState();

src/d3d11/d3d11_context_def.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,8 @@ namespace dxvk {
397397

398398

399399
void D3D11DeferredContext::EmitCsChunk(DxvkCsChunkRef&& chunk) {
400-
m_chunkId = m_commandList->AddChunk(std::move(chunk));
400+
m_chunkId = m_commandList->AddChunk(std::move(chunk), m_estimatedCost);
401+
m_estimatedCost = 0u;
401402
}
402403

403404

src/d3d11/d3d11_context_ext.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ namespace dxvk {
4646
UINT ByteOffsetForArgs,
4747
UINT ByteStrideForArgs) {
4848
D3D10DeviceLock lock = m_ctx->LockContext();
49+
m_ctx->AddCost(GpuCostEstimate::DrawIndirect);
4950
m_ctx->SetDrawBuffers(pBufferForArgs, nullptr);
5051

5152
if (unlikely(m_ctx->HasDirtyGraphicsBindings()))
@@ -68,8 +69,9 @@ namespace dxvk {
6869
UINT ByteOffsetForArgs,
6970
UINT ByteStrideForArgs) {
7071
D3D10DeviceLock lock = m_ctx->LockContext();
72+
m_ctx->AddCost(GpuCostEstimate::DrawIndirect);
7173
m_ctx->SetDrawBuffers(pBufferForArgs, nullptr);
72-
74+
7375
if (unlikely(m_ctx->HasDirtyGraphicsBindings()))
7476
m_ctx->ApplyDirtyGraphicsBindings();
7577

@@ -92,6 +94,7 @@ namespace dxvk {
9294
UINT ByteOffsetForArgs,
9395
UINT ByteStrideForArgs) {
9496
D3D10DeviceLock lock = m_ctx->LockContext();
97+
m_ctx->AddCost(GpuCostEstimate::DrawIndirect);
9598
m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount);
9699

97100
if (unlikely(m_ctx->HasDirtyGraphicsBindings()))
@@ -117,6 +120,7 @@ namespace dxvk {
117120
UINT ByteOffsetForArgs,
118121
UINT ByteStrideForArgs) {
119122
D3D10DeviceLock lock = m_ctx->LockContext();
123+
m_ctx->AddCost(GpuCostEstimate::DrawIndirect);
120124
m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount);
121125

122126
if (unlikely(m_ctx->HasDirtyGraphicsBindings()))
@@ -212,6 +216,8 @@ namespace dxvk {
212216
launchInfo.shader = cubinShader;
213217

214218
/* Need to capture by value in case this gets called from a deferred context */
219+
m_ctx->AddCost(GpuCostEstimate::Dispatch);
220+
215221
m_ctx->EmitCs([cLaunchInfo = std::move(launchInfo)] (DxvkContext* ctx) {
216222
ctx->launchCuKernelNVX(cLaunchInfo.nvxLaunchInfo, cLaunchInfo.buffers, cLaunchInfo.images);
217223
});

src/d3d11/d3d11_context_imm.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ namespace dxvk {
261261
ConsiderFlush(GpuFlushType::ImplicitWeakHint);
262262

263263
// Dispatch command list to the CS thread
264-
commandList->EmitToCsThread([this] (DxvkCsChunkRef&& chunk, GpuFlushType flushType) {
264+
commandList->EmitToCsThread([this] (DxvkCsChunkRef&& chunk, uint64_t cost, GpuFlushType flushType) {
265265
EmitCsChunk(std::move(chunk));
266266

267267
// Return the sequence number from before the flush since
@@ -270,6 +270,7 @@ namespace dxvk {
270270

271271
// Consider a flush after every chunk in case the app
272272
// submits a very large command list or the GPU is idle
273+
AddCost(cost);
273274
ConsiderFlush(flushType);
274275
return csSeqNum;
275276
});
@@ -1110,7 +1111,7 @@ namespace dxvk {
11101111
uint64_t chunkId = GetCurrentSequenceNumber();
11111112
uint64_t submissionId = m_submissionFence->value();
11121113

1113-
if (m_flushTracker.considerFlush(FlushType, chunkId, submissionId))
1114+
if (m_flushTracker.considerFlush(FlushType, chunkId, submissionId, m_estimatedCost))
11141115
ExecuteFlush(FlushType, nullptr, false);
11151116
}
11161117

@@ -1173,6 +1174,9 @@ namespace dxvk {
11731174
// Reset counter for discarded memory in flight
11741175
m_discardMemoryOnFlush = m_discardMemoryCounter;
11751176

1177+
// Reset GPU execution cost estimate
1178+
m_estimatedCost = 0u;
1179+
11761180
// Notify the device that the context has been flushed,
11771181
// this resets some resource initialization heuristics.
11781182
m_parent->NotifyContextFlush();

src/d3d9/d3d9_device.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5876,7 +5876,7 @@ namespace dxvk {
58765876
uint64_t chunkId = GetCurrentSequenceNumber();
58775877
uint64_t submissionId = m_submissionFence->value();
58785878

5879-
if (m_flushTracker.considerFlush(FlushType, chunkId, submissionId))
5879+
if (m_flushTracker.considerFlush(FlushType, chunkId, submissionId, 0u))
58805880
Flush();
58815881
}
58825882

src/util/util_flush.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
#include "util_flush.h"
2+
#include "util_string.h"
3+
#include "log/log.h"
24

35
namespace dxvk {
46

@@ -10,7 +12,8 @@ namespace dxvk {
1012
bool GpuFlushTracker::considerFlush(
1113
GpuFlushType flushType,
1214
uint64_t chunkId,
13-
uint32_t lastCompleteSubmissionId) {
15+
uint32_t lastCompleteSubmissionId,
16+
uint64_t estimatedCost) {
1417
constexpr uint32_t minPendingSubmissions = 2;
1518

1619
constexpr uint32_t minChunkCount = 3u;
@@ -22,8 +25,14 @@ namespace dxvk {
2225
if (!chunkCount)
2326
return false;
2427

25-
if (flushType > m_maxType)
26-
return false;
28+
// Deliberately ignore cost heuristic if we're not categorically ignoring
29+
// submission requests anyway, since we should never submit enough to time
30+
// out with the chunk-based heuristic.
31+
if (flushType > m_maxType) {
32+
if (estimatedCost >= GpuCostEstimate::MaxCostPerSubmission)
33+
Logger::err("frog");
34+
return estimatedCost >= GpuCostEstimate::MaxCostPerSubmission;
35+
}
2736

2837
// Take any earlier missed flush with a stronger hint into account, so
2938
// that we still flush those as soon as possible. Ignore synchronization

src/util/util_flush.h

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,33 @@
66

77
namespace dxvk {
88

9+
/**
10+
* \brief GPU cost estimate for various operations
11+
*
12+
* These provide only a very rough estimate for GPU execution times,
13+
* which can be useful to avoid GPU time-outs in some situations.
14+
*/
15+
struct GpuCostEstimate {
16+
/** Assume that compute dispatches are much more expensive than draws
17+
* regardless of workgroup counts. This is not always true, but may
18+
* help account for immediate synchronization or complex shaders that
19+
* we do not generally have any up-front knowledge about. */
20+
static constexpr uint64_t Dispatch = 4u;
21+
static constexpr uint64_t DispatchIndirect = 5u;
22+
/** Assume a high base cost per render pass. We're not counting draws
23+
* in order to avoid splitting passes on tiling GPUs, and draw costs
24+
* can vary wildly anyway. */
25+
static constexpr uint64_t RenderPass = 10u;
26+
/** Transfer cost can vary wildly, but so do use cases. Just assume
27+
* a low cost, especially since synchronization on back-to-back
28+
* transfers is unlikely to be necessary. */
29+
static constexpr uint64_t Transfer = 2u;
30+
31+
/** Cost threshold at which submissions are always preferred */
32+
static constexpr uint64_t MaxCostPerSubmission = 1'500u;
33+
};
34+
35+
936
/**
1037
* \brief GPU context flush type
1138
*/
@@ -60,12 +87,14 @@ namespace dxvk {
6087
* \param [in] flushType Flush type
6188
* \param [in] chunkId GPU command sequence number
6289
* \param [in] lastCompleteSubmissionId Last completed command submission ID
90+
* \param [in] estimatedCost Estimated submission cost
6391
* \returns \c true if a flush should be performed
6492
*/
6593
bool considerFlush(
6694
GpuFlushType flushType,
6795
uint64_t chunkId,
68-
uint32_t lastCompleteSubmissionId);
96+
uint32_t lastCompleteSubmissionId,
97+
uint64_t estimatedCost);
6998

7099
/**
71100
* \brief Notifies tracker about a context flush

0 commit comments

Comments
 (0)