Skip to content

Commit 5f18c0c

Browse files
adrianlizarraga authored and ashrit-ms committed
[QNN EP] Make offloading graph input/output quantization (to CPU) the default (#23368)
### Description Makes the QNN provider option `offload_graph_io_quantization` enabled by default. It was previously disabled by default. ### Motivation and Context Enabling this option significantly decreases inference latency for many models.
1 parent 3a05b42 commit 5f18c0c

35 files changed

+105
-8
lines changed

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3665,8 +3665,8 @@ struct OrtApi {
36653665
* - "1": Enabled.
36663666
* "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
36673667
* execution provider (typically CPU EP).
3668-
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
3669-
* - "1": Enabled.
3668+
* - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
3669+
* - "1": Enabled. This is the default value.
36703670
* "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
36713671
* - "0": Default. Disabled.
36723672
* - "1": Enabled.

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -351,13 +351,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
351351
// Add this option because this feature requires QnnSystem lib and it's no supported for Windows x86_64 platform
352352
enable_spill_fill_buffer_ = ParseBoolOption("enable_htp_spill_fill_buffer", false, provider_options_map);
353353

354-
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false,
354+
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", true,
355355
provider_options_map);
356356

357357
if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
358-
LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O "
359-
<< "quantization/dequantization to another EP. Session creation will fail if the CPU EP "
360-
<< "handles the graph I/O quantization/dequantization.";
358+
LOGS_DEFAULT(INFO) << "Fallback to CPU EP is disabled, but user tried to configure QNN EP to offload graph I/O "
359+
<< "quantization/dequantization to another EP. These are conflicting options. Fallback to CPU "
360+
<< "EP will remain disabled and graph I/O quantization/dequantization will not be offloaded "
361+
<< "to another EP.";
362+
model_settings_.offload_graph_io_quantization = false;
361363
}
362364

363365
static const std::string QNN_HTP_SHARED_MEMORY_ALLOCATOR_ENABLED = "enable_htp_shared_memory_allocator";

onnxruntime/test/providers/qnn/argmaxmin_op_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
7070
#else
7171
provider_options["backend_path"] = "libQnnHtp.so";
7272
#endif
73+
provider_options["offload_graph_io_quantization"] = "0";
7374

7475
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs), // baseline float32 model
7576
BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model

onnxruntime/test/providers/qnn/average_pool_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ static void RunAveragePoolOpTest(const std::string& op_type,
3131
#else
3232
provider_options["backend_path"] = "libQnnCpu.so";
3333
#endif
34+
provider_options["offload_graph_io_quantization"] = "0";
3435

3536
RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
3637
provider_options,
@@ -53,6 +54,7 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type,
5354
#else
5455
provider_options["backend_path"] = "libQnnHtp.so";
5556
#endif
57+
provider_options["offload_graph_io_quantization"] = "0";
5658

5759
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
5860
BuildQDQOpTestCase<QuantType>(op_type, input_defs, {}, attrs),

onnxruntime/test/providers/qnn/batch_norm_htp_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ static void RunBatchNormQDQTest(const TestInputDef<float>& input_def,
160160
#else
161161
provider_options["backend_path"] = "libQnnHtp.so";
162162
#endif
163+
provider_options["offload_graph_io_quantization"] = "0";
163164

164165
// Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
165166
TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def),
@@ -180,6 +181,7 @@ static void RunBatchNormFP16Test(const TestInputDef<float>& input_def,
180181
#else
181182
provider_options["backend_path"] = "libQnnHtp.so";
182183
#endif
184+
provider_options["offload_graph_io_quantization"] = "0";
183185

184186
TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);
185187
TestInputDef<MLFloat16> scale_fp16_def = ConvertToFP16InputDef(scale_def);

onnxruntime/test/providers/qnn/cast_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ static void RunCastOpTest(const std::vector<int64_t>& shape, ONNX_NAMESPACE::Ten
5757
#else
5858
provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so";
5959
#endif
60+
provider_options["offload_graph_io_quantization"] = "0";
6061

6162
if (use_htp && enable_fp16_precision) {
6263
provider_options["enable_htp_fp16_precision"] = "1";

onnxruntime/test/providers/qnn/clip_op_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
117117
#else
118118
provider_options["backend_path"] = "libQnnHtp.so";
119119
#endif
120+
provider_options["offload_graph_io_quantization"] = "0";
120121

121122
auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
122123
auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {},
@@ -205,6 +206,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
205206
#else
206207
provider_options["backend_path"] = "libQnnHtp.so";
207208
#endif
209+
provider_options["offload_graph_io_quantization"] = "0";
208210

209211
RunQnnModelTest(model_fn,
210212
provider_options,

onnxruntime/test/providers/qnn/conv_test.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef
9393
#else
9494
provider_options["backend_path"] = "libQnnCpu.so";
9595
#endif
96+
provider_options["offload_graph_io_quantization"] = "0";
97+
9698
auto build_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads,
9799
dilations, group, auto_pad);
98100
RunQnnModelTest(build_fn,
@@ -317,6 +319,7 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
317319
#else
318320
provider_options["backend_path"] = "libQnnHtp.so";
319321
#endif
322+
provider_options["offload_graph_io_quantization"] = "0";
320323

321324
TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
322325
group, auto_pad, output_activation),
@@ -354,6 +357,7 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
354357
#else
355358
provider_options["backend_path"] = "libQnnHtp.so";
356359
#endif
360+
provider_options["offload_graph_io_quantization"] = "0";
357361

358362
auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
359363
group, auto_pad, output_activation);
@@ -665,6 +669,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) {
665669
#else
666670
provider_options["backend_path"] = "libQnnHtp.so";
667671
#endif
672+
provider_options["offload_graph_io_quantization"] = "0";
668673

669674
auto BuildConvMulGraph = [](ModelTestBuilder& builder) {
670675
// DQ node for Conv input

onnxruntime/test/providers/qnn/flatten_op_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
101101
#else
102102
provider_options["backend_path"] = "libQnnHtp.so";
103103
#endif
104+
provider_options["offload_graph_io_quantization"] = "0";
104105

105106
auto f32_model_builder = BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs);
106107
auto qdq_model_builder = BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq);
@@ -172,6 +173,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
172173
#else
173174
provider_options["backend_path"] = "libQnnHtp.so";
174175
#endif
176+
provider_options["offload_graph_io_quantization"] = "0";
175177

176178
RunQnnModelTest(model_fn,
177179
provider_options,

onnxruntime/test/providers/qnn/gather_elems_op_test.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ static void RunCPUGatherElemsOpTest(const TestInputDef<float>& input_def,
6767
#else
6868
provider_options["backend_path"] = "libQnnCpu.so";
6969
#endif
70+
provider_options["offload_graph_io_quantization"] = "0";
7071

7172
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
7273
provider_options,
@@ -91,6 +92,7 @@ static void RunHTPQDQGatherElemsOpTest(const TestInputDef<float>& input_def,
9192
#else
9293
provider_options["backend_path"] = "libQnnHtp.so";
9394
#endif
95+
provider_options["offload_graph_io_quantization"] = "0";
9496

9597
auto f32_model_builder = BuildOpTestCase<float, IndexType>("GatherElements", {input_def}, {indices_def}, attrs);
9698
auto qdq_model_builder = BuildQDQGatherElemsTestCase<QuantType, IndexType>(input_def, indices_def, attrs,
@@ -119,6 +121,7 @@ static void RunHTPGatherElemsOpTest(const TestInputDef<DataType>& input_def,
119121
#else
120122
provider_options["backend_path"] = "libQnnHtp.so";
121123
#endif
124+
provider_options["offload_graph_io_quantization"] = "0";
122125

123126
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
124127
provider_options,

0 commit comments

Comments (0)