Skip to content

Commit 5f18c0c

Browse files
adrianlizarraga authored and ashrit-ms committed
[QNN EP] Make offloading graph input/output quantization (to CPU) the default (#23368)
### Description Makes the QNN provider option `offload_graph_io_quantization` enabled by default. It was previously disabled by default. ### Motivation and Context Enabling this option significantly decreases inference latency for many models.
1 parent 3a05b42 commit 5f18c0c

35 files changed

+105
-8
lines changed

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3665,8 +3665,8 @@ struct OrtApi {
36653665
* - "1": Enabled.
36663666
* "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
36673667
* execution provider (typically CPU EP).
3668-
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
3669-
* - "1": Enabled.
3668+
* - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
3669+
* - "1": Enabled. This is the default value.
36703670
* "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
36713671
* - "0": Default. Disabled.
36723672
* - "1": Enabled.

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -351,13 +351,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
351351
// Add this option because this feature requires QnnSystem lib and it's no supported for Windows x86_64 platform
352352
enable_spill_fill_buffer_ = ParseBoolOption("enable_htp_spill_fill_buffer", false, provider_options_map);
353353

354-
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false,
354+
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", true,
355355
provider_options_map);
356356

357357
if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
358-
LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O "
359-
<< "quantization/dequantization to another EP. Session creation will fail if the CPU EP "
360-
<< "handles the graph I/O quantization/dequantization.";
358+
LOGS_DEFAULT(INFO) << "Fallback to CPU EP is disabled, but user tried to configure QNN EP to offload graph I/O "
359+
<< "quantization/dequantization to another EP. These are conflicting options. Fallback to CPU "
360+
<< "EP will remain disabled and graph I/O quantization/dequantization will not be offloaded "
361+
<< "to another EP.";
362+
model_settings_.offload_graph_io_quantization = false;
361363
}
362364

363365
static const std::string QNN_HTP_SHARED_MEMORY_ALLOCATOR_ENABLED = "enable_htp_shared_memory_allocator";

onnxruntime/test/providers/qnn/argmaxmin_op_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
7070
#else
7171
provider_options["backend_path"] = "libQnnHtp.so";
7272
#endif
73+
provider_options["offload_graph_io_quantization"] = "0";
7374

7475
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs), // baseline float32 model
7576
BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model

onnxruntime/test/providers/qnn/average_pool_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ static void RunAveragePoolOpTest(const std::string& op_type,
3131
#else
3232
provider_options["backend_path"] = "libQnnCpu.so";
3333
#endif
34+
provider_options["offload_graph_io_quantization"] = "0";
3435

3536
RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
3637
provider_options,
@@ -53,6 +54,7 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type,
5354
#else
5455
provider_options["backend_path"] = "libQnnHtp.so";
5556
#endif
57+
provider_options["offload_graph_io_quantization"] = "0";
5658

5759
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
5860
BuildQDQOpTestCase<QuantType>(op_type, input_defs, {}, attrs),

onnxruntime/test/providers/qnn/batch_norm_htp_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ static void RunBatchNormQDQTest(const TestInputDef<float>& input_def,
160160
#else
161161
provider_options["backend_path"] = "libQnnHtp.so";
162162
#endif
163+
provider_options["offload_graph_io_quantization"] = "0";
163164

164165
// Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
165166
TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def),
@@ -180,6 +181,7 @@ static void RunBatchNormFP16Test(const TestInputDef<float>& input_def,
180181
#else
181182
provider_options["backend_path"] = "libQnnHtp.so";
182183
#endif
184+
provider_options["offload_graph_io_quantization"] = "0";
183185

184186
TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);
185187
TestInputDef<MLFloat16> scale_fp16_def = ConvertToFP16InputDef(scale_def);

onnxruntime/test/providers/qnn/cast_test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ static void RunCastOpTest(const std::vector<int64_t>& shape, ONNX_NAMESPACE::Ten
5757
#else
5858
provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so";
5959
#endif
60+
provider_options["offload_graph_io_quantization"] = "0";
6061

6162
if (use_htp && enable_fp16_precision) {
6263
provider_options["enable_htp_fp16_precision"] = "1";

onnxruntime/test/providers/qnn/clip_op_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
117117
#else
118118
provider_options["backend_path"] = "libQnnHtp.so";
119119
#endif
120+
provider_options["offload_graph_io_quantization"] = "0";
120121

121122
auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
122123
auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {},
@@ -205,6 +206,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
205206
#else
206207
provider_options["backend_path"] = "libQnnHtp.so";
207208
#endif
209+
provider_options["offload_graph_io_quantization"] = "0";
208210

209211
RunQnnModelTest(model_fn,
210212
provider_options,

onnxruntime/test/providers/qnn/conv_test.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef
9393
#else
9494
provider_options["backend_path"] = "libQnnCpu.so";
9595
#endif
96+
provider_options["offload_graph_io_quantization"] = "0";
97+
9698
auto build_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads,
9799
dilations, group, auto_pad);
98100
RunQnnModelTest(build_fn,
@@ -317,6 +319,7 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
317319
#else
318320
provider_options["backend_path"] = "libQnnHtp.so";
319321
#endif
322+
provider_options["offload_graph_io_quantization"] = "0";
320323

321324
TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
322325
group, auto_pad, output_activation),
@@ -354,6 +357,7 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
354357
#else
355358
provider_options["backend_path"] = "libQnnHtp.so";
356359
#endif
360+
provider_options["offload_graph_io_quantization"] = "0";
357361

358362
auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
359363
group, auto_pad, output_activation);
@@ -665,6 +669,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) {
665669
#else
666670
provider_options["backend_path"] = "libQnnHtp.so";
667671
#endif
672+
provider_options["offload_graph_io_quantization"] = "0";
668673

669674
auto BuildConvMulGraph = [](ModelTestBuilder& builder) {
670675
// DQ node for Conv input

onnxruntime/test/providers/qnn/flatten_op_test.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
101101
#else
102102
provider_options["backend_path"] = "libQnnHtp.so";
103103
#endif
104+
provider_options["offload_graph_io_quantization"] = "0";
104105

105106
auto f32_model_builder = BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs);
106107
auto qdq_model_builder = BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq);
@@ -172,6 +173,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
172173
#else
173174
provider_options["backend_path"] = "libQnnHtp.so";
174175
#endif
176+
provider_options["offload_graph_io_quantization"] = "0";
175177

176178
RunQnnModelTest(model_fn,
177179
provider_options,

onnxruntime/test/providers/qnn/gather_elems_op_test.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ static void RunCPUGatherElemsOpTest(const TestInputDef<float>& input_def,
6767
#else
6868
provider_options["backend_path"] = "libQnnCpu.so";
6969
#endif
70+
provider_options["offload_graph_io_quantization"] = "0";
7071

7172
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
7273
provider_options,
@@ -91,6 +92,7 @@ static void RunHTPQDQGatherElemsOpTest(const TestInputDef<float>& input_def,
9192
#else
9293
provider_options["backend_path"] = "libQnnHtp.so";
9394
#endif
95+
provider_options["offload_graph_io_quantization"] = "0";
9496

9597
auto f32_model_builder = BuildOpTestCase<float, IndexType>("GatherElements", {input_def}, {indices_def}, attrs);
9698
auto qdq_model_builder = BuildQDQGatherElemsTestCase<QuantType, IndexType>(input_def, indices_def, attrs,
@@ -119,6 +121,7 @@ static void RunHTPGatherElemsOpTest(const TestInputDef<DataType>& input_def,
119121
#else
120122
provider_options["backend_path"] = "libQnnHtp.so";
121123
#endif
124+
provider_options["offload_graph_io_quantization"] = "0";
122125

123126
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
124127
provider_options,

0 commit comments

Comments (0)