Commit 1b3cd90

cshuo authored and rahil-c committed
[HUDI-9672] Disable skipping clustering for spark incremental query to avoid data duplication (#13659)
1 parent 07a89c4 commit 1b3cd90

2 files changed: 40 additions, 48 deletions


hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/MergeOnReadIncrementalRelationV2.scala

Lines changed: 5 additions & 9 deletions
@@ -60,13 +60,7 @@ case class MergeOnReadIncrementalRelationV2(override val sqlContext: SQLContext,
     if (fullTableScan) {
       metaClient.getCommitsAndCompactionTimeline
     } else {
-      val completeTimeline =
-        metaClient.getCommitsTimeline.filterCompletedInstants()
-          .findInstantsInRangeByCompletionTime(startCompletionTime, endCompletionTime)
-
-      // Need to add pending compaction instants to avoid data missing, see HUDI-5990 for details.
-      val pendingCompactionTimeline = metaClient.getCommitsAndCompactionTimeline.filterPendingMajorOrMinorCompactionTimeline()
-      concatTimeline(completeTimeline, pendingCompactionTimeline, metaClient)
+      queryContext.getActiveTimeline
     }
   }

@@ -200,8 +194,10 @@ trait HoodieIncrementalRelationV2Trait extends HoodieBaseRelation {
       .metaClient(metaClient)
       .startCompletionTime(optParams(DataSourceReadOptions.START_COMMIT.key))
       .endCompletionTime(optParams.getOrElse(DataSourceReadOptions.END_COMMIT.key, null))
-      .skipClustering(optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.key(),
-        String.valueOf(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.defaultValue)).toBoolean)
+      // do not support skip cluster for spark incremental query yet to avoid data duplication problem,
+      // see details in HUDI-9672.
+      // .skipClustering(optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.key(),
+      //   String.valueOf(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.defaultValue)).toBoolean)
       .skipCompaction(optParams.getOrElse(DataSourceReadOptions.INCREMENTAL_READ_SKIP_COMPACT.key(),
         String.valueOf(DataSourceReadOptions.INCREMENTAL_READ_SKIP_COMPACT.defaultValue)).toBoolean)
       .rangeType(rangeType)

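For context, here is a minimal sketch (not part of this commit) of how a Spark incremental read on a Hudi table might be configured, using the DataSourceReadOptions constants that appear in the diff above; the spark session, tablePath, and the commit-time values are placeholders. After this change, INCREMENTAL_READ_SKIP_CLUSTER no longer takes effect for Spark incremental queries, while INCREMENTAL_READ_SKIP_COMPACT is still honored.

import org.apache.hudi.DataSourceReadOptions

// Sketch of an incremental read; paths and times below are placeholders.
val incrementalDf = spark.read
  .format("hudi")
  .option(DataSourceReadOptions.QUERY_TYPE.key, DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL)
  // Completion-time range to read from (placeholder values).
  .option(DataSourceReadOptions.START_COMMIT.key, "20240101000000000")
  .option(DataSourceReadOptions.END_COMMIT.key, "20240102000000000")
  // Still honored: whether to skip compaction instants.
  .option(DataSourceReadOptions.INCREMENTAL_READ_SKIP_COMPACT.key, "false")
  // Ignored after this commit: clustering (replace) commits are always included,
  // because skipping them could duplicate rows in the result (HUDI-9672).
  .option(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.key, "true")
  .load(tablePath)

incrementalDf.show()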
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestStreamingSource.scala

Lines changed: 35 additions & 39 deletions
@@ -213,54 +213,50 @@ class TestStreamingSource extends StreamTest {
   }
 
   test("Test mor streaming source with clustering") {
-    Array("true", "false").foreach(skipCluster => {
-      withTempDir { inputDir =>
-        val tablePath = s"${inputDir.getCanonicalPath}/test_mor_stream_cluster"
-        val metaClient = HoodieTableMetaClient.newTableBuilder()
-          .setTableType(MERGE_ON_READ)
-          .setTableName(getTableName(tablePath))
-          .setRecordKeyFields("id")
-          .setPreCombineField("ts")
-          .initTable(HadoopFSUtils.getStorageConf(spark.sessionState.newHadoopConf()), tablePath)
+    withTempDir { inputDir =>
+      val tablePath = s"${inputDir.getCanonicalPath}/test_mor_stream_cluster"
+      val metaClient = HoodieTableMetaClient.newTableBuilder()
+        .setTableType(MERGE_ON_READ)
+        .setTableName(getTableName(tablePath))
+        .setRecordKeyFields("id")
+        .setPreCombineField("ts")
+        .initTable(HadoopFSUtils.getStorageConf(spark.sessionState.newHadoopConf()), tablePath)
 
-        addData(tablePath, Seq(("1", "a1", "10", "000")))
-        addData(tablePath, Seq(("2", "a1", "11", "001")))
-        addData(tablePath, Seq(("3", "a1", "12", "002")))
-        addData(tablePath, Seq(("4", "a1", "13", "003")), enableInlineCluster = true)
-        addData(tablePath, Seq(("5", "a1", "14", "004")))
+      addData(tablePath, Seq(("1", "a1", "10", "000")))
+      addData(tablePath, Seq(("2", "a1", "11", "001")))
+      addData(tablePath, Seq(("3", "a1", "12", "002")))
+      addData(tablePath, Seq(("4", "a1", "13", "003")), enableInlineCluster = true)
+      addData(tablePath, Seq(("5", "a1", "14", "004")))
 
-        val timestamp =
-          metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants()
-            .firstInstant().get().getCompletionTime
+      val timestamp =
+        metaClient.getActiveTimeline.getCommitsTimeline.filterCompletedInstants()
+          .firstInstant().get().getCompletionTime
 
-        val df = spark.readStream
-          .format("org.apache.hudi")
-          .option(START_OFFSET.key(), timestamp)
-          .option(DataSourceReadOptions.INCREMENTAL_READ_SKIP_CLUSTER.key(), skipCluster)
-          .load(tablePath)
-          .select("id", "name", "price", "ts")
+      val df = spark.readStream
+        .format("org.apache.hudi")
+        .option(START_OFFSET.key(), timestamp)
+        .load(tablePath)
+        .select("id", "name", "price", "ts")
 
-        testStream(df)(
-          AssertOnQuery { q => q.processAllAvailable(); true },
-          // Start after the first commit
-          CheckAnswerRows(Seq(
-            Row("2", "a1", "11", "001"),
-            Row("3", "a1", "12", "002"),
-            Row("4", "a1", "13", "003"),
-            Row("5", "a1", "14", "004")), lastOnly = true, isSorted = false)
-        )
-        assertTrue(metaClient.reloadActiveTimeline
-          .filter(JavaConversions.getPredicate(
-            e => e.isCompleted && HoodieTimeline.REPLACE_COMMIT_ACTION.equals(e.getAction)))
-          .countInstants() > 0)
-      }
-    })
+      testStream(df)(
+        AssertOnQuery { q => q.processAllAvailable(); true },
+        // Start after the first commit
+        CheckAnswerRows(Seq(
+          Row("2", "a1", "11", "001"),
+          Row("3", "a1", "12", "002"),
+          Row("4", "a1", "13", "003"),
+          Row("5", "a1", "14", "004")), lastOnly = true, isSorted = false))
+      assertTrue(metaClient.reloadActiveTimeline
+        .filter(JavaConversions.getPredicate(
+          e => e.isCompleted && HoodieTimeline.REPLACE_COMMIT_ACTION.equals(e.getAction)))
+        .countInstants() > 0)
+    }
   }
 
   test("test mor stream source with compaction") {
     Array("true", "false").foreach(skipCompact => {
       withTempDir { inputDir =>
-        val tablePath = s"${inputDir.getCanonicalPath}/test_mor_stream"
+        val tablePath = s"${inputDir.getCanonicalPath}/test_mor_stream_$skipCompact"
         val metaClient = HoodieTableMetaClient.newTableBuilder()
           .setTableType(MERGE_ON_READ)
           .setTableName(getTableName(tablePath))
