
Commit bb655de

Shufflevault: Shuffle on S3
1 parent 4eb56bb commit bb655de

36 files changed, +948 −46 lines

core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java

Lines changed: 9 additions & 3 deletions
@@ -98,6 +98,7 @@ final class BypassMergeSortShuffleWriter<K, V>
   private final long mapId;
   private final Serializer serializer;
   private final ShuffleExecutorComponents shuffleExecutorComponents;
+  private final boolean remoteWrites;
 
   /** Array of file writers, one for each partition */
   private DiskBlockObjectWriter[] partitionWriters;
@@ -136,6 +137,7 @@ final class BypassMergeSortShuffleWriter<K, V>
     this.mapId = mapId;
     this.shuffleId = dep.shuffleId();
     this.partitioner = dep.partitioner();
+    this.remoteWrites = dep.useRemoteShuffleStorage();
     this.numPartitions = partitioner.numPartitions();
     this.writeMetrics = writeMetrics;
     this.serializer = dep.serializer();
@@ -149,12 +151,14 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
     assert (partitionWriters == null);
     ShuffleMapOutputWriter mapOutputWriter = shuffleExecutorComponents
       .createMapOutputWriter(shuffleId, mapId, numPartitions);
+    BlockManagerId blockManagerId = remoteWrites ?
+      RemoteShuffleStorage.BLOCK_MANAGER_ID() : blockManager.shuffleServerId();
     try {
       if (!records.hasNext()) {
         partitionLengths = mapOutputWriter.commitAllPartitions(
           ShuffleChecksumHelper.EMPTY_CHECKSUM_VALUE).getPartitionLengths();
         mapStatus = MapStatus$.MODULE$.apply(
-          blockManager.shuffleServerId(), partitionLengths, mapId, getAggregatedChecksumValue());
+          blockManagerId, partitionLengths, mapId, getAggregatedChecksumValue());
         return;
       }
       final SerializerInstance serInstance = serializer.newInstance();
@@ -196,7 +200,7 @@ public void write(Iterator<Product2<K, V>> records) throws IOException {
 
       partitionLengths = writePartitionedData(mapOutputWriter);
       mapStatus = MapStatus$.MODULE$.apply(
-        blockManager.shuffleServerId(), partitionLengths, mapId, getAggregatedChecksumValue());
+        blockManagerId, partitionLengths, mapId, getAggregatedChecksumValue());
     } catch (Exception e) {
       try {
         mapOutputWriter.abort(e);
@@ -236,8 +240,10 @@ private long[] writePartitionedData(ShuffleMapOutputWriter mapOutputWriter) thro
     try {
       for (int i = 0; i < numPartitions; i++) {
         final File file = partitionWriterSegments[i].file();
-        ShufflePartitionWriter writer = mapOutputWriter.getPartitionWriter(i);
         if (file.exists()) {
+          // TODO: Remove this comment: the line below was moved so that assertions
+          // can be added and it is safer in general
+          ShufflePartitionWriter writer = mapOutputWriter.getPartitionWriter(i);
           if (transferToEnabled) {
             // Using WritableByteChannelWrapper to make resource closing consistent between
             // this implementation and UnsafeShuffleWriter.
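
Both writers follow the same pattern: when remoteWrites is set, the MapStatus advertises RemoteShuffleStorage.BLOCK_MANAGER_ID() instead of the local shuffle server, so reducers can tell that the map output lives in remote storage rather than on an executor. The RemoteShuffleStorage object itself is not shown in this hunk; a minimal sketch of what the sentinel id might look like, with the executor id, host, and port chosen purely for illustration (modeled on how FallbackStorage advertises a synthetic id):

package org.apache.spark.storage

// Hypothetical sketch, not the commit's implementation: a synthetic BlockManagerId
// that no executor actually serves. Readers that see this id in a MapStatus would
// resolve the block against spark.shuffle.remote.storage.path instead of fetching
// it from a shuffle server.
private[spark] object RemoteShuffleStorage {
  val BLOCK_MANAGER_ID: BlockManagerId = BlockManagerId("remote-shuffle", "remote", 7337)
}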

core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java

Lines changed: 7 additions & 1 deletion
@@ -26,6 +26,8 @@
 import java.nio.channels.WritableByteChannel;
 import java.util.Iterator;
 
+import org.apache.spark.storage.BlockManagerId;
+import org.apache.spark.storage.RemoteShuffleStorage;
 import scala.Option;
 import scala.Product2;
 import scala.jdk.javaapi.CollectionConverters;
@@ -89,6 +91,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
   private final boolean transferToEnabled;
   private final int initialSortBufferSize;
   private final int mergeBufferSizeInBytes;
+  private final boolean remoteWrites;
 
   @Nullable private MapStatus mapStatus;
   @Nullable private ShuffleExternalSorter sorter;
@@ -135,6 +138,7 @@ public UnsafeShuffleWriter(
     this.shuffleId = dep.shuffleId();
     this.serializer = dep.serializer().newInstance();
     this.partitioner = dep.partitioner();
+    this.remoteWrites = dep.useRemoteShuffleStorage();
     this.writeMetrics = writeMetrics;
     this.shuffleExecutorComponents = shuffleExecutorComponents;
     this.taskContext = taskContext;
@@ -247,8 +251,10 @@ void closeAndWriteOutput() throws IOException {
         }
       }
     }
+    BlockManagerId blockManagerId = remoteWrites ?
+      RemoteShuffleStorage.BLOCK_MANAGER_ID() : blockManager.shuffleServerId();
     mapStatus = MapStatus$.MODULE$.apply(
-      blockManager.shuffleServerId(), partitionLengths, mapId, getAggregatedChecksumValue());
+      blockManagerId, partitionLengths, mapId, getAggregatedChecksumValue());
   }
 
   @VisibleForTesting
core/src/main/java/org/apache/spark/storage/FileSystemManagedBuffer.java

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.storage;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+import org.apache.spark.network.buffer.ManagedBuffer;
+
+/**
+ * A {@link ManagedBuffer} backed by a file on a Hadoop {@link FileSystem}.
+ * This buffer creates an input stream with a configurable buffer size (64MB by default)
+ * for efficient reading.
+ * Note: This implementation throws UnsupportedOperationException for methods that
+ * require loading the entire file into memory (nioByteBuffer, convertToNetty,
+ * convertToNettyForSsl), as files can be very large and loading them entirely into
+ * memory is not practical.
+ */
+public class FileSystemManagedBuffer extends ManagedBuffer {
+  private int bufferSize; // read buffer size in MB (defaults to 64)
+  private final Path filePath;
+  private final long fileSize;
+  private final Configuration hadoopConf;
+
+  public FileSystemManagedBuffer(Path filePath, Configuration hadoopConf) throws IOException {
+    this.filePath = filePath;
+    this.hadoopConf = hadoopConf;
+    // Get file size using FileSystem.newInstance to avoid cached dependencies
+    FileSystem fileSystem = FileSystem.newInstance(filePath.toUri(), hadoopConf);
+    try {
+      this.fileSize = fileSystem.getFileStatus(filePath).getLen();
+    } finally {
+      fileSystem.close();
+    }
+    bufferSize = 64;
+  }
+
+  public FileSystemManagedBuffer(Path filePath, Configuration hadoopConf, int bufferSize)
+      throws IOException {
+    this(filePath, hadoopConf);
+    this.bufferSize = bufferSize;
+  }
+
+  @Override
+  public long size() {
+    return fileSize;
+  }
+
+  @Override
+  public ByteBuffer nioByteBuffer() throws IOException {
+    throw new UnsupportedOperationException(
+      "FileSystemManagedBuffer does not support nioByteBuffer() as it would require loading " +
+      "the entire file into memory, which is not practical for large files. " +
+      "Use createInputStream() instead.");
+  }
+
+  @Override
+  public InputStream createInputStream() throws IOException {
+    // Create a new FileSystem instance to avoid cached dependencies and open the file
+    // with the configured buffer size (in MB) for efficient reading
+    FileSystem fileSystem = FileSystem.newInstance(filePath.toUri(), hadoopConf);
+    return fileSystem.open(filePath, bufferSize * 1024 * 1024);
+  }
+
+  @Override
+  public ManagedBuffer retain() {
+    // FileSystemManagedBuffer doesn't use reference counting, so just return this
+    return this;
+  }
+
+  @Override
+  public ManagedBuffer release() {
+    // FileSystemManagedBuffer doesn't use reference counting, so just return this
+    return this;
+  }
+
+  @Override
+  public Object convertToNetty() {
+    throw new UnsupportedOperationException(
+      "FileSystemManagedBuffer does not support convertToNetty() as it would require loading " +
+      "the entire file into memory, which is not practical for large files. " +
+      "Use createInputStream() instead.");
+  }
+
+  @Override
+  public Object convertToNettyForSsl() {
+    throw new UnsupportedOperationException(
+      "FileSystemManagedBuffer does not support convertToNettyForSsl() as it would require " +
+      "loading the entire file into memory, which is not practical for large files. " +
+      "Use createInputStream() instead.");
+  }
+
+  @Override
+  public String toString() {
+    return "FileSystemManagedBuffer[file=" + filePath + ",length=" + fileSize + "]";
+  }
+}
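
A short usage sketch for the new buffer from Scala; the bucket and file name below are made up for illustration, and any Hadoop-compatible URI (s3a://, hdfs://, file://) should work the same way:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.storage.FileSystemManagedBuffer

val hadoopConf = new Configuration()
// Hypothetical shuffle data file previously written to remote storage.
val path = new Path("s3a://my-bucket/spark-shuffle/shuffle_0_1_0.data")

val buffer = new FileSystemManagedBuffer(path, hadoopConf) // 64MB read buffer by default
val in = buffer.createInputStream()
try {
  println(s"block size: ${buffer.size()} bytes, first byte: ${in.read()}")
  // nioByteBuffer() and convertToNetty() deliberately throw; always stream instead.
} finally {
  in.close()
}

The three-argument constructor overrides the read buffer size, expressed in MB.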

core/src/main/scala/org/apache/spark/Dependency.scala

Lines changed: 6 additions & 2 deletions
@@ -90,7 +90,9 @@ class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     val mapSideCombine: Boolean = false,
     val shuffleWriterProcessor: ShuffleWriteProcessor = new ShuffleWriteProcessor,
     val rowBasedChecksums: Array[RowBasedChecksum] = ShuffleDependency.EMPTY_ROW_BASED_CHECKSUMS,
-    val checksumMismatchFullRetryEnabled: Boolean = false)
+    val checksumMismatchFullRetryEnabled: Boolean = false,
+    val useRemoteShuffleStorage: Boolean = false
+  )
   extends Dependency[Product2[K, V]] with Logging {
 
   def this(
@@ -249,7 +251,9 @@ class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
     )
   }
 
-  _rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
+  if (!useRemoteShuffleStorage) {
+    _rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
+  }
   _rdd.sparkContext.shuffleDriverComponents.registerShuffle(shuffleId)
 }

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 7 additions & 0 deletions
@@ -650,6 +650,8 @@ class SparkContext(config: SparkConf) extends Logging {
     _env.blockManager.initialize(_applicationId)
     FallbackStorage.registerBlockManagerIfNeeded(
       _env.blockManager.master, _conf, _hadoopConfiguration)
+    RemoteShuffleStorage.registerBlockManagerifNeeded(_env.blockManager.master, _conf,
+      _hadoopConfiguration)
 
     // The metrics system for Driver need to be set spark.app.id to app ID.
     // So it should start after we get app ID from the task scheduler and set spark.app.id.
@@ -2377,6 +2379,11 @@ class SparkContext(config: SparkConf) extends Logging {
     Utils.tryLogNonFatalError {
       FallbackStorage.cleanUp(_conf, _hadoopConfiguration)
     }
+
+    Utils.tryLogNonFatalError {
+      RemoteShuffleStorage.cleanUp(_conf, _hadoopConfiguration)
+    }
+
     Utils.tryLogNonFatalError {
       _eventLogger.foreach(_.stop())
     }

core/src/main/scala/org/apache/spark/internal/config/package.scala

Lines changed: 35 additions & 0 deletions
@@ -626,6 +626,13 @@ package object config {
       .checkValue(_.endsWith(java.io.File.separator), "Path should end with separator.")
       .createOptional
 
+  private[spark] val SHUFFLE_REMOTE_STORAGE_CLEANUP =
+    ConfigBuilder("spark.shuffle.remote.storage.cleanUp")
+      .doc("If true, Spark cleans up its remote shuffle storage data during shutdown.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(false)
+
   private[spark] val STORAGE_DECOMMISSION_SHUFFLE_MAX_DISK_SIZE =
     ConfigBuilder("spark.storage.decommission.shuffleBlocks.maxDiskSize")
       .doc("Maximum disk space to use to store shuffle blocks before rejecting remote " +
@@ -2905,4 +2912,32 @@ package object config {
       .checkValue(v => v.forall(Set("stdout", "stderr").contains),
         "The value only can be one or more of 'stdout, stderr'.")
       .createWithDefault(Seq("stdout", "stderr"))
+
+  private[spark] val SHUFFLE_REMOTE_STORAGE_PATH =
+    ConfigBuilder("spark.shuffle.remote.storage.path")
+      .doc("The location for storing shuffle blocks on remote storage.")
+      .version("4.1.0")
+      .stringConf
+      .checkValue(_.endsWith(java.io.File.separator), "Path should end with separator.")
+      .createOptional
+
+  private[spark] val REMOTE_SHUFFLE_BUFFER_SIZE =
+    ConfigBuilder("spark.shuffle.remote.buffer.size")
+      .version("4.1.0")
+      .stringConf
+      .createWithDefault("64M")
+
+  private[spark] val START_REDUCERS_IN_PARALLEL_TO_MAPPER =
+    ConfigBuilder("spark.shuffle.consolidation.enabled")
+      .doc("If true, starts reducers in parallel with mappers.")
+      .version("4.1.0")
+      .booleanConf
+      .createWithDefault(false)
+
+  private[spark] val EAGERNESS_THRESHOLD_PERCENTAGE =
+    ConfigBuilder("spark.shuffle.remote.eagerness.percentage")
+      .doc("Percentage of map tasks that must complete before reducers are started.")
+      .version("4.1.0")
+      .intConf
+      .createWithDefault(20)
 }
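
Taken together, a job would opt into remote shuffle storage with settings along these lines (values are illustrative; the storage path must end with a separator to pass the checkValue above):

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.shuffle.remote.storage.path", "s3a://my-bucket/spark-shuffle/") // trailing separator required
  .set("spark.shuffle.remote.storage.cleanUp", "true")     // delete remote shuffle data on shutdown
  .set("spark.shuffle.remote.buffer.size", "64M")          // read buffer for remote shuffle files
  .set("spark.shuffle.consolidation.enabled", "true")      // start reducers in parallel with mappers
  .set("spark.shuffle.remote.eagerness.percentage", "20")  // % of map tasks done before reducers start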

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 6 additions & 1 deletion
@@ -1755,7 +1755,12 @@ private[spark] class DAGScheduler(
         log"${MDC(STAGE, stage)} (${MDC(RDD_ID, stage.rdd)}) (first 15 tasks are " +
         log"for partitions ${MDC(PARTITION_IDS, tasks.take(15).map(_.partitionId))})")
     val shuffleId = stage match {
-      case s: ShuffleMapStage => Some(s.shuffleDep.shuffleId)
+      case s: ShuffleMapStage =>
+        // hack to prioritize remote shuffle writes
+        if (properties != null) {
+          properties.setProperty("remote", s.shuffleDep.useRemoteShuffleStorage.toString)
+        }
+        Some(s.shuffleDep.shuffleId)
       case _: ResultStage => None
     }

core/src/main/scala/org/apache/spark/scheduler/Pool.scala

Lines changed: 4 additions & 1 deletion
@@ -53,8 +53,11 @@ private[spark] class Pool(
         new FairSchedulingAlgorithm()
       case SchedulingMode.FIFO =>
         new FIFOSchedulingAlgorithm()
+      case SchedulingMode.WEIGHTED_FIFO =>
+        new WeightedFIFOSchedulingAlgorithm()
       case _ =>
-        val msg = s"Unsupported scheduling mode: $schedulingMode. Use FAIR or FIFO instead."
+        val msg = s"Unsupported scheduling mode: $schedulingMode. Use FAIR, FIFO," +
+          s" or WEIGHTED_FIFO instead."
         throw new IllegalArgumentException(msg)
     }
   }

core/src/main/scala/org/apache/spark/scheduler/SchedulingAlgorithm.scala

Lines changed: 19 additions & 0 deletions
@@ -40,6 +40,25 @@ private[spark] class FIFOSchedulingAlgorithm extends SchedulingAlgorithm {
   }
 }
 
+private[spark] class WeightedFIFOSchedulingAlgorithm extends SchedulingAlgorithm {
+  override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
+    val priority1 = s1.priority
+    val priority2 = s2.priority
+    var res = math.signum(priority1 - priority2)
+    if (res == 0) {
+      if (s1.weight == s2.weight) {
+        val stageId1 = s1.stageId
+        val stageId2 = s2.stageId
+        res = math.signum(stageId1 - stageId2)
+      } else {
+        // The higher the weight, the earlier it should run (unlike priority)
+        res = math.signum(s2.weight - s1.weight)
+      }
+    }
+    res < 0
+  }
+}
+
 private[spark] class FairSchedulingAlgorithm extends SchedulingAlgorithm {
   override def comparator(s1: Schedulable, s2: Schedulable): Boolean = {
     val minShare1 = s1.minShare
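
WEIGHTED_FIFO orders by priority first, exactly like FIFO, and only uses weight to break ties, with the higher weight scheduled earlier and stage id breaking any remaining tie. A standalone illustration of that ordering (a toy Entry class instead of Spark's Schedulable; the mode itself would presumably be selected with spark.scheduler.mode=WEIGHTED_FIFO):

// Toy stand-in for Schedulable, for illustration only.
case class Entry(priority: Int, weight: Int, stageId: Int)

def comesFirst(s1: Entry, s2: Entry): Boolean = {
  var res = math.signum(s1.priority - s2.priority)
  if (res == 0) {
    res =
      if (s1.weight == s2.weight) math.signum(s1.stageId - s2.stageId)
      else math.signum(s2.weight - s1.weight) // higher weight runs earlier
  }
  res < 0
}

// Same priority: the heavier entry is scheduled first.
assert(comesFirst(Entry(priority = 1, weight = 5, stageId = 7),
  Entry(priority = 1, weight = 1, stageId = 3)))
// Different priorities: the lower priority value still wins regardless of weight.
assert(comesFirst(Entry(priority = 0, weight = 1, stageId = 9),
  Entry(priority = 2, weight = 100, stageId = 1)))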

core/src/main/scala/org/apache/spark/scheduler/SchedulingMode.scala

Lines changed: 2 additions & 1 deletion
@@ -20,10 +20,11 @@ package org.apache.spark.scheduler
 /**
  * "FAIR" and "FIFO" determines which policy is used
  * to order tasks amongst a Schedulable's sub-queues
+ * "WEIGHTED_FIFO" is similar to FIFO but additionally compares weights when priorities are equal.
  * "NONE" is used when the a Schedulable has no sub-queues.
  */
 object SchedulingMode extends Enumeration {
 
   type SchedulingMode = Value
-  val FAIR, FIFO, NONE = Value
+  val FAIR, FIFO, WEIGHTED_FIFO, NONE = Value
 }
