[SPARK-44705][PYTHON] Make PythonRunner single-threaded #42385

Closed · wants to merge 7 commits · Changes from 1 commit
@@ -30,8 +30,10 @@ import org.apache.spark.annotation.DeveloperApi
* Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends.
*
* @since 3.1.0
* @deprecated since 3.5.0 as its only usage for Python evaluation is now extinct
*/
@DeveloperApi
@deprecated("Only usage for Python evaluation is now extinct", "3.5.0")
Member (conversation marked as resolved by utkarsh39):
@utkarsh39 This should be 4.0.0.

Contributor Author:
PR to fix it: #42494

class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T])
extends Iterator[T] {

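For context on what is being deprecated: ContextAwareIterator only guards a delegate iterator so that it stops yielding elements once the owning task has ended, which mattered when a separate writer thread could keep consuming after task completion. A minimal sketch of that idea, written from the scaladoc above rather than copied from the Spark source (member details may differ), looks like this:

import org.apache.spark.TaskContext

// Sketch of the guard described above: stop yielding elements once the
// owning task has completed or been interrupted, so a consumer on another
// thread cannot keep reading after the task has released its resources.
class GuardedIterator[+T](context: TaskContext, delegate: Iterator[T]) extends Iterator[T] {
  override def hasNext: Boolean =
    !context.isCompleted() && !context.isInterrupted() && delegate.hasNext

  override def next(): T = delegate.next()
}

With this PR making PythonRunner single-threaded, the task thread itself feeds the Python worker, so this guard has no remaining Python-side use, which is what the deprecation message records.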
11 changes: 5 additions & 6 deletions core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -18,7 +18,6 @@
package org.apache.spark

import java.io.File
import java.net.Socket
import java.util.Locale

import scala.collection.JavaConverters._
@@ -30,7 +29,7 @@ import com.google.common.cache.CacheBuilder
import org.apache.hadoop.conf.Configuration

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.python.PythonWorkerFactory
import org.apache.spark.api.python.{PythonWorker, PythonWorkerFactory}
import org.apache.spark.broadcast.BroadcastManager
import org.apache.spark.executor.ExecutorBackend
import org.apache.spark.internal.{config, Logging}
@@ -129,7 +128,7 @@ class SparkEnv (
pythonExec: String,
workerModule: String,
daemonModule: String,
envVars: Map[String, String]): (java.net.Socket, Option[Int]) = {
envVars: Map[String, String]): (PythonWorker, Option[Int]) = {
synchronized {
val key = PythonWorkersKey(pythonExec, workerModule, daemonModule, envVars)
pythonWorkers.getOrElseUpdate(key,
@@ -140,7 +139,7 @@
private[spark] def createPythonWorker(
pythonExec: String,
workerModule: String,
envVars: Map[String, String]): (java.net.Socket, Option[Int]) = {
envVars: Map[String, String]): (PythonWorker, Option[Int]) = {
createPythonWorker(
pythonExec, workerModule, PythonWorkerFactory.defaultDaemonModule, envVars)
}
@@ -161,7 +160,7 @@
pythonExec: String,
workerModule: String,
envVars: Map[String, String],
worker: Socket): Unit = {
worker: PythonWorker): Unit = {
destroyPythonWorker(
pythonExec, workerModule, PythonWorkerFactory.defaultDaemonModule, envVars, worker)
}
@@ -171,7 +170,7 @@
workerModule: String,
daemonModule: String,
envVars: Map[String, String],
worker: Socket): Unit = {
worker: PythonWorker): Unit = {
synchronized {
val key = PythonWorkersKey(pythonExec, workerModule, daemonModule, envVars)
pythonWorkers.get(key).foreach(_.releaseWorker(worker))
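The SparkEnv changes above only swap java.net.Socket for a PythonWorker handle in the create/release/destroy worker APIs. As a rough illustration of why that matters for a single-threaded runner, the sketch below shows a hypothetical handle (the name WorkerHandle and its members are illustrative, not the PR's actual PythonWorker) that wraps a selectable SocketChannel instead of a blocking Socket:

import java.nio.channels.SocketChannel

// Hypothetical sketch only: a small handle around the channel to the Python
// process. A SocketChannel can be registered with a Selector and driven in
// non-blocking mode, which is what lets one thread interleave writing task
// input with reading results, instead of dedicating a writer thread per task.
case class WorkerHandle(channel: SocketChannel) {
  def stop(): Unit = if (channel.isOpen) channel.close()
}

The worker-broadcast bookkeeping in PythonRDD below is keyed by the same handle type.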
22 changes: 18 additions & 4 deletions core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -137,15 +137,15 @@ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte]
private[spark] object PythonRDD extends Logging {

// remember the broadcasts sent to each worker
private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]()
private val workerBroadcasts = new mutable.WeakHashMap[PythonWorker, mutable.Set[Long]]()

// Authentication helper used when serving iterator data.
private lazy val authHelper = {
val conf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
new SocketAuthHelper(conf)
}

def getWorkerBroadcasts(worker: Socket): mutable.Set[Long] = {
def getWorkerBroadcasts(worker: PythonWorker): mutable.Set[Long] = {
synchronized {
workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]())
}
@@ -300,7 +300,11 @@ private[spark] object PythonRDD extends Logging {
new PythonBroadcast(path)
}

def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Unit = {
/**
* Writes the next element of the iterator `iter` to `dataOut`. Returns true if any data was
* written to the stream. Returns false if no data was written as the iterator has been exhausted.
*/
def writeNextElementToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Boolean = {

def write(obj: Any): Unit = obj match {
case null =>
@@ -318,8 +322,18 @@
case other =>
throw new SparkException("Unexpected element type " + other.getClass)
}
if (iter.hasNext) {
write(iter.next())
true
} else {
false
}
}

iter.foreach(write)
def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Unit = {
while (writeNextElementToStream(iter, dataOut)) {
// Nothing.
}
}

/**
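The refactoring above splits the old drain-everything loop (iter.foreach(write)) into writeNextElementToStream, which writes at most one element per call and returns true only if something was written; writeIteratorToStream now just loops until it returns false. The point of the finer-grained call is that a single-threaded caller can write one element, then service other work (such as reading output from the Python worker), and come back. The example below is a self-contained, plain-Scala illustration of the same contract; it is not Spark code and the names are made up:

import java.io.{ByteArrayOutputStream, DataOutputStream}

object WriteOneAtATimeExample {
  // Same contract as writeNextElementToStream: write at most one element,
  // return true if something was written, false once the iterator is empty.
  def writeNextElement(iter: Iterator[Array[Byte]], out: DataOutputStream): Boolean = {
    if (iter.hasNext) {
      val bytes = iter.next()
      out.writeInt(bytes.length) // length-prefixed framing, as in the byte-array case above
      out.write(bytes)
      true
    } else {
      false
    }
  }

  def main(args: Array[String]): Unit = {
    val buffer = new ByteArrayOutputStream()
    val out = new DataOutputStream(buffer)
    val data = Iterator(Array[Byte](1, 2), Array[Byte](3))
    // Drain the iterator one element per loop pass, exactly like the new
    // writeIteratorToStream does with its while loop.
    while (writeNextElement(data, out)) {}
    out.flush()
    println(s"wrote ${buffer.size} bytes") // 4 + 2 + 4 + 1 = 11
  }
}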