[SPARK-44705][PYTHON] Make PythonRunner single-threaded #42385

Closed · wants to merge 7 commits · Changes from 1 commit
@@ -30,8 +30,10 @@ import org.apache.spark.annotation.DeveloperApi
* Thus, we should use [[ContextAwareIterator]] to stop consuming after the task ends.
*
* @since 3.1.0
* @deprecated since 3.5.0 as its only usage for Python evaluation is now extinct
*/
@DeveloperApi
@deprecated("Only usage for Python evaluation is now extinct", "3.5.0")
Member (conversation marked as resolved by utkarsh39):
@utkarsh39 This should be 4.0.0.

Contributor Author:
PR to fix it: #42494

class ContextAwareIterator[+T](val context: TaskContext, val delegate: Iterator[T])
extends Iterator[T] {

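For context on what is being deprecated: ContextAwareIterator only guards a delegate iterator so that it stops yielding elements once the owning task has ended, which mattered when a separate writer thread could keep consuming after task completion. A minimal sketch of that idea, written from the scaladoc above rather than copied from the Spark source (member details may differ), looks like this:

import org.apache.spark.TaskContext

// Sketch of the guard described above: stop yielding elements once the
// owning task has completed or been interrupted, so a consumer on another
// thread cannot keep reading after the task has released its resources.
class GuardedIterator[+T](context: TaskContext, delegate: Iterator[T]) extends Iterator[T] {
  override def hasNext: Boolean =
    !context.isCompleted() && !context.isInterrupted() && delegate.hasNext

  override def next(): T = delegate.next()
}

With this PR making PythonRunner single-threaded, the task thread itself feeds the Python worker, so this guard has no remaining Python-side use, which is what the deprecation message records.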
11 changes: 5 additions & 6 deletions core/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -18,7 +18,6 @@
package org.apache.spark

import java.io.File
import java.net.Socket
import java.util.Locale

import scala.collection.JavaConverters._
@@ -30,7 +29,7 @@ import com.google.common.cache.CacheBuilder
import org.apache.hadoop.conf.Configuration

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.python.PythonWorkerFactory
import org.apache.spark.api.python.{PythonWorker, PythonWorkerFactory}
import org.apache.spark.broadcast.BroadcastManager
import org.apache.spark.executor.ExecutorBackend
import org.apache.spark.internal.{config, Logging}
@@ -129,7 +128,7 @@ class SparkEnv (
pythonExec: String,
workerModule: String,
daemonModule: String,
envVars: Map[String, String]): (java.net.Socket, Option[Int]) = {
envVars: Map[String, String]): (PythonWorker, Option[Int]) = {
synchronized {
val key = PythonWorkersKey(pythonExec, workerModule, daemonModule, envVars)
pythonWorkers.getOrElseUpdate(key,
@@ -140,7 +139,7 @@
private[spark] def createPythonWorker(
pythonExec: String,
workerModule: String,
envVars: Map[String, String]): (java.net.Socket, Option[Int]) = {
envVars: Map[String, String]): (PythonWorker, Option[Int]) = {
createPythonWorker(
pythonExec, workerModule, PythonWorkerFactory.defaultDaemonModule, envVars)
}
@@ -161,7 +160,7 @@
pythonExec: String,
workerModule: String,
envVars: Map[String, String],
worker: Socket): Unit = {
worker: PythonWorker): Unit = {
destroyPythonWorker(
pythonExec, workerModule, PythonWorkerFactory.defaultDaemonModule, envVars, worker)
}
@@ -171,7 +170,7 @@
workerModule: String,
daemonModule: String,
envVars: Map[String, String],
worker: Socket): Unit = {
worker: PythonWorker): Unit = {
synchronized {
val key = PythonWorkersKey(pythonExec, workerModule, daemonModule, envVars)
pythonWorkers.get(key).foreach(_.releaseWorker(worker))
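The SparkEnv changes above only swap java.net.Socket for a PythonWorker handle in the create/release/destroy worker APIs. As a rough illustration of why that matters for a single-threaded runner, the sketch below shows a hypothetical handle (the name WorkerHandle and its members are illustrative, not the PR's actual PythonWorker) that wraps a selectable SocketChannel instead of a blocking Socket:

import java.nio.channels.SocketChannel

// Hypothetical sketch only: a small handle around the channel to the Python
// process. A SocketChannel can be registered with a Selector and driven in
// non-blocking mode, which is what lets one thread interleave writing task
// input with reading results, instead of dedicating a writer thread per task.
case class WorkerHandle(channel: SocketChannel) {
  def stop(): Unit = if (channel.isOpen) channel.close()
}

The worker-broadcast bookkeeping in PythonRDD below is keyed by the same handle type.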
22 changes: 18 additions & 4 deletions core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -137,15 +137,15 @@ private class PairwiseRDD(prev: RDD[Array[Byte]]) extends RDD[(Long, Array[Byte]
private[spark] object PythonRDD extends Logging {

// remember the broadcasts sent to each worker
private val workerBroadcasts = new mutable.WeakHashMap[Socket, mutable.Set[Long]]()
private val workerBroadcasts = new mutable.WeakHashMap[PythonWorker, mutable.Set[Long]]()

// Authentication helper used when serving iterator data.
private lazy val authHelper = {
val conf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
new SocketAuthHelper(conf)
}

def getWorkerBroadcasts(worker: Socket): mutable.Set[Long] = {
def getWorkerBroadcasts(worker: PythonWorker): mutable.Set[Long] = {
synchronized {
workerBroadcasts.getOrElseUpdate(worker, new mutable.HashSet[Long]())
}
@@ -300,7 +300,11 @@ private[spark] object PythonRDD extends Logging {
new PythonBroadcast(path)
}

def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Unit = {
/**
* Writes the next element of the iterator `iter` to `dataOut`. Returns true if any data was
* written to the stream. Returns false if no data was written as the iterator has been exhausted.
*/
def writeNextElementToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Boolean = {

def write(obj: Any): Unit = obj match {
case null =>
@@ -318,8 +322,18 @@
case other =>
throw new SparkException("Unexpected element type " + other.getClass)
}
if (iter.hasNext) {
write(iter.next())
true
} else {
false
}
}

iter.foreach(write)
def writeIteratorToStream[T](iter: Iterator[T], dataOut: DataOutputStream): Unit = {
while (writeNextElementToStream(iter, dataOut)) {
// Nothing.
}
}

/**
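The refactoring above splits the old drain-everything loop (iter.foreach(write)) into writeNextElementToStream, which writes at most one element per call and returns true only if something was written; writeIteratorToStream now just loops until it returns false. The point of the finer-grained call is that a single-threaded caller can write one element, then service other work (such as reading output from the Python worker), and come back. The example below is a self-contained, plain-Scala illustration of the same contract; it is not Spark code and the names are made up:

import java.io.{ByteArrayOutputStream, DataOutputStream}

object WriteOneAtATimeExample {
  // Same contract as writeNextElementToStream: write at most one element,
  // return true if something was written, false once the iterator is empty.
  def writeNextElement(iter: Iterator[Array[Byte]], out: DataOutputStream): Boolean = {
    if (iter.hasNext) {
      val bytes = iter.next()
      out.writeInt(bytes.length) // length-prefixed framing, as in the byte-array case above
      out.write(bytes)
      true
    } else {
      false
    }
  }

  def main(args: Array[String]): Unit = {
    val buffer = new ByteArrayOutputStream()
    val out = new DataOutputStream(buffer)
    val data = Iterator(Array[Byte](1, 2), Array[Byte](3))
    // Drain the iterator one element per loop pass, exactly like the new
    // writeIteratorToStream does with its while loop.
    while (writeNextElement(data, out)) {}
    out.flush()
    println(s"wrote ${buffer.size} bytes") // 4 + 2 + 4 + 1 = 11
  }
}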