[SPARK-32138] Drop Python 2.7, 3.4 and 3.5 #28957

Closed · wants to merge 2 commits
3 changes: 2 additions & 1 deletion .github/workflows/master.yml
@@ -129,7 +129,8 @@ jobs:
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# Yarn has a Python specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
66 changes: 0 additions & 66 deletions core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
@@ -45,79 +45,13 @@ private[spark] object SerDeUtil extends Logging {
}
}
}
// Unpickle array.array generated by Python 2.6
class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
// /* Description of types */
// static struct arraydescr descriptors[] = {
// {'c', sizeof(char), c_getitem, c_setitem},
// {'b', sizeof(char), b_getitem, b_setitem},
// {'B', sizeof(char), BB_getitem, BB_setitem},
// #ifdef Py_USING_UNICODE
// {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
// #endif
// {'h', sizeof(short), h_getitem, h_setitem},
// {'H', sizeof(short), HH_getitem, HH_setitem},
// {'i', sizeof(int), i_getitem, i_setitem},
// {'I', sizeof(int), II_getitem, II_setitem},
// {'l', sizeof(long), l_getitem, l_setitem},
// {'L', sizeof(long), LL_getitem, LL_setitem},
// {'f', sizeof(float), f_getitem, f_setitem},
// {'d', sizeof(double), d_getitem, d_setitem},
// {'\0', 0, 0, 0} /* Sentinel */
// };
val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
)
} else {
Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
)
}
override def construct(args: Array[Object]): Object = {
if (args.length == 1) {
construct(args ++ Array(""))
} else if (args.length == 2 && args(1).isInstanceOf[String]) {
val typecode = args(0).asInstanceOf[String].charAt(0)
// This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
if (typecode == 'c') {
// It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
// a string for array data instead of list as Python 2.7, and handles an array of
// typecode 'c' as 1-byte character.
val result = new Array[Char](data.length)
var i = 0
while (i < data.length) {
result(i) = data(i).toChar
i += 1
}
result
} else {
construct(typecode, machineCodes(typecode), data)
}
} else if (args.length == 2 && args(0) == "l") {
// On Python 2, an array of typecode 'l' should be handled as long rather than int.
val values = args(1).asInstanceOf[JArrayList[_]]
val result = new Array[Long](values.size)
var i = 0
while (i < values.size) {
result(i) = values.get(i).asInstanceOf[Number].longValue()
i += 1
}
result
} else {
super.construct(args)
}
}
}

private var initialized = false
// This should be called before trying to unpickle array.array from Python
// In cluster mode, this should be put in closure
def initialize(): Unit = {
synchronized{
if (!initialized) {
Unpickler.registerConstructor("array", "array", new ArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())
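For context on the deletion above: under Python 3, `array.array` objects pickle through the ordinary constructor path (at protocol 2 they reduce to a typecode plus a list of values), so there is no Python 2.6-style byte-string payload, no `'c'` typecode, and no long-vs-int ambiguity left for the JVM side to special-case. A minimal round-trip sketch, not part of this PR and using only the standard library:

```python
import array
import pickle

# Pickling at protocol 2: the payload reduces to a plain constructor call
# with a typecode and a list of values, which the stock Pyrolite
# ArrayConstructor should be able to rebuild without the removed overrides.
arr = array.array("i", [1, 2, 3])
payload = pickle.dumps(arr, protocol=2)

restored = pickle.loads(payload)
assert restored == arr
print(restored)  # array('i', [1, 2, 3])
```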
7 changes: 1 addition & 6 deletions dev/create-release/releaseutils.py
@@ -49,8 +49,6 @@
print("Install using 'sudo pip install unidecode'")
sys.exit(-1)

if sys.version < '3':
input = raw_input # noqa

# Contributors list file name
contributors_file_name = "contributors.txt"
@@ -152,10 +150,7 @@ def get_commits(tag):
if not is_valid_author(author):
author = github_username
# Guard against special characters
try: # Python 2
author = unicode(author, "UTF-8")
except NameError: # Python 3
author = str(author)
author = str(author)
author = unidecode.unidecode(author).strip()
commit = Commit(_hash, author, title, pr_number)
commits.append(commit)
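A small illustration of why the try/except shim above collapses to a single `str()` call (assuming `unidecode` is installed, which releaseutils.py already requires): every Python 3 `str` is Unicode, so no explicit decode step is needed before transliteration.

```python
from unidecode import unidecode

author = "José Ångström"          # hypothetical author name, not from the contributors file
author = str(author)              # already Unicode on Python 3; no unicode(author, "UTF-8") needed
print(unidecode(author).strip())  # Jose Angstrom
```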
11 changes: 3 additions & 8 deletions dev/github_jira_sync.py
@@ -22,14 +22,9 @@
import os
import re
import sys
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError

try:
import jira.client
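A hedged sketch of how the surviving Python 3 imports replace the removed `urllib2` ones; the URL and header below are illustrative, not copied from the script:

```python
from urllib.request import urlopen, Request
from urllib.error import HTTPError

url = "https://api.github.com/repos/apache/spark/issues/28957"  # illustrative URL
try:
    request = Request(url, headers={"Accept": "application/vnd.github.v3+json"})
    with urlopen(request) as response:          # urllib.request.urlopen replaces urllib2.urlopen
        body = response.read().decode("utf-8")
        print(len(body), "bytes fetched")
except HTTPError as error:                      # urllib.error.HTTPError replaces urllib2.HTTPError
    print("Request failed with HTTP status", error.code)
```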
10 changes: 9 additions & 1 deletion dev/lint-python
@@ -168,7 +168,15 @@ function sphinx_test {

# Check that the documentation builds acceptably, skip check if sphinx is not installed.
if ! hash "$SPHINX_BUILD" 2> /dev/null; then
echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
echo
return
fi

# TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
echo
return
fi
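The new shell check shells out to Python for the module lookup; a minimal sketch of what that one-liner evaluates (a sketch of the idea, not the exact lint-python code path):

```python
import importlib.util

# find_spec returns None when the module cannot be located, so this prints the
# literal string "True" or "False" that the surrounding bash test compares against.
print(importlib.util.find_spec("sphinx") is not None)
```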
12 changes: 3 additions & 9 deletions dev/merge_spark_pr.py
@@ -31,15 +31,9 @@
import subprocess
import sys
import traceback
if sys.version < '3':
input = raw_input # noqa
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError

try:
import jira.client
12 changes: 3 additions & 9 deletions dev/run-tests-jenkins.py
@@ -22,15 +22,9 @@
import json
import functools
import subprocess
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError, URLError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError

from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError

from sparktestsupport import SPARK_HOME, ERROR_CODES
from sparktestsupport.shellutils import run_cmd
3 changes: 1 addition & 2 deletions dev/sparktestsupport/toposort.py
@@ -24,8 +24,7 @@
# Moved functools import to the top of the file.
# Changed assert to a ValueError.
# Changed iter[items|keys] to [items|keys], for python 3
# compatibility. I don't think it matters for python 2 these are
# now lists instead of iterables.
# compatibility.
# Copy the input so as to leave it unmodified.
# Renamed function from toposort2 to toposort.
# Handle empty input.
2 changes: 1 addition & 1 deletion docs/configuration.md
@@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
</tr>
<tr>
<td><code>PYSPARK_PYTHON</code></td>
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
Property <code>spark.pyspark.python</code> takes precedence if it is set</td>
</tr>
<tr>
3 changes: 1 addition & 2 deletions docs/index.md
@@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).

Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.

Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
For the Scala API, Spark {{site.SPARK_VERSION}}
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
({{site.SCALA_BINARY_VERSION}}.x).
12 changes: 6 additions & 6 deletions docs/rdd-programming-guide.md
@@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;

<div data-lang="python" markdown="1">

Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
so C libraries like NumPy can be used. It also works with PyPy 2.3+.

Note that Python 2 support is deprecated as of Spark 3.0.0.
Support for Python 2, 3.4 and 3.5 was removed in Spark 3.1.0.

Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:

@@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:

{% highlight bash %}
$ PYSPARK_PYTHON=python3.4 bin/pyspark
$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
$ PYSPARK_PYTHON=python3.8 bin/pyspark
$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
{% endhighlight %}

</div>
@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar

You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.

After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
After the Jupyter Notebook server is launched, you can create a new notebook from
the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
your notebook before you start to try Spark from the Jupyter notebook.
Review comment (Member, Author): BTW, Jupyter with PySpark in this way seems broken in Spark due to how find-spark-home works in PySpark; see #28256.

Reply: It's been a long time since I used PySpark; I'm glad this is fixed in Spark 3.0. Thanks.
@@ -447,7 +447,7 @@ Writables are automatically converted:

<table class="table">
<tr><th>Writable Type</th><th>Python Type</th></tr>
<tr><td>Text</td><td>unicode str</td></tr>
<tr><td>Text</td><td>str</td></tr>
<tr><td>IntWritable</td><td>int</td></tr>
<tr><td>FloatWritable</td><td>float</td></tr>
<tr><td>DoubleWritable</td><td>float</td></tr>
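A hedged illustration of the table change above (the SequenceFile path is hypothetical): with Python 2 gone, Hadoop Text values deserialize to the single `str` type, which is already Unicode, hence the dropped "unicode str" wording.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("writable-types-demo").getOrCreate()
sc = spark.sparkContext

# Hypothetical path; any SequenceFile whose values are Hadoop Text behaves the same way.
pairs = sc.sequenceFile("hdfs:///tmp/example-seqfile")
key, value = pairs.first()
print(type(value))  # <class 'str'> on Python 3
```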
2 changes: 0 additions & 2 deletions examples/src/main/python/als.py
@@ -21,8 +21,6 @@

This example requires numpy (http://www.numpy.org/)
"""
from __future__ import print_function

import sys

import numpy as np
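The same two-line deletion repeats across the example scripts that follow; a one-line reminder of why it is safe (a sketch, not taken from als.py): on Python 3, `print` is a builtin function, so the `__future__` backport is a no-op.

```python
import sys

# print is already a function on Python 3; keyword arguments such as sep= and
# file= work without any __future__ import.
print("RMS error after iteration", 5, ":", 0.0516, sep=" ", file=sys.stderr)
```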
2 changes: 0 additions & 2 deletions examples/src/main/python/avro_inputformat.py
@@ -43,8 +43,6 @@
{u'favorite_color': None, u'name': u'Alyssa'}
{u'favorite_color': u'red', u'name': u'Ben'}
"""
from __future__ import print_function

import sys

from functools import reduce
2 changes: 0 additions & 2 deletions examples/src/main/python/kmeans.py
@@ -22,8 +22,6 @@

This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function

import sys

import numpy as np
2 changes: 0 additions & 2 deletions examples/src/main/python/logistic_regression.py
@@ -22,8 +22,6 @@
In practice, one may prefer to use the LogisticRegression algorithm in
ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
"""
from __future__ import print_function

import sys

import numpy as np
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/aft_survival_regression.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
8 changes: 1 addition & 7 deletions examples/src/main/python/ml/als_example.py
@@ -15,12 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

import sys
if sys.version >= '3':
long = int

from pyspark.sql import SparkSession

# $example on$
@@ -39,7 +33,7 @@
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2]), timestamp=long(p[3])))
rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

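A short sketch of why the `long = int` shim could go (the record below follows the `::`-separated format of sample_movielens_ratings.txt, but the values are illustrative): Python 3's `int` is arbitrary precision, so timestamps never overflow and need no separate long type.

```python
from pyspark.sql import Row

line = "0::2::3.0::1424380312"   # illustrative record in the :: separated format
p = line.split("::")
row = Row(userId=int(p[0]), movieId=int(p[1]),
          rating=float(p[2]), timestamp=int(p[3]))
print(row.timestamp, type(row.timestamp))  # 1424380312 <class 'int'>
```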
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/anova_selector_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ANOVASelector
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/anova_test_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/binarizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bisecting_k_means_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bucketizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/chi_square_test_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/chisq_selector_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector