[SPARK-32138] Drop Python 2.7, 3.4 and 3.5 #28957

Closed · wants to merge 2 commits
3 changes: 2 additions & 1 deletion .github/workflows/master.yml
@@ -129,7 +129,8 @@ jobs:
architecture: x64
- name: Install Python 3.6
uses: actions/setup-python@v2
if: contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
# Yarn has a Python specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || contains(matrix.modules, 'pyspark') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.6
architecture: x64
66 changes: 0 additions & 66 deletions core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
@@ -45,79 +45,13 @@ private[spark] object SerDeUtil extends Logging {
}
}
}
// Unpickle array.array generated by Python 2.6
class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
// /* Description of types */
// static struct arraydescr descriptors[] = {
// {'c', sizeof(char), c_getitem, c_setitem},
// {'b', sizeof(char), b_getitem, b_setitem},
// {'B', sizeof(char), BB_getitem, BB_setitem},
// #ifdef Py_USING_UNICODE
// {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
// #endif
// {'h', sizeof(short), h_getitem, h_setitem},
// {'H', sizeof(short), HH_getitem, HH_setitem},
// {'i', sizeof(int), i_getitem, i_setitem},
// {'I', sizeof(int), II_getitem, II_setitem},
// {'l', sizeof(long), l_getitem, l_setitem},
// {'L', sizeof(long), LL_getitem, LL_setitem},
// {'f', sizeof(float), f_getitem, f_setitem},
// {'d', sizeof(double), d_getitem, d_setitem},
// {'\0', 0, 0, 0} /* Sentinel */
// };
val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
Map('B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
'L' -> 11, 'l' -> 13, 'f' -> 15, 'd' -> 17, 'u' -> 21
)
} else {
Map('B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
'L' -> 10, 'l' -> 12, 'f' -> 14, 'd' -> 16, 'u' -> 20
)
}
override def construct(args: Array[Object]): Object = {
if (args.length == 1) {
construct(args ++ Array(""))
} else if (args.length == 2 && args(1).isInstanceOf[String]) {
val typecode = args(0).asInstanceOf[String].charAt(0)
// This must be ISO 8859-1 / Latin 1, not UTF-8, to interoperate correctly
val data = args(1).asInstanceOf[String].getBytes(StandardCharsets.ISO_8859_1)
if (typecode == 'c') {
// It seems like the pickle of pypy uses the similar protocol to Python 2.6, which uses
// a string for array data instead of list as Python 2.7, and handles an array of
// typecode 'c' as 1-byte character.
val result = new Array[Char](data.length)
var i = 0
while (i < data.length) {
result(i) = data(i).toChar
i += 1
}
result
} else {
construct(typecode, machineCodes(typecode), data)
}
} else if (args.length == 2 && args(0) == "l") {
// On Python 2, an array of typecode 'l' should be handled as long rather than int.
val values = args(1).asInstanceOf[JArrayList[_]]
val result = new Array[Long](values.size)
var i = 0
while (i < values.size) {
result(i) = values.get(i).asInstanceOf[Number].longValue()
i += 1
}
result
} else {
super.construct(args)
}
}
}

private var initialized = false
// This should be called before trying to unpickle array.array from Python
// In cluster mode, this should be put in closure
def initialize(): Unit = {
synchronized{
if (!initialized) {
Unpickler.registerConstructor("array", "array", new ArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("builtins", "bytearray", new ByteArrayConstructor())
Unpickler.registerConstructor("__builtin__", "bytes", new ByteArrayConstructor())
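For context on the deletion above: under Python 3, `array.array` objects pickle through the ordinary constructor path (at protocol 2 they reduce to a typecode plus a list of values), so there is no Python 2.6-style byte-string payload, no `'c'` typecode, and no long-vs-int ambiguity left for the JVM side to special-case. A minimal round-trip sketch, not part of this PR and using only the standard library:

```python
import array
import pickle

# Pickling at protocol 2: the payload reduces to a plain constructor call
# with a typecode and a list of values, which the stock Pyrolite
# ArrayConstructor should be able to rebuild without the removed overrides.
arr = array.array("i", [1, 2, 3])
payload = pickle.dumps(arr, protocol=2)

restored = pickle.loads(payload)
assert restored == arr
print(restored)  # array('i', [1, 2, 3])
```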
7 changes: 1 addition & 6 deletions dev/create-release/releaseutils.py
@@ -49,8 +49,6 @@
print("Install using 'sudo pip install unidecode'")
sys.exit(-1)

if sys.version < '3':
input = raw_input # noqa

# Contributors list file name
contributors_file_name = "contributors.txt"
@@ -152,10 +150,7 @@ def get_commits(tag):
if not is_valid_author(author):
author = github_username
# Guard against special characters
try: # Python 2
author = unicode(author, "UTF-8")
except NameError: # Python 3
author = str(author)
author = str(author)
author = unidecode.unidecode(author).strip()
commit = Commit(_hash, author, title, pr_number)
commits.append(commit)
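A small illustration of why the try/except shim above collapses to a single `str()` call (assuming `unidecode` is installed, which releaseutils.py already requires): every Python 3 `str` is Unicode, so no explicit decode step is needed before transliteration.

```python
from unidecode import unidecode

author = "José Ångström"          # hypothetical author name, not from the contributors file
author = str(author)              # already Unicode on Python 3; no unicode(author, "UTF-8") needed
print(unidecode(author).strip())  # Jose Angstrom
```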
11 changes: 3 additions & 8 deletions dev/github_jira_sync.py
@@ -22,14 +22,9 @@
import os
import re
import sys
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError

try:
import jira.client
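A hedged sketch of how the surviving Python 3 imports replace the removed `urllib2` ones; the URL and header below are illustrative, not copied from the script:

```python
from urllib.request import urlopen, Request
from urllib.error import HTTPError

url = "https://api.github.com/repos/apache/spark/issues/28957"  # illustrative URL
try:
    request = Request(url, headers={"Accept": "application/vnd.github.v3+json"})
    with urlopen(request) as response:          # urllib.request.urlopen replaces urllib2.urlopen
        body = response.read().decode("utf-8")
        print(len(body), "bytes fetched")
except HTTPError as error:                      # urllib.error.HTTPError replaces urllib2.HTTPError
    print("Request failed with HTTP status", error.code)
```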
10 changes: 9 additions & 1 deletion dev/lint-python
@@ -168,7 +168,15 @@ function sphinx_test {

# Check that the documentation builds acceptably, skip check if sphinx is not installed.
if ! hash "$SPHINX_BUILD" 2> /dev/null; then
echo "The $SPHINX_BUILD command was not found. Skipping pydoc checks for now."
echo "The $SPHINX_BUILD command was not found. Skipping Sphinx build for now."
echo
return
fi

# TODO(SPARK-32279): Install Sphinx in Python 3 of Jenkins machines
PYTHON_HAS_SPHINX=$("$PYTHON_EXECUTABLE" -c 'import importlib.util; print(importlib.util.find_spec("sphinx") is not None)')
if [[ "$PYTHON_HAS_SPHINX" == "False" ]]; then
echo "$PYTHON_EXECUTABLE does not have Sphinx installed. Skipping Sphinx build for now."
echo
return
fi
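The new shell check shells out to Python for the module lookup; a minimal sketch of what that one-liner evaluates (a sketch of the idea, not the exact lint-python code path):

```python
import importlib.util

# find_spec returns None when the module cannot be located, so this prints the
# literal string "True" or "False" that the surrounding bash test compares against.
print(importlib.util.find_spec("sphinx") is not None)
```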
12 changes: 3 additions & 9 deletions dev/merge_spark_pr.py
@@ -31,15 +31,9 @@
import subprocess
import sys
import traceback
if sys.version < '3':
input = raw_input # noqa
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError

try:
import jira.client
12 changes: 3 additions & 9 deletions dev/run-tests-jenkins.py
@@ -22,15 +22,9 @@
import json
import functools
import subprocess
if sys.version < '3':
from urllib2 import urlopen
from urllib2 import Request
from urllib2 import HTTPError, URLError
else:
from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError

from urllib.request import urlopen
from urllib.request import Request
from urllib.error import HTTPError, URLError

from sparktestsupport import SPARK_HOME, ERROR_CODES
from sparktestsupport.shellutils import run_cmd
3 changes: 1 addition & 2 deletions dev/sparktestsupport/toposort.py
@@ -24,8 +24,7 @@
# Moved functools import to the top of the file.
# Changed assert to a ValueError.
# Changed iter[items|keys] to [items|keys], for python 3
# compatibility. I don't think it matters for python 2 these are
# now lists instead of iterables.
# compatibility.
# Copy the input so as to leave it unmodified.
# Renamed function from toposort2 to toposort.
# Handle empty input.
2 changes: 1 addition & 1 deletion docs/configuration.md
@@ -2917,7 +2917,7 @@ The following variables can be set in `spark-env.sh`:
</tr>
<tr>
<td><code>PYSPARK_PYTHON</code></td>
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python2.7</code> if available, otherwise <code>python</code>).
<td>Python binary executable to use for PySpark in both driver and workers (default is <code>python3</code> if available, otherwise <code>python</code>).
Property <code>spark.pyspark.python</code> takes precedence if it is set</td>
</tr>
<tr>
3 changes: 1 addition & 2 deletions docs/index.md
@@ -44,9 +44,8 @@ source, visit [Building Spark](building-spark.html).

Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS), and it should run on any platform that runs a supported version of Java. This should include JVMs on x86_64 and ARM64. It's easy to run locally on one machine --- all you need is to have `java` installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation.

Spark runs on Java 8/11, Scala 2.12, Python 2.7+/3.4+ and R 3.5+.
Spark runs on Java 8/11, Scala 2.12, Python 3.6+ and R 3.5+.
Java 8 prior to version 8u92 support is deprecated as of Spark 3.0.0.
Python 2 and Python 3 prior to version 3.6 support is deprecated as of Spark 3.0.0.
For the Scala API, Spark {{site.SPARK_VERSION}}
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
({{site.SCALA_BINARY_VERSION}}.x).
12 changes: 6 additions & 6 deletions docs/rdd-programming-guide.md
@@ -101,10 +101,10 @@ import org.apache.spark.SparkConf;

<div data-lang="python" markdown="1">

Spark {{site.SPARK_VERSION}} works with Python 2.7+ or Python 3.4+. It can use the standard CPython interpreter,
Spark {{site.SPARK_VERSION}} works with Python 3.6+. It can use the standard CPython interpreter,
so C libraries like NumPy can be used. It also works with PyPy 2.3+.

Note that Python 2 support is deprecated as of Spark 3.0.0.
Support for Python 2, 3.4 and 3.5 was removed in Spark 3.1.0.

Spark applications in Python can either be run with the `bin/spark-submit` script which includes Spark at runtime, or by including it in your setup.py as:

@@ -134,8 +134,8 @@ PySpark requires the same minor version of Python in both driver and workers. It
you can specify which version of Python you want to use by `PYSPARK_PYTHON`, for example:

{% highlight bash %}
$ PYSPARK_PYTHON=python3.4 bin/pyspark
$ PYSPARK_PYTHON=/opt/pypy-2.5/bin/pypy bin/spark-submit examples/src/main/python/pi.py
$ PYSPARK_PYTHON=python3.8 bin/pyspark
$ PYSPARK_PYTHON=/path-to-your-pypy/pypy bin/spark-submit examples/src/main/python/pi.py
{% endhighlight %}

</div>
@@ -276,7 +276,7 @@ $ PYSPARK_DRIVER_PYTHON=jupyter PYSPARK_DRIVER_PYTHON_OPTS=notebook ./bin/pyspar

You can customize the `ipython` or `jupyter` commands by setting `PYSPARK_DRIVER_PYTHON_OPTS`.

After the Jupyter Notebook server is launched, you can create a new "Python 2" notebook from
After the Jupyter Notebook server is launched, you can create a new notebook from
the "Files" tab. Inside the notebook, you can input the command `%pylab inline` as part of
your notebook before you start to try Spark from the Jupyter notebook.
Review comment (Member, Author): BTW, Jupyter with PySpark in this way seems broken in Spark due to how find-spark-home works in PySpark; see #28256.

Reply: It's been a long time since I used PySpark; I'm glad this is fixed in Spark 3.0. Thanks.
@@ -447,7 +447,7 @@ Writables are automatically converted:

<table class="table">
<tr><th>Writable Type</th><th>Python Type</th></tr>
<tr><td>Text</td><td>unicode str</td></tr>
<tr><td>Text</td><td>str</td></tr>
<tr><td>IntWritable</td><td>int</td></tr>
<tr><td>FloatWritable</td><td>float</td></tr>
<tr><td>DoubleWritable</td><td>float</td></tr>
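A hedged illustration of the table change above (the SequenceFile path is hypothetical): with Python 2 gone, Hadoop Text values deserialize to the single `str` type, which is already Unicode, hence the dropped "unicode str" wording.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("writable-types-demo").getOrCreate()
sc = spark.sparkContext

# Hypothetical path; any SequenceFile whose values are Hadoop Text behaves the same way.
pairs = sc.sequenceFile("hdfs:///tmp/example-seqfile")
key, value = pairs.first()
print(type(value))  # <class 'str'> on Python 3
```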
2 changes: 0 additions & 2 deletions examples/src/main/python/als.py
@@ -21,8 +21,6 @@

This example requires numpy (http://www.numpy.org/)
"""
from __future__ import print_function

import sys

import numpy as np
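The same two-line deletion repeats across the example scripts that follow; a one-line reminder of why it is safe (a sketch, not taken from als.py): on Python 3, `print` is a builtin function, so the `__future__` backport is a no-op.

```python
import sys

# print is already a function on Python 3; keyword arguments such as sep= and
# file= work without any __future__ import.
print("RMS error after iteration", 5, ":", 0.0516, sep=" ", file=sys.stderr)
```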
2 changes: 0 additions & 2 deletions examples/src/main/python/avro_inputformat.py
@@ -43,8 +43,6 @@
{u'favorite_color': None, u'name': u'Alyssa'}
{u'favorite_color': u'red', u'name': u'Ben'}
"""
from __future__ import print_function

import sys

from functools import reduce
2 changes: 0 additions & 2 deletions examples/src/main/python/kmeans.py
@@ -22,8 +22,6 @@

This example requires NumPy (http://www.numpy.org/).
"""
from __future__ import print_function

import sys

import numpy as np
2 changes: 0 additions & 2 deletions examples/src/main/python/logistic_regression.py
@@ -22,8 +22,6 @@
In practice, one may prefer to use the LogisticRegression algorithm in
ML, as shown in examples/src/main/python/ml/logistic_regression_with_elastic_net.py.
"""
from __future__ import print_function

import sys

import numpy as np
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/aft_survival_regression.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/aft_survival_regression.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
8 changes: 1 addition & 7 deletions examples/src/main/python/ml/als_example.py
@@ -15,12 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

import sys
if sys.version >= '3':
long = int

from pyspark.sql import SparkSession

# $example on$
@@ -39,7 +33,7 @@
lines = spark.read.text("data/mllib/als/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
rating=float(p[2]), timestamp=long(p[3])))
rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2])

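A short sketch of why the `long = int` shim could go (the record below follows the `::`-separated format of sample_movielens_ratings.txt, but the values are illustrative): Python 3's `int` is arbitrary precision, so timestamps never overflow and need no separate long type.

```python
from pyspark.sql import Row

line = "0::2::3.0::1424380312"   # illustrative record in the :: separated format
p = line.split("::")
row = Row(userId=int(p[0]), movieId=int(p[1]),
          rating=float(p[2]), timestamp=int(p[3]))
print(row.timestamp, type(row.timestamp))  # 1424380312 <class 'int'>
```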
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/anova_selector_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/anova_selector_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ANOVASelector
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/anova_test_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/anova_test_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/binarizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bisecting_k_means_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/bisecting_k_means_example.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/bucketed_random_projection_lsh_example.py
"""
from __future__ import print_function

# $example on$
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/bucketizer_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Bucketizer
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/chi_square_test_example.py
@@ -20,8 +20,6 @@
Run with:
bin/spark-submit examples/src/main/python/ml/chi_square_test_example.py
"""
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
2 changes: 0 additions & 2 deletions examples/src/main/python/ml/chisq_selector_example.py
@@ -15,8 +15,6 @@
# limitations under the License.
#

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import ChiSqSelector