-
Notifications
You must be signed in to change notification settings - Fork 1
/
Dockerfile
83 lines (63 loc) · 2.33 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# Short Description: Apache PySpark
# Full Description: The ubuntu:xenial Docker image with Python 3 and Apache PySpark
FROM ubuntu:xenial
USER root
# 1. APT
# Base tooling: g++/make for building native extensions, wget for downloads,
# pigz for parallel gzip, nano for in-container editing. ca-certificates is
# needed so wget can verify the HTTPS mirrors used later in the build.
# update+install share one layer; the apt lists are removed in the same layer
# so they never persist in the image.
RUN apt-get -qq update \
 && apt-get install -y --no-install-recommends \
      ca-certificates \
      g++ \
      make \
      nano \
      pigz \
      wget \
 && rm -rf /var/lib/apt/lists/*
# 2. JAVA
# Spark needs a Java 8 runtime. The original wildcard `openjdk-8*` pulled in
# every matching package (docs, demos, sources, duplicate JREs); the headless
# JDK is sufficient for Spark and considerably smaller.
RUN apt-get -qq update \
 && apt-get install -y --no-install-recommends openjdk-8-jdk-headless \
 && rm -rf /var/lib/apt/lists/*
# 3. PYTHON+PIP
# Python 3.6 is not in xenial's default repos, so pull it from the deadsnakes
# PPA. `-y` keeps add-apt-repository from prompting and stalling the
# non-interactive build. The PPA add and the install share one layer so the
# second `apt-get update` always sees the fresh source list.
RUN apt-get -qq update \
 && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository -y ppa:deadsnakes/ppa \
 && apt-get -qq update \
 && apt-get install -y --no-install-recommends \
      python3-pip \
      python3.6 \
      python3.6-dev \
 && rm -rf /var/lib/apt/lists/*
# Make python/python3 resolve to 3.6 and pip to pip3.
# NOTE(review): repointing /usr/bin/python3 away from xenial's system 3.5 can
# confuse distro tools that expect it — verify nothing in the image relies on it.
RUN ln -sfn /usr/bin/python3.6 /usr/bin/python3 \
 && ln -sfn /usr/bin/python3 /usr/bin/python \
 && ln -sfn /usr/bin/pip3 /usr/bin/pip
# 4. SPARK
# Build-time parameters, supplied via --build-arg. Declared exactly once
# (the original file declared the same three ARGs twice).
ARG APACHE_SPARK_VERSION
ARG HADOOP_VERSION
ARG GRAPHFRAME_VERSION
# Download, extract and install Spark from the Apache archive. The tarball is
# removed in the same layer that fetched it, so it never bloats the image.
WORKDIR /tmp
RUN wget -q "https://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
    tar xzf "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" -C /usr/local && \
    rm "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz"
# Version-independent path so downstream config never hardcodes the version.
WORKDIR /usr/local
RUN ln -s "spark-${APACHE_SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" spark
# Configure Spark
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info"
ENV PATH=$PATH:$SPARK_HOME/bin
ENV SPARK_CONF_DIR="${SPARK_HOME}/conf"
# Expose Spark's Python bindings to the interpreter.
# NOTE(review): the py4j version (0.10.7) is hardcoded and must match the
# one bundled with the requested Spark release — verify when bumping
# APACHE_SPARK_VERSION.
ENV PYTHONPATH="${SPARK_HOME}/python:${PYTHONPATH}"
ENV PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.7-src.zip:${PYTHONPATH}"
ENV PYSPARK_PYTHONPATH_SET=1
# Download the graphframes jar into Spark's jar directory.
# dl.bintray.com was shut down in 2021, so the original plain-http Bintray URL
# no longer resolves; the spark-packages repository now lives at
# repos.spark-packages.org and is served over HTTPS.
RUN wget -q --directory-prefix /usr/local/spark/jars \
    "https://repos.spark-packages.org/graphframes/graphframes/${GRAPHFRAME_VERSION}/graphframes-${GRAPHFRAME_VERSION}.jar"
# 5. PySPARQL
# Install the PySPARQL package from a staging directory, then default to
# running the test suite.
WORKDIR /code/package
COPY PySPARQL PySPARQL
COPY requirements.txt requirements.txt
COPY setup.py setup.py
COPY README.rst README.rst
# Single layer, no pip cache, so downloaded wheels do not persist in the image.
RUN pip install --no-cache-dir -r requirements.txt \
 && pip install --no-cache-dir .
WORKDIR /code
# NOTE(review): removing `package` in a later layer does NOT shrink the image —
# the COPY layers above still persist. Kept only for a tidy filesystem.
RUN rm -rf package
COPY tests tests
# NOTE(review): the container runs as root; consider adding a non-root USER
# before CMD if the test suite does not require root.
CMD ["pytest"]