update package version tags to 24.10, switch to nightly 24.10 rapids #746

Status: Draft. Wants to merge 1 commit into base: branch-24.10.
4 changes: 2 additions & 2 deletions ci/Dockerfile
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
     && conda config --set solver libmamba
 
 # install cuML
-ARG CUML_VER=24.08
-RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.9 cuda-version=11.8 \
+ARG CUML_VER=24.10
+RUN conda install -y -c rapidsai-nightly -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \
     && conda clean --all -f -y
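
Since this hunk switches the CI image from the release channel to rapidsai-nightly, a quick post-build sanity check can confirm which cuML actually resolved. A minimal sketch in Python; the image tag spark-rapids-ml-ci and the assumption that python on the image PATH is the conda-installed one are mine, not part of this PR:

# hypothetical check, not part of this PR: verify the CI image picked up
# a 24.10 nightly cuML from the rapidsai-nightly channel
import subprocess

result = subprocess.run(
    ["docker", "run", "--rm", "spark-rapids-ml-ci",
     "python", "-c", "import cuml; print(cuml.__version__)"],
    capture_output=True, text=True, check=True,
)
assert result.stdout.strip().startswith("24.10"), result.stdout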
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -9,7 +9,7 @@
 project = 'spark-rapids-ml'
 copyright = '2024, NVIDIA'
 author = 'NVIDIA'
-release = '24.08.0'
+release = '24.10.0'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
2 changes: 1 addition & 1 deletion notebooks/databricks/README.md
@@ -51,7 +51,7 @@ If you already have a Databricks account, you can run the example notebooks on a
 spark.task.resource.gpu.amount 1
 spark.databricks.delta.preview.enabled true
 spark.python.worker.reuse true
-spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.06.1.jar:/databricks/spark/python
+spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.08.1.jar:/databricks/spark/python
 spark.sql.execution.arrow.maxRecordsPerBatch 100000
 spark.rapids.memory.gpu.minAllocFraction 0.0001
 spark.plugins com.nvidia.spark.SQLPlugin
2 changes: 1 addition & 1 deletion notebooks/databricks/init-pip-cuda-11.8.sh
@@ -5,7 +5,7 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
 # also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
 # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
 RAPIDS_VERSION=24.8.0
-SPARK_RAPIDS_VERSION=24.06.1
+SPARK_RAPIDS_VERSION=24.08.1
 
 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
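
The comment in this hunk describes the leading-zero convention for the two version fields; a small illustrative Python helper (to_python_ver is hypothetical, not part of this repo) makes the conversion concrete:

# jar versions keep the leading zero in the month field (24.08.1), while
# Python wheel versions drop it (24.8.1); to_python_ver is illustrative only
import re

def to_python_ver(jar_ver: str) -> str:
    return re.sub(r"\.0(\d)", r".\1", jar_ver, count=1)

assert to_python_ver("24.08.1") == "24.8.1"
assert to_python_ver("24.10.0") == "24.10.0"  # months without a leading zero pass through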
2 changes: 1 addition & 1 deletion python/benchmark/databricks/gpu_etl_cluster_spec.sh
@@ -9,7 +9,7 @@ cat <<EOF
 "spark.task.cpus": "1",
 "spark.databricks.delta.preview.enabled": "true",
 "spark.python.worker.reuse": "true",
-"spark.executorEnv.PYTHONPATH": "/databricks/jars/rapids-4-spark_2.12-24.06.1.jar:/databricks/spark/python",
+"spark.executorEnv.PYTHONPATH": "/databricks/jars/rapids-4-spark_2.12-24.08.1.jar:/databricks/spark/python",
 "spark.sql.files.minPartitionNum": "2",
 "spark.sql.execution.arrow.maxRecordsPerBatch": "10000",
 "spark.executor.cores": "8",
19 changes: 12 additions & 7 deletions python/benchmark/databricks/init-pip-cuda-11.8.sh
@@ -6,7 +6,7 @@ BENCHMARK_ZIP=/dbfs/path/to/benchmark.zip
 # also, in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
 # while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
 RAPIDS_VERSION=24.8.0
-SPARK_RAPIDS_VERSION=24.06.1
+SPARK_RAPIDS_VERSION=24.08.1
 
 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
 
@@ -24,12 +24,17 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda
 
 # install cudf and cuml
 # using ~= pulls in micro version patches
-/databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
-    cuml-cu11~=${RAPIDS_VERSION} \
-    cuvs-cu11~=${RAPIDS_VERSION} \
-    pylibraft-cu11~=${RAPIDS_VERSION} \
-    rmm-cu11~=${RAPIDS_VERSION} \
-    --extra-index-url=https://pypi.nvidia.com
+# /databricks/python/bin/pip install cudf-cu11~=${RAPIDS_VERSION} \
+#     cuml-cu11~=${RAPIDS_VERSION} \
+#     cuvs-cu11~=${RAPIDS_VERSION} \
+#     pylibraft-cu11~=${RAPIDS_VERSION} \
+#     rmm-cu11~=${RAPIDS_VERSION} \
+#     --extra-index-url=https://pypi.nvidia.com
+
+/databricks/python/bin/pip install \
+    --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
+    "cudf-cu11>=24.10.0a0,<=24.10" "dask-cudf-cu11>=24.10.0a0,<=24.10" \
+    "cuml-cu11>=24.10.0a0,<=24.10" "dask-cuda>=24.10.0a0,<=24.10"

Collaborator review comment on the lines above:

Would cuvs be automatically installed even if it is not specified in the pip install command?
It seems we introduce cudf, dask-cudf, and dask-cuda to the pip install command. Any reason for that?
 
 # install spark-rapids-ml
 python_ver=`python --version | grep -oP '3\.[0-9]+'`
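
For context on the pins in this hunk: the old ~= constraint is a PEP 440 compatible release (cuml-cu11~=24.8.0 means >=24.8.0, ==24.8.*), which is why it pulls in micro patches, while the new nightly constraints must also admit pre-release versions. A minimal sketch with the packaging library (illustrative only, not used by this script) of what the new specifiers accept:

# illustrative only: how pip interprets the nightly pins above under PEP 440
from packaging.specifiers import SpecifierSet
from packaging.version import Version

nightly = SpecifierSet(">=24.10.0a0,<=24.10")
print(Version("24.10.0a240915") in nightly)  # True: pre-release nightlies match
print(Version("24.10.0") in nightly)         # True: the 24.10 final also matches
print(Version("24.10.1") in nightly)         # False: patch releases above 24.10 are capped out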
2 changes: 1 addition & 1 deletion python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "spark-rapids-ml"
-version = "24.8.0"
+version = "24.10.0"
 authors = [
     { name="Jinfeng Li", email="jinfeng@nvidia.com" },
     { name="Bobby Wang", email="bobwang@nvidia.com" },
2 changes: 1 addition & 1 deletion python/run_benchmark.sh
@@ -107,7 +107,7 @@ EOF
 
 if [[ $cluster_type == "gpu_etl" ]]
 then
-    SPARK_RAPIDS_VERSION=24.06.1
+    SPARK_RAPIDS_VERSION=24.08.1
     rapids_jar=${rapids_jar:-rapids-4-spark_2.12-$SPARK_RAPIDS_VERSION.jar}
     if [ ! -f $rapids_jar ]; then
         echo "downloading spark rapids jar"
2 changes: 1 addition & 1 deletion python/src/spark_rapids_ml/__init__.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-__version__ = "24.08.0"
+__version__ = "24.10.0"
 
 import pandas as pd
 import pyspark
2 changes: 1 addition & 1 deletion python/src/spark_rapids_ml/clustering.py
@@ -483,7 +483,7 @@ def _construct_kmeans() -> CumlT:
         kmeans = CumlKMeansMG(output_type="cudf", **cuml_alg_params)
         from spark_rapids_ml.utils import cudf_to_cuml_array
 
-        kmeans.n_cols = n_cols
+        kmeans.n_features_in_ = n_cols
         kmeans.dtype = np.dtype(dtype)
         kmeans.cluster_centers_ = cudf_to_cuml_array(
             np.array(cluster_centers_).astype(dtype), order=array_order
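
This hunk tracks cuML's rename of the fitted-feature-count attribute to the scikit-learn-style n_features_in_. If a single code path ever has to work against both cuML generations, one option is to populate both spellings; a hedged sketch (the helper name is hypothetical, and that older releases only read n_cols is inferred from this diff, not confirmed):

# illustrative shim, not part of this PR: set both attribute spellings so the
# reconstructed estimator works against old (n_cols) and new (n_features_in_) cuML
def set_feature_count(kmeans, n_cols: int) -> None:
    kmeans.n_features_in_ = n_cols  # cuML >= 24.10, per this diff
    kmeans.n_cols = n_cols          # earlier cuML releases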