Karthik changes #359

Merged
merged 9 commits on Jul 16, 2020
1 change: 1 addition & 0 deletions docs/demo/Databricks/generate-init-script.ipynb
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-0.1.0-databricks.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0-databricks/rapids-4-spark_2.12-0.1.0-databricks.jar\nsudo wget -O /databricks/jars/cudf-0.14-cuda10-1.jar https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar\"\"\", True)"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["%sh\ncd ../../dbfs/databricks/init_scripts\npwd\nls -ltr\ncat init.sh"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":3}],"metadata":{"name":"generate-init-script","notebookId":2645746662301564},"nbformat":4,"nbformat_minor":0}
1 change: 1 addition & 0 deletions docs/demo/GCP/Mortgage-ETL-CPU.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/demo/GCP/Mortgage-ETL-GPU.ipynb

Large diffs are not rendered by default.

101 changes: 101 additions & 0 deletions docs/demo/GCP/criteo-gpu.ipynb
@@ -0,0 +1,101 @@
{
"metadata": {
"name": "Criteo-XGBoost4j",
"kernelspec": {
"language": "scala",
"name": "spark2-scala"
},
"language_info": {
"codemirror_mode": "text/x-scala",
"file_extension": ".scala",
"mimetype": "text/x-scala",
"name": "scala",
"pygments_lexer": "scala"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostClassifier}\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.sql.{SparkSession, SQLContext}\nimport org.apache.spark.SparkConf\nimport org.apache.spark.sql.types.{DoubleType, FloatType, IntegerType, StructField, StructType}\nimport scala.util.Properties"
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "val trainPath \u003d \"gs://dataproc-nv-demo/criteo/train/\"\nval evalPath \u003d \"gs://dataproc-nv-demo/criteo/test/\"\nval transPath \u003d \"gs://dataproc-nv-demo/criteo/test/\"\nval modelPath \u003d \"gs://dataproc-nv-demo/criteo/model/criteo\"\n\n// val trainPath \u003d \"hdfs:///criteo/train/\"\n// val evalPath \u003d \"hdfs:///criteo/test/\"\n// val transPath \u003d \"hdfs:///criteo/test/\"\n// val modelPath \u003d \"hdfs:///criteo/model/criteo\""
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "val conf \u003d new SparkConf()\nconf.set(\"spark.executor.instances\", \"20\")\nconf.set(\"spark.executor.cores\", \"7\")\nconf.set(\"spark.task.cpus\", \"7\")\nconf.set(\"spark.executor.memory\", \"24g\")\nconf.set(\"spark.rapids.memory.pinnedPool.size\", \"2G\")\nconf.set(\"spark.executor.memoryOverhead\", \"16G\")\nconf.set(\"spark.executor.extraJavaOptions\", \"-Dai.rapids.cudf.prefer-pinned\u003dtrue\")\nconf.set(\"spark.locality.wait\", \"0s\")\nconf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\nconf.set(\"spark.executor.resource.gpu.amount\", \"1\")\nconf.set(\"spark.task.resource.gpu.amount\", \"1\")\nconf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\nconf.set(\"spark.rapids.sql.hasNans\", \"false\")\nconf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\nconf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\nconf.set(\"spark.rapids.sql.variableFloatAgg.enabled\", \"true\")\nconf.set(\"spark.rapids.memory.gpu.pooling.enabled\", \"false\")\n// conf.set(\"spark.rapids.memory.gpu.allocFraction\", \"0.1\")\nval spark \u003d SparkSession.builder.appName(\"criteo-gpu\")\n .enableHiveSupport()\n .config(conf)\n .getOrCreate\nval reader \u003d spark.read.option(\"header\", true)"
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "val trainSet \u003d reader.parquet(trainPath)\nval evalSet \u003d reader.parquet(evalPath)\nval transSet \u003d reader.parquet(transPath)"
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "def getFeatureNames(length: Int): List[String] \u003d\n 1.until(length).map(i \u003d\u003e s\"_c$i\").toList\nval labelColName \u003d \"_c0\"\nval featureNames \u003d getFeatureNames(40)"
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "val commParamMap \u003d Map(\n \"eval_metric\" -\u003e \"logloss\",\n \"eta\" -\u003e 0.1,\n \"gamma\" -\u003e 0.1,\n \"missing\" -\u003e 0.0,\n \"max_depth\" -\u003e 10,\n \"max_leaves\" -\u003e 256,\n \"objective\" -\u003e \"binary:logistic\",\n \"grow_policy\" -\u003e \"depthwise\",\n \"min_child_weight\" -\u003e 30,\n \"lambda\" -\u003e 1,\n \"scale_pos_weight\" -\u003e 2,\n \"subsample\" -\u003e 1,\n \"num_round\" -\u003e 100)\nval xgbParamFinal \u003d commParamMap ++ Map(\"tree_method\" -\u003e \"gpu_hist\", \"num_workers\" -\u003e 20, \"nthread\" -\u003e7)"
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "val xgbClassifier \u003d new XGBoostClassifier(xgbParamFinal)\n .setLabelCol(labelColName)\n // \u003d\u003d\u003d diff \u003d\u003d\u003d\n .setFeaturesCols(featureNames)\nxgbClassifier.setEvalSets(Map(\"eval\" -\u003e evalSet))\n\n\ndef benchmark[R](phase: String)(block: \u003d\u003e R): (R, Float) \u003d {\n val t0 \u003d System.currentTimeMillis\n val result \u003d block // call-by-name\n val t1 \u003d System.currentTimeMillis\n println(\"Elapsed time [\" + phase + \"]: \" + ((t1 - t0).toFloat / 1000) + \"s\")\n (result, (t1 - t0).toFloat / 1000)\n}\n\n // Start training\nprintln(\"\\n------ Training ------\")\nval (xgbClassificationModel, _) \u003d benchmark(\"train\") {\n xgbClassifier.fit(trainSet)\n}\n"
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "println(\"\\n------ Transforming ------\")\nval (results, _) \u003d benchmark(\"transform\") {\n val ret \u003d xgbClassificationModel.transform(transSet).cache()\n ret\n}\nz.show(results.select(labelColName,\"rawPrediction\",\"probability\",\"prediction\").limit(10))\n\nprintln(\"\\n------Accuracy of Evaluation------\")\nval evaluator \u003d new MulticlassClassificationEvaluator().setLabelCol(labelColName)\nval accuracy \u003d evaluator.evaluate(results)\nprintln(accuracy)"
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": "xgbClassificationModel.write.overwrite.save(modelPath)\n\nval modelFromDisk \u003d XGBoostClassificationModel.load(modelPath)\n\nval (results2, _) \u003d benchmark(\"transform2\") {\n modelFromDisk.transform(transSet)\n}\nz.show(results2.limit(2))"
}
]
}
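The GPU-specific comment in the training cell above marks the main divergence from stock XGBoost4J-Spark, which expects a single assembled vector column rather than the raw columns given to setFeaturesCols. For contrast, a hypothetical CPU-side sketch of the same training step (the names assembler, cpuParams, and cpuClassifier, and the "hist" tree method, are illustrative assumptions, not part of this PR):

// Sketch only: how this training step would look on CPU with stock
// XGBoost4J-Spark, reusing commParamMap, featureNames, labelColName,
// and trainSet from the notebook above.
import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier
import org.apache.spark.ml.feature.VectorAssembler

// Assemble the 39 feature columns into one vector column
val assembler = new VectorAssembler()
  .setInputCols(featureNames.toArray) // _c1.._c39
  .setOutputCol("features")

// Same common parameters, but the CPU histogram tree method
val cpuParams = commParamMap ++ Map("tree_method" -> "hist", "num_workers" -> 20)

val cpuClassifier = new XGBoostClassifier(cpuParams)
  .setLabelCol(labelColName)
  .setFeaturesCol("features") // single vector column instead of setFeaturesCols

val cpuModel = cpuClassifier.fit(assembler.transform(trainSet))

Taking the columns directly on GPU skips this row-wise assembly pass, which is part of what the GPU notebook is demonstrating.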