diff --git a/docs/demo/Databricks/generate-init-script.ipynb b/docs/demo/Databricks/generate-init-script.ipynb new file mode 100644 index 00000000000..bad856791ce --- /dev/null +++ b/docs/demo/Databricks/generate-init-script.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","source":["dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-0.1.0-databricks.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0-databricks/rapids-4-spark_2.12-0.1.0-databricks.jar\nsudo wget -O /databricks/jars/cudf-0.14-cuda10-1.jar https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar\"\"\", True)"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["%sh\ncd ../../dbfs/databricks/init_scripts\npwd\nls -ltr\ncat init.sh"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":3}],"metadata":{"name":"generate-init-script","notebookId":2645746662301564},"nbformat":4,"nbformat_minor":0} diff --git a/docs/demo/GCP/Mortgage-ETL-CPU.ipynb b/docs/demo/GCP/Mortgage-ETL-CPU.ipynb new file mode 100644 index 00000000000..8618ed1f072 --- /dev/null +++ b/docs/demo/GCP/Mortgage-ETL-CPU.ipynb @@ -0,0 +1,1174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Source\n", + "\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. For the full raw dataset visit [Fannie Mae]() to register for an account and to download\n", + "\n", + "Instruction is available at NVIDIA [RAPIDS demo site](https://rapidsai.github.io/demos/datasets/mortgage-data).\n", + "\n", + "### Prerequisite\n", + "\n", + "This notebook runs in a Dataproc cluster with GPU nodes, with [Spark RAPIDS](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids) set up.\n", + "\n", + "### Define ETL Process\n", + "\n", + "Define data schema and steps to do the ETL process:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from pyspark import broadcast\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import *\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.window import Window\n", + "\n", + "def _get_quarter_from_csv_file_name():\n", + " return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)\n", + "\n", + "_csv_perf_schema = StructType([\n", + " StructField('loan_id', LongType()),\n", + " StructField('monthly_reporting_period', StringType()),\n", + " StructField('servicer', StringType()),\n", + " StructField('interest_rate', DoubleType()),\n", + " StructField('current_actual_upb', DoubleType()),\n", + " StructField('loan_age', DoubleType()),\n", + " StructField('remaining_months_to_legal_maturity', DoubleType()),\n", + " StructField('adj_remaining_months_to_maturity', DoubleType()),\n", + " StructField('maturity_date', StringType()),\n", + " StructField('msa', DoubleType()),\n", + " StructField('current_loan_delinquency_status', IntegerType()),\n", + " StructField('mod_flag', StringType()),\n", + " StructField('zero_balance_code', StringType()),\n", + " 
StructField('zero_balance_effective_date', StringType()),\n", + " StructField('last_paid_installment_date', StringType()),\n", + " StructField('foreclosed_after', StringType()),\n", + " StructField('disposition_date', StringType()),\n", + " StructField('foreclosure_costs', DoubleType()),\n", + " StructField('prop_preservation_and_repair_costs', DoubleType()),\n", + " StructField('asset_recovery_costs', DoubleType()),\n", + " StructField('misc_holding_expenses', DoubleType()),\n", + " StructField('holding_taxes', DoubleType()),\n", + " StructField('net_sale_proceeds', DoubleType()),\n", + " StructField('credit_enhancement_proceeds', DoubleType()),\n", + " StructField('repurchase_make_whole_proceeds', StringType()),\n", + " StructField('other_foreclosure_proceeds', DoubleType()),\n", + " StructField('non_interest_bearing_upb', DoubleType()),\n", + " StructField('principal_forgiveness_upb', StringType()),\n", + " StructField('repurchase_make_whole_proceeds_flag', StringType()),\n", + " StructField('foreclosure_principal_write_off_amount', StringType()),\n", + " StructField('servicing_activity_indicator', StringType())])\n", + "_csv_acq_schema = StructType([\n", + " StructField('loan_id', LongType()),\n", + " StructField('orig_channel', StringType()),\n", + " StructField('seller_name', StringType()),\n", + " StructField('orig_interest_rate', DoubleType()),\n", + " StructField('orig_upb', IntegerType()),\n", + " StructField('orig_loan_term', IntegerType()),\n", + " StructField('orig_date', StringType()),\n", + " StructField('first_pay_date', StringType()),\n", + " StructField('orig_ltv', DoubleType()),\n", + " StructField('orig_cltv', DoubleType()),\n", + " StructField('num_borrowers', DoubleType()),\n", + " StructField('dti', DoubleType()),\n", + " StructField('borrower_credit_score', DoubleType()),\n", + " StructField('first_home_buyer', StringType()),\n", + " StructField('loan_purpose', StringType()),\n", + " StructField('property_type', StringType()),\n", + " StructField('num_units', IntegerType()),\n", + " StructField('occupancy_status', StringType()),\n", + " StructField('property_state', StringType()),\n", + " StructField('zip', IntegerType()),\n", + " StructField('mortgage_insurance_percent', DoubleType()),\n", + " StructField('product_type', StringType()),\n", + " StructField('coborrow_credit_score', DoubleType()),\n", + " StructField('mortgage_insurance_type', DoubleType()),\n", + " StructField('relocation_mortgage_indicator', StringType())])\n", + "_name_mapping = [\n", + " (\"WITMER FUNDING, LLC\", \"Witmer\"),\n", + " (\"WELLS FARGO CREDIT RISK TRANSFER SECURITIES TRUST 2015\", \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, N.A.\" , \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n", + " (\"USAA FEDERAL SAVINGS BANK\" , \"USAA\"),\n", + " (\"UNITED SHORE FINANCIAL SERVICES, LLC D\\\\/B\\\\/A UNITED WHOLESALE MORTGAGE\" , \"United Seq(e\"),\n", + " (\"U.S. 
BANK N.A.\" , \"US Bank\"),\n", + " (\"SUNTRUST MORTGAGE INC.\" , \"Suntrust\"),\n", + " (\"STONEGATE MORTGAGE CORPORATION\" , \"Stonegate Mortgage\"),\n", + " (\"STEARNS LENDING, LLC\" , \"Stearns Lending\"),\n", + " (\"STEARNS LENDING, INC.\" , \"Stearns Lending\"),\n", + " (\"SIERRA PACIFIC MORTGAGE COMPANY, INC.\" , \"Sierra Pacific Mortgage\"),\n", + " (\"REGIONS BANK\" , \"Regions\"),\n", + " (\"RBC MORTGAGE COMPANY\" , \"RBC\"),\n", + " (\"QUICKEN LOANS INC.\" , \"Quicken Loans\"),\n", + " (\"PULTE MORTGAGE, L.L.C.\" , \"Pulte Mortgage\"),\n", + " (\"PROVIDENT FUNDING ASSOCIATES, L.P.\" , \"Provident Funding\"),\n", + " (\"PROSPECT MORTGAGE, LLC\" , \"Prospect Mortgage\"),\n", + " (\"PRINCIPAL RESIDENTIAL MORTGAGE CAPITAL RESOURCES, LLC\" , \"Principal Residential\"),\n", + " (\"PNC BANK, N.A.\" , \"PNC\"),\n", + " (\"PMT CREDIT RISK TRANSFER TRUST 2015-2\" , \"PennyMac\"),\n", + " (\"PHH MORTGAGE CORPORATION\" , \"PHH Mortgage\"),\n", + " (\"PENNYMAC CORP.\" , \"PennyMac\"),\n", + " (\"PACIFIC UNION FINANCIAL, LLC\" , \"Other\"),\n", + " (\"OTHER\" , \"Other\"),\n", + " (\"NYCB MORTGAGE COMPANY, LLC\" , \"NYCB\"),\n", + " (\"NEW YORK COMMUNITY BANK\" , \"NYCB\"),\n", + " (\"NETBANK FUNDING SERVICES\" , \"Netbank\"),\n", + " (\"NATIONSTAR MORTGAGE, LLC\" , \"Nationstar Mortgage\"),\n", + " (\"METLIFE BANK, NA\" , \"Metlife\"),\n", + " (\"LOANDEPOT.COM, LLC\" , \"LoanDepot.com\"),\n", + " (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2015-1\" , \"JP Morgan Chase\"),\n", + " (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2014-1\" , \"JP Morgan Chase\"),\n", + " (\"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION\" , \"JP Morgan Chase\"),\n", + " (\"JPMORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n", + " (\"JP MORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n", + " (\"IRWIN MORTGAGE, CORPORATION\" , \"Irwin Mortgage\"),\n", + " (\"IMPAC MORTGAGE CORP.\" , \"Impac Mortgage\"),\n", + " (\"HSBC BANK USA, NATIONAL ASSOCIATION\" , \"HSBC\"),\n", + " (\"HOMEWARD RESIDENTIAL, INC.\" , \"Homeward Mortgage\"),\n", + " (\"HOMESTREET BANK\" , \"Other\"),\n", + " (\"HOMEBRIDGE FINANCIAL SERVICES, INC.\" , \"HomeBridge\"),\n", + " (\"HARWOOD STREET FUNDING I, LLC\" , \"Harwood Mortgage\"),\n", + " (\"GUILD MORTGAGE COMPANY\" , \"Guild Mortgage\"),\n", + " (\"GMAC MORTGAGE, LLC (USAA FEDERAL SAVINGS BANK)\" , \"GMAC\"),\n", + " (\"GMAC MORTGAGE, LLC\" , \"GMAC\"),\n", + " (\"GMAC (USAA)\" , \"GMAC\"),\n", + " (\"FREMONT BANK\" , \"Fremont Bank\"),\n", + " (\"FREEDOM MORTGAGE CORP.\" , \"Freedom Mortgage\"),\n", + " (\"FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"Franklin America\"),\n", + " (\"FLEET NATIONAL BANK\" , \"Fleet National\"),\n", + " (\"FLAGSTAR CAPITAL MARKETS CORPORATION\" , \"Flagstar Bank\"),\n", + " (\"FLAGSTAR BANK, FSB\" , \"Flagstar Bank\"),\n", + " (\"FIRST TENNESSEE BANK NATIONAL ASSOCIATION\" , \"Other\"),\n", + " (\"FIFTH THIRD BANK\" , \"Fifth Third Bank\"),\n", + " (\"FEDERAL HOME LOAN BANK OF CHICAGO\" , \"Fedral Home of Chicago\"),\n", + " (\"FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB\" , \"FDIC\"),\n", + " (\"DOWNEY SAVINGS AND LOAN ASSOCIATION, F.A.\" , \"Downey Mortgage\"),\n", + " (\"DITECH FINANCIAL LLC\" , \"Ditech\"),\n", + " (\"CITIMORTGAGE, INC.\" , \"Citi\"),\n", + " (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERFIRST MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n", + " (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERBANK MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n", + " (\"CHASE HOME FINANCE, LLC\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE FRANKLIN AMERICAN 
MORTGAGE COMPANY\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE (CIE 1)\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE\" , \"JP Morgan Chase\"),\n", + " (\"CASHCALL, INC.\" , \"CashCall\"),\n", + " (\"CAPITAL ONE, NATIONAL ASSOCIATION\" , \"Capital One\"),\n", + " (\"CALIBER HOME LOANS, INC.\" , \"Caliber Funding\"),\n", + " (\"BISHOPS GATE RESIDENTIAL MORTGAGE TRUST\" , \"Bishops Gate Mortgage\"),\n", + " (\"BANK OF AMERICA, N.A.\" , \"Bank of America\"),\n", + " (\"AMTRUST BANK\" , \"AmTrust\"),\n", + " (\"AMERISAVE MORTGAGE CORPORATION\" , \"Amerisave\"),\n", + " (\"AMERIHOME MORTGAGE COMPANY, LLC\" , \"AmeriHome Mortgage\"),\n", + " (\"ALLY BANK\" , \"Ally Bank\"),\n", + " (\"ACADEMY MORTGAGE CORPORATION\" , \"Academy Mortgage\"),\n", + " (\"NO CASH-OUT REFINANCE\" , \"OTHER REFINANCE\"),\n", + " (\"REFINANCE - NOT SPECIFIED\" , \"OTHER REFINANCE\"),\n", + " (\"Other REFINANCE\" , \"OTHER REFINANCE\")]\n", + "\n", + "cate_col_names = [\n", + " \"orig_channel\",\n", + " \"first_home_buyer\",\n", + " \"loan_purpose\",\n", + " \"property_type\",\n", + " \"occupancy_status\",\n", + " \"property_state\",\n", + " \"relocation_mortgage_indicator\",\n", + " \"seller_name\",\n", + " \"mod_flag\"\n", + "]\n", + "# Numberic columns\n", + "label_col_name = \"delinquency_12\"\n", + "numeric_col_names = [\n", + " \"orig_interest_rate\",\n", + " \"orig_upb\",\n", + " \"orig_loan_term\",\n", + " \"orig_ltv\",\n", + " \"orig_cltv\",\n", + " \"num_borrowers\",\n", + " \"dti\",\n", + " \"borrower_credit_score\",\n", + " \"num_units\",\n", + " \"zip\",\n", + " \"mortgage_insurance_percent\",\n", + " \"current_loan_delinquency_status\",\n", + " \"current_actual_upb\",\n", + " \"interest_rate\",\n", + " \"loan_age\",\n", + " \"msa\",\n", + " \"non_interest_bearing_upb\",\n", + " label_col_name\n", + "]\n", + "all_col_names = cate_col_names + numeric_col_names\n", + "\n", + "def read_perf_csv(spark, path):\n", + " return spark.read.format('csv') \\\n", + " .option('nullValue', '') \\\n", + " .option('header', 'false') \\\n", + " .option('delimiter', '|') \\\n", + " .schema(_csv_perf_schema) \\\n", + " .load(path) \\\n", + " .withColumn('quarter', _get_quarter_from_csv_file_name())\n", + "\n", + "def read_acq_csv(spark, path):\n", + " return spark.read.format('csv') \\\n", + " .option('nullValue', '') \\\n", + " .option('header', 'false') \\\n", + " .option('delimiter', '|') \\\n", + " .schema(_csv_acq_schema) \\\n", + " .load(path) \\\n", + " .withColumn('quarter', _get_quarter_from_csv_file_name())\n", + "\n", + "def _parse_dates(perf):\n", + " return perf \\\n", + " .withColumn('monthly_reporting_period', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy')) \\\n", + " .withColumn('monthly_reporting_period_month', month(col('monthly_reporting_period'))) \\\n", + " .withColumn('monthly_reporting_period_year', year(col('monthly_reporting_period'))) \\\n", + " .withColumn('monthly_reporting_period_day', dayofmonth(col('monthly_reporting_period'))) \\\n", + " .withColumn('last_paid_installment_date', to_date(col('last_paid_installment_date'), 'MM/dd/yyyy')) \\\n", + " .withColumn('foreclosed_after', to_date(col('foreclosed_after'), 'MM/dd/yyyy')) \\\n", + " .withColumn('disposition_date', to_date(col('disposition_date'), 'MM/dd/yyyy')) \\\n", + " .withColumn('maturity_date', to_date(col('maturity_date'), 'MM/yyyy')) \\\n", + " .withColumn('zero_balance_effective_date', to_date(col('zero_balance_effective_date'), 'MM/yyyy'))\n", + "\n", + "def _create_perf_deliquency(spark, perf):\n", + " 
aggDF = perf.select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " col(\"current_loan_delinquency_status\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\") \\\n", + " .agg(\n", + " max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n", + " min(\"delinquency_30\").alias(\"delinquency_30\"),\n", + " min(\"delinquency_90\").alias(\"delinquency_90\"),\n", + " min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n", + " (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n", + " (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"))\n", + " joinedDf = perf \\\n", + " .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n", + " .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n", + " .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n", + " .join(aggDF, [\"loan_id\", \"quarter\"], \"left_outer\")\n", + "\n", + " # calculate the 12 month delinquency and upb values\n", + " months = 12\n", + " monthArray = [lit(x) for x in range(0, 12)]\n", + " # explode on a small amount of data is actually slightly more efficient than a cross join\n", + " testDf = joinedDf \\\n", + " .withColumn(\"month_y\", explode(array(monthArray))) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n", + " col(\"ever_30\"),\n", + " col(\"ever_90\"),\n", + " col(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"),\n", + " col(\"loan_id\"),\n", + " col(\"month_y\"),\n", + " col(\"delinquency_12\"),\n", + " col(\"upb_12\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n", + " .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n", + " .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n", + " .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n", + " .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n", + " .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n", + " 
.drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n", + "\n", + " return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n", + " .drop(\"timestamp_year\", \"timestamp_month\")\n", + "\n", + "def _create_acquisition(spark, acq):\n", + " nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n", + " return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n", + " .drop(\"from_seller_name\") \\\n", + " .withColumn(\"old_name\", col(\"seller_name\")) \\\n", + " .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n", + " .drop(\"to_seller_name\") \\\n", + " .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n", + " .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n", + "\n", + "def _gen_dictionary(etl_df, col_names):\n", + " cnt_table = etl_df.select(posexplode(array([col(i) for i in col_names])))\\\n", + " .withColumnRenamed(\"pos\", \"column_id\")\\\n", + " .withColumnRenamed(\"col\", \"data\")\\\n", + " .filter(\"data is not null\")\\\n", + " .groupBy(\"column_id\", \"data\")\\\n", + " .count()\n", + " windowed = Window.partitionBy(\"column_id\").orderBy(desc(\"count\"))\n", + " return cnt_table.withColumn(\"id\", row_number().over(windowed)).drop(\"count\")\n", + "\n", + "\n", + "def _cast_string_columns_to_numeric(spark, input_df):\n", + " cached_dict_df = _gen_dictionary(input_df, cate_col_names).cache()\n", + " output_df = input_df\n", + " # Generate the final table with all columns being numeric.\n", + " for col_pos, col_name in enumerate(cate_col_names):\n", + " col_dict_df = cached_dict_df.filter(col(\"column_id\") == col_pos)\\\n", + " .drop(\"column_id\")\\\n", + " .withColumnRenamed(\"data\", col_name)\n", + " \n", + " output_df = output_df.join(broadcast(col_dict_df), col_name, \"left\")\\\n", + " .drop(col_name)\\\n", + " .withColumnRenamed(\"id\", col_name)\n", + " return output_df\n", + "\n", + "def run_mortgage(spark, perf, acq):\n", + " parsed_perf = _parse_dates(perf)\n", + " perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n", + " cleaned_acq = _create_acquisition(spark, acq)\n", + " df = perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\")\n", + " test_quarters = ['2016Q1','2016Q2','2016Q3','2016Q4']\n", + " train_df = df.filter(~df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " test_df = df.filter(df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " casted_train_df = _cast_string_columns_to_numeric(spark, train_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " casted_test_df = _cast_string_columns_to_numeric(spark, test_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " return casted_train_df, casted_test_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Spark conf and Create Spark Session\n", + "For details explanation for spark conf, please go to Spark RAPIDS [config guide](https://nvidia.github.io/spark-rapids/docs/configs.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sc.stop()\n", + "\n", + "conf = SparkConf().setAppName(\"MortgageETL-CPU\")\n", + "conf.set(\"spark.executor.instances\", \"20\")\n", + "conf.set(\"spark.executor.cores\", \"7\") # spark.executor.cores times spark.executor.instances should equal total cores.\n", + "conf.set(\"spark.task.cpus\", \"1\")\n", + "conf.set(\"spark.executor.memory\", \"36g\")\n", + "conf.set(\"spark.locality.wait\", \"0s\")\n", + "conf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"0\")\n", + "conf.set(\"spark.task.resource.gpu.amount\", \"0\")\n", + "conf.set(\"spark.plugins\", \" \")\n", + "conf.set(\"spark.sql.broadcastTimeout\", \"7200\")\n", + "spark = SparkSession.builder \\\n", + " .config(conf=conf) \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Data Input/Output location" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "orig_perf_path = 'gs://dataproc-nv-demo/mortgage_full/perf/*'\n", + "orig_acq_path = 'gs://dataproc-nv-demo/mortgage_full/acq/*'\n", + "train_path = 'gs://dataproc-nv-demo/mortgage_cpu/train/'\n", + "test_path = 'gs://dataproc-nv-demo/mortgage_cpu/test/'\n", + "tmp_perf_path = 'gs://dataproc-nv-demo/mortgage_parquet_cpu/perf/'\n", + "tmp_acq_path = 'gs://dataproc-nv-demo/mortgage_parquet_cpu/acq/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read CSV data and Transcode to Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets transcode the data first\n", + "start = time.time()\n", + "# we want a few big files instead of lots of small files\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\n", + "acq = read_acq_csv(spark, orig_acq_path)\n", + "acq.repartition(20).write.parquet(tmp_acq_path, mode='overwrite')\n", + "perf = read_perf_csv(spark, orig_perf_path)\n", + "perf.coalesce(80).write.parquet(tmp_perf_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Execute ETL Code Defined in 1st Cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1194.4553289413452\n", + "1813.5378119945526\n" + ] + } + ], + "source": [ + "# Now lets actually process the data\\n\",\n", + "start = time.time()\n", + "spark.conf.set('spark.sql.shuffle.partitions', '160')\n", + "perf = spark.read.parquet(tmp_perf_path)\n", + "acq = spark.read.parquet(tmp_acq_path)\n", + "train_out, test_out = run_mortgage(spark, perf, acq)\n", + "train_out.write.parquet(train_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)\n", + "test_out.write.parquet(test_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Physical Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(27) Project [coalesce(orig_channel#1675, 0) AS orig_channel#3439, coalesce(first_home_buyer#1877, 0) AS first_home_buyer#3440, coalesce(loan_purpose#2079, 0) AS 
loan_purpose#3441, coalesce(property_type#2281, 0) AS property_type#3442, coalesce(occupancy_status#2483, 0) AS occupancy_status#3443, coalesce(property_state#2685, 0) AS property_state#3444, coalesce(relocation_mortgage_indicator#2887, 0) AS relocation_mortgage_indicator#3445, coalesce(seller_name#3089, 0) AS seller_name#3446, coalesce(id#1498, 0) AS mod_flag#3447, coalesce(nanvl(orig_interest_rate#67, null), 0.0) AS orig_interest_rate#3448, coalesce(orig_upb#68, 0) AS orig_upb#3449, coalesce(orig_loan_term#69, 0) AS orig_loan_term#3450, coalesce(nanvl(orig_ltv#72, null), 0.0) AS orig_ltv#3451, coalesce(nanvl(orig_cltv#73, null), 0.0) AS orig_cltv#3452, coalesce(nanvl(num_borrowers#74, null), 0.0) AS num_borrowers#3453, coalesce(nanvl(dti#75, null), 0.0) AS dti#3454, coalesce(nanvl(borrower_credit_score#76, null), 0.0) AS borrower_credit_score#3455, coalesce(num_units#80, 0) AS num_units#3456, coalesce(zip#83, 0) AS zip#3457, coalesce(nanvl(mortgage_insurance_percent#84, null), 0.0) AS mortgage_insurance_percent#3458, coalesce(current_loan_delinquency_status#10, 0) AS current_loan_delinquency_status#3459, coalesce(nanvl(current_actual_upb#4, null), 0.0) AS current_actual_upb#3460, coalesce(nanvl(interest_rate#3, null), 0.0) AS interest_rate#3461, coalesce(nanvl(loan_age#5, null), 0.0) AS loan_age#3462, ... 3 more fields]\n", + "+- *(27) BroadcastHashJoin [mod_flag#11], [mod_flag#3157], LeftOuter, BuildRight\n", + " :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, property_type#2281, occupancy_status#2483, ... 3 more fields]\n", + " : +- *(27) BroadcastHashJoin [seller_name#1172], [seller_name#2955], LeftOuter, BuildRight\n", + " : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, property_type#2281, ... 3 more fields]\n", + " : : +- *(27) BroadcastHashJoin [relocation_mortgage_indicator#88], [relocation_mortgage_indicator#2753], LeftOuter, BuildRight\n", + " : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, ... 
3 more fields]\n", + " : : : +- *(27) BroadcastHashJoin [property_state#82], [property_state#2551], LeftOuter, BuildRight\n", + " : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, first_home_buyer#1877, ... 3 more fields]\n", + " : : : : +- *(27) BroadcastHashJoin [occupancy_status#81], [occupancy_status#2349], LeftOuter, BuildRight\n", + " : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, ... 3 more fields]\n", + " : : : : : +- *(27) BroadcastHashJoin [property_type#79], [property_type#2147], LeftOuter, BuildRight\n", + " : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, ... 3 more fields]\n", + " : : : : : : +- *(27) BroadcastHashJoin [loan_purpose#78], [loan_purpose#1945], LeftOuter, BuildRight\n", + " : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, ... 3 more fields]\n", + " : : : : : : : +- *(27) BroadcastHashJoin [first_home_buyer#77], [first_home_buyer#1743], LeftOuter, BuildRight\n", + " : : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, first_home_buyer#77, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, ... 
3 more fields]\n", + " : : : : : : : : +- *(27) BroadcastHashJoin [orig_channel#65], [orig_channel#1541], LeftOuter, BuildRight\n", + " : : : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, orig_channel#65, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, first_home_buyer#77, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, ... 3 more fields]\n", + " : : : : : : : : : +- *(27) SortMergeJoin [loan_id#0L, quarter#31], [loan_id#64L, quarter#89], Inner\n", + " : : : : : : : : : :- *(11) Sort [loan_id#0L ASC NULLS FIRST, quarter#31 ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : +- Exchange hashpartitioning(loan_id#0L, quarter#31, 160), true, [id=#2011]\n", + " : : : : : : : : : : +- *(10) Project [quarter#31, loan_id#0L, interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812]\n", + " : : : : : : : : : : +- SortMergeJoin [quarter#31, loan_id#0L, cast(timestamp_year#876 as bigint), cast(timestamp_month#840 as bigint)], [quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L], LeftOuter\n", + " : : : : : : : : : : :- *(2) Sort [quarter#31 ASC NULLS FIRST, loan_id#0L ASC NULLS FIRST, cast(timestamp_year#876 as bigint) ASC NULLS FIRST, cast(timestamp_month#840 as bigint) ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : : +- Exchange hashpartitioning(quarter#31, loan_id#0L, cast(timestamp_year#876 as bigint), cast(timestamp_month#840 as bigint), 160), true, [id=#1968]\n", + " : : : : : : : : : : : +- *(1) Project [loan_id#0L, interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, quarter#31, month(cast(cast(unix_timestamp(monthly_reporting_period#1, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#840, year(cast(cast(unix_timestamp(monthly_reporting_period#1, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#876]\n", + " : : : : : : : : : : : +- *(1) Filter ((NOT quarter#31 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(loan_id#0L)) AND isnotnull(quarter#31))\n", + " : : : : : : : : : : : +- *(1) ColumnarToRow\n", + " : : : : : : : : : : : +- FileScan parquet [loan_id#0L,monthly_reporting_period#1,interest_rate#3,current_actual_upb#4,loan_age#5,msa#9,current_loan_delinquency_status#10,mod_flag#11,non_interest_bearing_upb#26,quarter#31] Batched: true, DataFilters: [NOT quarter#31 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#0L), isnotnull(quarter#31)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) 
END AS delinquency_180#436]\n", + " : : : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN 
(2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, 
delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, 
MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, 
year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - 
month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN 
(current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, 
year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : +- *(5) ColumnarToRow\n", + " : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, 
ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : +- *(3) ColumnarToRow\n", + " : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) 
as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : +- *(5) ColumnarToRow\n", + " : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : +- *(3) Filter ((NOT quarter#943 IN 
(2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : +- *(3) ColumnarToRow\n", + " : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : +- *(5) ColumnarToRow\n", + " : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 
24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : +- *(3) ColumnarToRow\n", + " : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : 
+- *(5) ColumnarToRow\n", + " : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : +- *(3) ColumnarToRow\n", + " : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: 
InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : +- *(5) ColumnarToRow\n", + " : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\") \\\n", + " .agg(\n", + " max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n", + " min(\"delinquency_30\").alias(\"delinquency_30\"),\n", + " min(\"delinquency_90\").alias(\"delinquency_90\"),\n", + " min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n", + " (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n", + " (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"))\n", + " joinedDf = perf \\\n", + " .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n", + " .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n", + " .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n", + " .join(aggDF, 
[\"loan_id\", \"quarter\"], \"left_outer\")\n", + "\n", + " # calculate the 12 month delinquency and upb values\n", + " months = 12\n", + " monthArray = [lit(x) for x in range(0, 12)]\n", + " # explode on a small amount of data is actually slightly more efficient than a cross join\n", + " testDf = joinedDf \\\n", + " .withColumn(\"month_y\", explode(array(monthArray))) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n", + " col(\"ever_30\"),\n", + " col(\"ever_90\"),\n", + " col(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"),\n", + " col(\"loan_id\"),\n", + " col(\"month_y\"),\n", + " col(\"delinquency_12\"),\n", + " col(\"upb_12\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n", + " .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n", + " .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n", + " .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n", + " .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n", + " .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n", + " .drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n", + "\n", + " return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n", + " .drop(\"timestamp_year\", \"timestamp_month\")\n", + "\n", + "def _create_acquisition(spark, acq):\n", + " nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n", + " return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n", + " .drop(\"from_seller_name\") \\\n", + " .withColumn(\"old_name\", col(\"seller_name\")) \\\n", + " .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n", + " .drop(\"to_seller_name\") \\\n", + " .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n", + " .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n", + "\n", + "def _gen_dictionary(etl_df, col_names):\n", + " cnt_table = etl_df.select(posexplode(array([col(i) for i in col_names])))\\\n", + " .withColumnRenamed(\"pos\", \"column_id\")\\\n", + " .withColumnRenamed(\"col\", \"data\")\\\n", + " .filter(\"data is not null\")\\\n", + " .groupBy(\"column_id\", \"data\")\\\n", + " .count()\n", + " windowed = Window.partitionBy(\"column_id\").orderBy(desc(\"count\"))\n", + " return cnt_table.withColumn(\"id\", row_number().over(windowed)).drop(\"count\")\n", + "\n", + "\n", + "def _cast_string_columns_to_numeric(spark, input_df):\n", + " cached_dict_df = _gen_dictionary(input_df, cate_col_names).cache()\n", + " output_df = 
input_df\n", + " # Generate the final table with all columns being numeric.\n", + " for col_pos, col_name in enumerate(cate_col_names):\n", + " col_dict_df = cached_dict_df.filter(col(\"column_id\") == col_pos)\\\n", + " .drop(\"column_id\")\\\n", + " .withColumnRenamed(\"data\", col_name)\n", + " \n", + " output_df = output_df.join(broadcast(col_dict_df), col_name, \"left\")\\\n", + " .drop(col_name)\\\n", + " .withColumnRenamed(\"id\", col_name)\n", + " return output_df\n", + "\n", + "def run_mortgage(spark, perf, acq):\n", + " parsed_perf = _parse_dates(perf)\n", + " perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n", + " cleaned_acq = _create_acquisition(spark, acq)\n", + " df = perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\")\n", + " test_quarters = ['2016Q1','2016Q2','2016Q3','2016Q4']\n", + " train_df = df.filter(~df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " test_df = df.filter(df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " casted_train_df = _cast_string_columns_to_numeric(spark, train_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " casted_test_df = _cast_string_columns_to_numeric(spark, test_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " return casted_train_df, casted_test_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Spark conf and Create Spark Session\n", + "For details explanation for spark conf, please go to Spark RAPIDS [config guide](https://nvidia.github.io/spark-rapids/docs/configs.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sc.stop()\n", + "\n", + "conf = SparkConf().setAppName(\"MortgageETL\")\n", + "conf.set('spark.rapids.sql.explain', 'ALL')\n", + "conf.set(\"spark.executor.instances\", \"20\")\n", + "conf.set(\"spark.executor.cores\", \"7\")\n", + "conf.set(\"spark.task.cpus\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", \"2\")\n", + "conf.set(\"spark.executor.memory\", \"4g\")\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", \"2G\")\n", + "conf.set(\"spark.executor.memoryOverhead\", \"2G\")\n", + "conf.set(\"spark.executor.extraJavaOptions\", \"-Dai.rapids.cudf.prefer-pinned=true\")\n", + "conf.set(\"spark.locality.wait\", \"0s\")\n", + "conf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.task.resource.gpu.amount\", \"0.142\")\n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", + "conf.set('spark.rapids.sql.batchSizeBytes', '512M')\n", + "conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')\n", + "conf.set('spark.rapids.sql.variableFloatAgg.enabled', 'true')\n", + "\n", + "spark = SparkSession.builder \\\n", + " .config(conf=conf) \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Data Input/Output location" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "orig_perf_path = 'gs://dataproc-nv-demo/mortgage_full/perf/*'\n", + "orig_acq_path = 'gs://dataproc-nv-demo/mortgage_full/acq/*'\n", + "\n", + "train_path = 
'gs://dataproc-nv-demo/mortgage_full/train/'\n", + "test_path = 'gs://dataproc-nv-demo/mortgage_full/test/'\n", + "tmp_perf_path = 'gs://dataproc-nv-demo/mortgage_parquet_gpu/perf/'\n", + "tmp_acq_path = 'gs://dataproc-nv-demo/mortgage_parquet_gpu/acq/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read CSV data and Transcode to Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "108.28529238700867\n" + ] + } + ], + "source": [ + "# Lets transcode the data first\n", + "start = time.time()\n", + "# we want a few big files instead of lots of small files\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\n", + "acq = read_acq_csv(spark, orig_acq_path)\n", + "acq.repartition(20).write.parquet(tmp_acq_path, mode='overwrite')\n", + "perf = read_perf_csv(spark, orig_perf_path)\n", + "perf.coalesce(80).write.parquet(tmp_perf_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Execute ETL Code Defined in 1st Cell" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "137.99262690544128\n", + "171.97584056854248\n" + ] + } + ], + "source": [ + "# Now lets actually process the data\\n\",\n", + "start = time.time()\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\n", + "spark.conf.set('spark.sql.shuffle.partitions', '160')\n", + "perf = spark.read.parquet(tmp_perf_path)\n", + "acq = spark.read.parquet(tmp_acq_path)\n", + "train_out, test_out = run_mortgage(spark, perf, acq)\n", + "train_out.write.parquet(train_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)\n", + "test_out.write.parquet(test_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Physical Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(2) GpuColumnarToRow false\n", + "+- GpuProject [gpucoalesce(orig_channel#1922, 0) AS orig_channel#3686, gpucoalesce(first_home_buyer#2124, 0) AS first_home_buyer#3687, gpucoalesce(loan_purpose#2326, 0) AS loan_purpose#3688, gpucoalesce(property_type#2528, 0) AS property_type#3689, gpucoalesce(occupancy_status#2730, 0) AS occupancy_status#3690, gpucoalesce(property_state#2932, 0) AS property_state#3691, gpucoalesce(relocation_mortgage_indicator#3134, 0) AS relocation_mortgage_indicator#3692, gpucoalesce(seller_name#3336, 0) AS seller_name#3693, gpucoalesce(id#1728, 0) AS mod_flag#3694, gpucoalesce(gpunanvl(orig_interest_rate#297, null), 0.0) AS orig_interest_rate#3695, gpucoalesce(orig_upb#298, 0) AS orig_upb#3696, gpucoalesce(orig_loan_term#299, 0) AS orig_loan_term#3697, gpucoalesce(gpunanvl(orig_ltv#302, null), 0.0) AS orig_ltv#3698, gpucoalesce(gpunanvl(orig_cltv#303, null), 0.0) AS orig_cltv#3699, gpucoalesce(gpunanvl(num_borrowers#304, null), 0.0) AS num_borrowers#3700, gpucoalesce(gpunanvl(dti#305, null), 0.0) AS dti#3701, gpucoalesce(gpunanvl(borrower_credit_score#306, null), 0.0) AS borrower_credit_score#3702, gpucoalesce(num_units#310, 0) AS num_units#3703, gpucoalesce(zip#313, 0) AS zip#3704, 
gpucoalesce(gpunanvl(mortgage_insurance_percent#314, null), 0.0) AS mortgage_insurance_percent#3705, gpucoalesce(current_loan_delinquency_status#240, 0) AS current_loan_delinquency_status#3706, gpucoalesce(gpunanvl(current_actual_upb#234, null), 0.0) AS current_actual_upb#3707, gpucoalesce(gpunanvl(interest_rate#233, null), 0.0) AS interest_rate#3708, gpucoalesce(gpunanvl(loan_age#235, null), 0.0) AS loan_age#3709, ... 3 more fields]\n", + " +- GpuBroadcastHashJoin [mod_flag#241], [mod_flag#3404], LeftOuter, BuildRight\n", + " :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, property_type#2528, occupancy_status#2730, ... 3 more fields]\n", + " : +- GpuBroadcastHashJoin [seller_name#1402], [seller_name#3202], LeftOuter, BuildRight\n", + " : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, property_type#2528, ... 3 more fields]\n", + " : : +- GpuBroadcastHashJoin [relocation_mortgage_indicator#318], [relocation_mortgage_indicator#3000], LeftOuter, BuildRight\n", + " : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, ... 3 more fields]\n", + " : : : +- GpuBroadcastHashJoin [property_state#312], [property_state#2798], LeftOuter, BuildRight\n", + " : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, first_home_buyer#2124, ... 
3 more fields]\n", + " : : : : +- GpuBroadcastHashJoin [occupancy_status#311], [occupancy_status#2596], LeftOuter, BuildRight\n", + " : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, ... 3 more fields]\n", + " : : : : : +- GpuBroadcastHashJoin [property_type#309], [property_type#2394], LeftOuter, BuildRight\n", + " : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, ... 3 more fields]\n", + " : : : : : : +- GpuBroadcastHashJoin [loan_purpose#308], [loan_purpose#2192], LeftOuter, BuildRight\n", + " : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, ... 3 more fields]\n", + " : : : : : : : +- GpuBroadcastHashJoin [first_home_buyer#307], [first_home_buyer#1990], LeftOuter, BuildRight\n", + " : : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, first_home_buyer#307, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, ... 3 more fields]\n", + " : : : : : : : : +- GpuBroadcastHashJoin [orig_channel#295], [orig_channel#1788], LeftOuter, BuildRight\n", + " : : : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, orig_channel#295, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, first_home_buyer#307, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, ... 
3 more fields]\n", + " : : : : : : : : : +- GpuShuffledHashJoin [loan_id#230L, quarter#261], [loan_id#294L, quarter#319], Inner, BuildRight\n", + " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(loan_id#230L, quarter#261, 160), true, [id=#3294]\n", + " : : : : : : : : : : +- GpuProject [quarter#261, loan_id#230L, interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042]\n", + " : : : : : : : : : : +- GpuShuffledHashJoin [quarter#261, loan_id#230L, cast(timestamp_year#1106 as bigint), cast(timestamp_month#1070 as bigint)], [quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L], LeftOuter, BuildRight\n", + " : : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#261, loan_id#230L, cast(timestamp_year#1106 as bigint), cast(timestamp_month#1070 as bigint), 160), true, [id=#3124]\n", + " : : : : : : : : : : : +- GpuProject [loan_id#230L, interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, quarter#261, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#231, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#1070, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#231, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#1106]\n", + " : : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : : +- GpuFilter ((NOT quarter#261 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#230L)) AND gpuisnotnull(quarter#261))\n", + " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#230L,monthly_reporting_period#231,interest_rate#233,current_actual_upb#234,loan_age#235,msa#239,current_loan_delinquency_status#240,mod_flag#241,non_interest_bearing_upb#256,quarter#261] Batched: true, DataFilters: [NOT quarter#261 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#230L), isnotnull(quarter#261)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], 
ReadSchema: struct\n", + " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: 
[Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE 
(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, 
CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, 
gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, 
delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), 
partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, 
delinquency_180#686]\n", + " : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, 
timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), 
gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, 
timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: 
InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), 
IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- GpuCoalesceBatches RequireSingleBatch\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - 
month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- GpuCoalesceBatches RequireSingleBatch\n", + " : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: 
[isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n

Introduction to XGBoost Spark with GPU

\n

Mortgage is an example of using the XGBoost classifier for binary classification. This notebook shows how to load data, train the XGBoost model, and use the model to predict whether a loan will become delinquent. Compared to the original XGBoost Spark code, there is only one API difference.

\n

Load libraries

\n

First, load some common libraries that are used by both the GPU and CPU versions of XGBoost.

\n\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580281_1080045385", + "id": "20200712-043620_382811823", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:45+0000", + "dateFinished": "2020-07-13T02:18:45+0000", + "status": "FINISHED", + "focus": true, + "$$hashKey": "object:11086" + }, + { + "text": "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\nimport org.apache.spark.SparkConf", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:45+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\nimport org.apache.spark.SparkConf\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_314340064", + "id": "20200712-043620_1400821320", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:45+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11087" + }, + { + "text": "%md\nBesides CPU version requires some extra libraries, such as:\n\n```scala\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.sql.DataFrame\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.sql.types.FloatType\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

In addition, the CPU version requires some extra libraries, such as:

\n
import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.sql.DataFrame\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.sql.types.FloatType\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_1068889472", + "id": "20200712-043620_1625961573", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11088" + }, + { + "title": "Set the dataset path", + "text": "// Update all path with your Dataproc Environment\nval trainPath = \"gs://dataproc-nv-demo/mortgage_full/train/\"\nval evalPath = \"gs://dataproc-nv-demo/mortgage_full/test/\"\nval transPath = \"gs://dataproc-nv-demo/mortgage_full/test/\"\nval modelPath = \"gs://dataproc-nv-demo/mortgage_full/model/mortgage\"", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + }, + "title": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mtrainPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/train/\n\u001b[1m\u001b[34mevalPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/test/\n\u001b[1m\u001b[34mtransPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/test/\n\u001b[1m\u001b[34mmodelPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/model/mortgage\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_1437224612", + "id": "20200712-043620_1955827407", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11089" + }, + { + "text": "%md\n## Build the schema and parameters\nThe mortgage data has 27 columns: 26 features and 1 label. \"deinquency_12\" is the label column. The schema will be used to load data in the future.\n\nThe next block also defines some key parameters used in xgboost training process.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Build the schema and parameters

\n

The mortgage data has 27 columns: 26 features and 1 label. “delinquency_12” is the label column. The schema will be used to load the data later.

\n

The next block also defines some key parameters used in the XGBoost training process.

\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_433144999", + "id": "20200712-043620_2043825692", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11090" + }, + { + "text": "val labelColName = \"delinquency_12\"\nval schema = StructType(List(\n StructField(\"orig_channel\", DoubleType),\n StructField(\"first_home_buyer\", DoubleType),\n StructField(\"loan_purpose\", DoubleType),\n StructField(\"property_type\", DoubleType),\n StructField(\"occupancy_status\", DoubleType),\n StructField(\"property_state\", DoubleType),\n StructField(\"product_type\", DoubleType),\n StructField(\"relocation_mortgage_indicator\", DoubleType),\n StructField(\"seller_name\", DoubleType),\n StructField(\"mod_flag\", DoubleType),\n StructField(\"orig_interest_rate\", DoubleType),\n StructField(\"orig_upb\", IntegerType),\n StructField(\"orig_loan_term\", IntegerType),\n StructField(\"orig_ltv\", DoubleType),\n StructField(\"orig_cltv\", DoubleType),\n StructField(\"num_borrowers\", DoubleType),\n StructField(\"dti\", DoubleType),\n StructField(\"borrower_credit_score\", DoubleType),\n StructField(\"num_units\", IntegerType),\n StructField(\"zip\", IntegerType),\n StructField(\"mortgage_insurance_percent\", DoubleType),\n StructField(\"current_loan_delinquency_status\", IntegerType),\n StructField(\"current_actual_upb\", DoubleType),\n StructField(\"interest_rate\", DoubleType),\n StructField(\"loan_age\", DoubleType),\n StructField(\"msa\", DoubleType),\n StructField(\"non_interest_bearing_upb\", DoubleType),\n StructField(labelColName, IntegerType)))\n\nval featureNames = schema.filter(_.name != labelColName).map(_.name)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mlabelColName\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = delinquency_12\n\u001b[1m\u001b[34mschema\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.types.StructType\u001b[0m = StructType(StructField(orig_channel,DoubleType,true), StructField(first_home_buyer,DoubleType,true), StructField(loan_purpose,DoubleType,true), StructField(property_type,DoubleType,true), StructField(occupancy_status,DoubleType,true), StructField(property_state,DoubleType,true), StructField(product_type,DoubleType,true), StructField(relocation_mortgage_indicator,DoubleType,true), StructField(seller_name,DoubleType,true), StructField(mod_flag,DoubleType,true), StructField(orig_interest_rate,DoubleType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,DoubleType,true), StructField(orig_cltv...\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_-318188050", + "id": "20200712-043620_542099397", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11091" + }, + { + 
"text": "%md\n## Create a new spark session and load data\n\nA new spark session should be created to continue all the following spark operations.\n\nNOTE: in this notebook, the dependency jars have been loaded when installing toree kernel. Alternatively the jars can be loaded into notebook by [%AddJar magic](https://toree.incubator.apache.org/docs/current/user/faq/). However, there's one restriction for `%AddJar`: the jar uploaded can only be available when `AddJar` is called just after a new spark session is created. Do it as below:\n\n```scala\nimport org.apache.spark.sql.SparkSession\nval spark = SparkSession.builder().appName(\"mortgage-GPU\").getOrCreate\n%AddJar file:/data/libs/cudf-XXX-cuda10.jar\n%AddJar file:/data/libs/rapids-4-spark-XXX.jar\n%AddJar file:/data/libs/xgboost4j_3.0-XXX.jar\n%AddJar file:/data/libs/xgboost4j-spark_3.0-XXX.jar\n// ...\n```\n\n##### Please note the new jar \"rapids-4-spark-XXX.jar\" is only needed for GPU version, you can not add it to dependence list for CPU version.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:47+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Create a new spark session and load data

\n

A new Spark session should be created to run all of the following Spark operations.

\n

NOTE: in this notebook, the dependency jars were loaded when the Toree kernel was installed. Alternatively, the jars can be loaded into the notebook with the %AddJar magic. However, there is one restriction for %AddJar: an uploaded jar is only available when AddJar is called right after a new Spark session is created, as shown below:

\n
import org.apache.spark.sql.SparkSession\nval spark = SparkSession.builder().appName("mortgage-GPU").getOrCreate\n%AddJar file:/data/libs/cudf-XXX-cuda10.jar\n%AddJar file:/data/libs/rapids-4-spark-XXX.jar\n%AddJar file:/data/libs/xgboost4j_3.0-XXX.jar\n%AddJar file:/data/libs/xgboost4j-spark_3.0-XXX.jar\n// ...\n
\n
Please note the new jar “rapids-4-spark-XXX.jar” is only needed for the GPU version; do not add it to the dependency list for the CPU version.
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1107372761", + "id": "20200712-043620_889594738", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:47+0000", + "dateFinished": "2020-07-13T02:18:47+0000", + "status": "FINISHED", + "$$hashKey": "object:11092" + }, + { + "text": "// Build the spark session and data reader as usual\nval conf = new SparkConf()\nconf.set(\"spark.executor.instances\", \"20\")\nconf.set(\"spark.executor.cores\", \"7\")\nconf.set(\"spark.task.cpus\", \"7\")\nconf.set(\"spark.executor.memory\", \"24g\")\nconf.set(\"spark.rapids.memory.pinnedPool.size\", \"2G\")\nconf.set(\"spark.executor.memoryOverhead\", \"16G\")\nconf.set(\"spark.executor.extraJavaOptions\", \"-Dai.rapids.cudf.prefer-pinned=true\")\nconf.set(\"spark.locality.wait\", \"0s\")\nconf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\nconf.set(\"spark.executor.resource.gpu.amount\", \"1\")\nconf.set(\"spark.task.resource.gpu.amount\", \"1\")\nconf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\nconf.set(\"spark.rapids.sql.hasNans\", \"false\")\nconf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\nconf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\nconf.set(\"spark.rapids.sql.variableFloatAgg.enabled\", \"true\")\nconf.set(\"spark.rapids.memory.gpu.pooling.enabled\", \"false\")\n// conf.set(\"spark.rapids.memory.gpu.allocFraction\", \"0.1\")\nval spark = SparkSession.builder.appName(\"mortgage-gpu\")\n .enableHiveSupport()\n .config(conf)\n .getOrCreate\nval reader = spark.read.option(\"header\", true).schema(schema)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:47+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mconf\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.SparkConf\u001b[0m = org.apache.spark.SparkConf@1aab0102\n\u001b[1m\u001b[34mspark\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.SparkSession\u001b[0m = org.apache.spark.sql.SparkSession@1239890f\n\u001b[1m\u001b[34mreader\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrameReader\u001b[0m = org.apache.spark.sql.DataFrameReader@7a9bb956\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-892064929", + "id": "20200712-043620_622739089", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:47+0000", + "dateFinished": "2020-07-13T02:18:53+0000", + "status": "FINISHED", + "$$hashKey": "object:11093" + }, + { + "text": "val trainSet = reader.parquet(trainPath)\nval evalSet = reader.parquet(evalPath)\nval transSet = reader.parquet(transPath)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:53+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": 
"\u001b[1m\u001b[34mtrainSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n\u001b[1m\u001b[34mevalSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n\u001b[1m\u001b[34mtransSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_1108385932", + "id": "20200712-043620_562533619", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:53+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11094" + }, + { + "text": "%md\n## Set xgboost parameters and build a XGBoostClassifier\n\nFor CPU version, `num_workers` is recommended being equal to the number of CPU cores, while for GPU version, it should be set to the number of GPUs in Spark cluster.\n\nBesides the `tree_method` for CPU version is also different from that for GPU version. Now only \"gpu_hist\" is supported for training on GPU.\n\n```scala\n// difference in parameters\n \"num_workers\" -> 12,\n \"tree_method\" -> \"hist\",\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Set xgboost parameters and build a XGBoostClassifier

\n

For the CPU version, num_workers is recommended to equal the number of CPU cores, while for the GPU version it should be set to the number of GPUs in the Spark cluster.

\n

The tree_method for the CPU version also differs from the GPU version; currently only “gpu_hist” is supported for training on GPU.

\n
// difference in parameters\n  "num_workers" -> 12,\n  "tree_method" -> "hist",\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-880026833", + "id": "20200712-043620_1948369426", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11095" + }, + { + "text": "val commParamMap = Map(\n \"eta\" -> 0.1,\n \"gamma\" -> 0.1,\n \"missing\" -> 0.0,\n \"max_depth\" -> 10,\n \"max_leaves\" -> 256,\n \"objective\" -> \"binary:logistic\",\n \"grow_policy\" -> \"depthwise\",\n \"min_child_weight\" -> 30,\n \"lambda\" -> 1,\n \"scale_pos_weight\" -> 2,\n \"subsample\" -> 1,\n \"num_round\" -> 100)\n \nval xgbParamFinal = commParamMap ++ Map(\"tree_method\" -> \"gpu_hist\", \"num_workers\" -> 20, \"nthread\" -> 7)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mcommParamMap\u001b[0m: \u001b[1m\u001b[32mscala.collection.immutable.Map[String,Any]\u001b[0m = Map(min_child_weight -> 30, grow_policy -> depthwise, scale_pos_weight -> 2, subsample -> 1, lambda -> 1, max_depth -> 10, objective -> binary:logistic, num_round -> 100, missing -> 0.0, eta -> 0.1, max_leaves -> 256, gamma -> 0.1)\n\u001b[1m\u001b[34mxgbParamFinal\u001b[0m: \u001b[1m\u001b[32mscala.collection.immutable.Map[String,Any]\u001b[0m = Map(min_child_weight -> 30, grow_policy -> depthwise, scale_pos_weight -> 2, num_workers -> 20, subsample -> 1, lambda -> 1, max_depth -> 10, objective -> binary:logistic, num_round -> 100, missing -> 0.0, tree_method -> gpu_hist, eta -> 0.1, max_leaves -> 256, gamma -> 0.1, nthread -> 7)\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_312126552", + "id": "20200712-043620_726034129", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11096" + }, + { + "text": "%md\nHere comes the only API difference,`setFeaturesCol` in CPU version vs `setFeaturesCols` in GPU version.\n\nIn previous block, it said that CPU version needs `VectorAssembler` to assemble multiple feature columns into one column, because `setFeaturesCol` only accepts one feature column with the type of `vector`.\n\nBut `setFeaturesCols` supports multiple columns directly, so set the feautres column names directly to `XGBoostClassifier`. \n\nCPU version:\n\n```scala\nval xgbClassifier = new XGBoostClassifier(paramMap)\n .setLabelCol(labelName)\n .setFeaturesCol(\"features\")\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Here comes the only API difference: setFeaturesCol in the CPU version vs setFeaturesCols in the GPU version.

\n

As noted in the previous block, the CPU version needs VectorAssembler to assemble multiple feature columns into a single column, because setFeaturesCol accepts only one feature column of type vector.

\n

But setFeaturesCols supports multiple columns directly, so the feature column names can be set directly on XGBoostClassifier.

\n

CPU version:

\n
val xgbClassifier  = new XGBoostClassifier(paramMap)\n  .setLabelCol(labelName)\n  .setFeaturesCol("features")\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_1889609272", + "id": "20200712-043620_531120952", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11097" + }, + { + "text": "val xgbClassifier = new XGBoostClassifier(xgbParamFinal)\n .setLabelCol(labelColName)\n // === diff ===\n .setFeaturesCols(featureNames)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mxgbClassifier\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassifier\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1143522441", + "id": "20200712-043620_427072123", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11098" + }, + { + "text": "%md\n## Benchmark and train\nThe object `benchmark` is used to compute the elapsed time of some operations.\n\nTraining with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n\n* Call API `setEvalSets` after initializing an XGBoostClassifier\n\n```scala\nxgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n\n```\n\n* Use parameter `eval_sets` when initializing an XGBoostClassifier\n\n```scala\nval paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\nval xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n```\n\nHere chooses the API way to set evaluation sets.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Benchmark and train

\n

The object benchmark is used to compute the elapsed time of some operations.

\n
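As a quick usage sketch (not part of the original notebook), the benchmark helper defined a few cells below takes a block by name and returns both the block's result and the elapsed seconds; the count here is only a placeholder operation:

```scala
// benchmark("phase") { block } times the block and returns (result, seconds).
val (rowCount, elapsedSeconds) = benchmark("count trainSet") {
  trainSet.count()
}
println(s"Counted $rowCount rows in $elapsedSeconds s")
```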

Training with evaluation sets is also supported, in the same two ways as the CPU version:

\n
    \n
  • Call API setEvalSets after initializing an XGBoostClassifier
  • \n
\n
xgbClassifier.setEvalSets(Map("eval" -> evalSet))\n\n
\n
    \n
  • Use parameter eval_sets when initializing an XGBoostClassifier
  • \n
\n
val paramMapWithEval = paramMap + ("eval_sets" -> Map("eval" -> evalSet))\nval xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n
\n

Here the API approach is used to set the evaluation sets.

\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-268123036", + "id": "20200712-043620_1915241764", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11099" + }, + { + "text": "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mres86\u001b[0m: \u001b[1m\u001b[32mxgbClassifier.type\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1163292247", + "id": "20200712-043620_324775014", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11100" + }, + { + "text": "def benchmark[R](phase: String)(block: => R): (R, Float) = {\n val t0 = System.currentTimeMillis\n val result = block // call-by-name\n val t1 = System.currentTimeMillis\n println(\"Elapsed time [\" + phase + \"]: \" + ((t1 - t0).toFloat / 1000) + \"s\")\n (result, (t1 - t0).toFloat / 1000)\n}", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mbenchmark\u001b[0m: \u001b[1m\u001b[32m[R](phase: String)(block: => R)(R, Float)\u001b[0m\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-196014933", + "id": "20200712-043620_1233757982", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11101" + }, + { + "text": "%md\nCPU version reqires an extra step before fitting data to classifier, using `VectorAssembler` to assemble all feature columns into one column. The following code snip shows how to do the vectorizing.\n\n```scala\nobject Vectorize {\n def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {\n val toFloat = df.schema.map(f => col(f.name).cast(FloatType))\n new VectorAssembler()\n .setInputCols(featureNames.toArray)\n .setOutputCol(\"features\")\n .transform(df.select(toFloat:_*))\n .select(col(\"features\"), col(labelName))\n }\n}\n\ntrainSet = Vectorize(trainSet, featureCols, labelName)\nevalSet = Vectorize(evalSet, featureCols, labelName)\ntransSet = Vectorize(transSet, featureCols, labelName)\n\n```\n\n`VectorAssembler` is not needed for GPU version. 
Just fit the loaded data directly to XGBoostClassifier.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:56+0000", + "config": { + "editorMode": "ace/mode/text", + "editorHide": false, + "editorSetting": { + "language": "text", + "editOnDblClick": false + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

The CPU version requires an extra step before fitting the data to the classifier: VectorAssembler is used to assemble all feature columns into one column. The following code snippet shows how to do the vectorizing.

\n
object Vectorize {\n  def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {\n    val toFloat = df.schema.map(f => col(f.name).cast(FloatType))\n    new VectorAssembler()\n      .setInputCols(featureNames.toArray)\n      .setOutputCol("features")\n      .transform(df.select(toFloat:_*))\n      .select(col("features"), col(labelName))\n  }\n}\n\ntrainSet = Vectorize(trainSet, featureCols, labelName)\nevalSet = Vectorize(evalSet, featureCols, labelName)\ntransSet = Vectorize(transSet, featureCols, labelName)\n\n
\n

VectorAssembler is not needed for the GPU version; just fit the loaded data directly to XGBoostClassifier.

\n\n
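For contrast, a minimal sketch of the GPU path (assuming the trainSet loaded earlier still carries the individual feature columns); the actual training cell below wraps the same fit call in the benchmark helper:

```scala
// GPU path sketch: no VectorAssembler step, feature columns are passed by name.
val gpuModelSketch = new XGBoostClassifier(xgbParamFinal)
  .setLabelCol(labelColName)
  .setFeaturesCols(featureNames)
  .fit(trainSet)
```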
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-1513881670", + "id": "20200712-043620_618156060", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:56+0000", + "dateFinished": "2020-07-13T02:18:56+0000", + "status": "FINISHED", + "$$hashKey": "object:11102" + }, + { + "text": "// Start training\nprintln(\"\\n------ Training ------\")\nval (xgbClassificationModel, _) = benchmark(\"train\") {\n xgbClassifier.fit(trainSet)\n}", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:56+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-695049679", + "id": "20200712-043620_1418358219", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:56+0000", + "dateFinished": "2020-07-13T02:26:51+0000", + "status": "FINISHED", + "$$hashKey": "object:11103", + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\n------ Training ------\nTracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.164.0.17, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=20}\nElapsed time [train]: 475.008s\n\u001b[1m\u001b[34mxgbClassificationModel\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + } + }, + { + "text": "%md\n## Transformation and evaluation\nHere uses `transSet` to evaluate our model and prints some useful columns to show our prediction result. After that `MulticlassClassificationEvaluator` is used to calculate an overall accuracy of our predictions.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:26:51+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Transformation and evaluation

\n

Here transSet is used to evaluate our model, and a few useful columns are printed to show the prediction results. After that, MulticlassClassificationEvaluator is used to calculate an overall accuracy for our predictions.

\n\n
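One caveat: MulticlassClassificationEvaluator defaults to the f1 metric (visible as metricName=f1 in the cell output below), so the value printed as accuracy is, strictly speaking, an F1 score. A hedged sketch of requesting plain accuracy instead, reusing the results DataFrame produced in the next cell:

```scala
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Explicitly request the "accuracy" metric instead of the default "f1".
val accuracyEvaluator = new MulticlassClassificationEvaluator()
  .setLabelCol(labelColName)
  .setMetricName("accuracy")
val plainAccuracy = accuracyEvaluator.evaluate(results)
```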
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_1090201866", + "id": "20200712-043620_470610364", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:26:51+0000", + "dateFinished": "2020-07-13T02:26:51+0000", + "status": "FINISHED", + "$$hashKey": "object:11104" + }, + { + "text": "println(\"\\n------ Transforming ------\")\nval (results, _) = benchmark(\"transform\") {\n val ret = xgbClassificationModel.transform(transSet).cache()\n ret\n}\nz.show(results.select(\"orig_channel\", labelColName,\"rawPrediction\",\"probability\",\"prediction\").limit(10))\n\nprintln(\"\\n------Accuracy of Evaluation------\")\nval evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)\nval accuracy = evaluator.evaluate(results)\nprintln(accuracy)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:26:51+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": { + "1": { + "graph": { + "mode": "table", + "height": 300, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "orig_channel": "string", + "delinquency_12": "string", + "rawPrediction": "string", + "probability": "string", + "prediction": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\n------ Transforming ------\nElapsed time [transform]: 0.143s\n" + }, + { + "type": "TABLE", + "data": 
"orig_channel\tdelinquency_12\trawPrediction\tprobability\tprediction\n4.9E-324\t0\t[5.001231670379639,-5.001231670379639]\t[0.9933153325691819,0.006684667430818081]\t0.0\n1.0E-323\t0\t[6.777693748474121,-6.777693748474121]\t[0.9988623971585184,0.0011376028414815664]\t0.0\n4.9E-324\t0\t[7.609184741973877,-7.609184741973877]\t[0.999504369799979,4.956302000209689E-4]\t0.0\n1.0E-323\t0\t[8.442628860473633,-8.442628860473633]\t[0.9997845634934492,2.1543650655075908E-4]\t0.0\n1.0E-323\t0\t[8.08891773223877,-8.08891773223877]\t[0.9996931724308524,3.068275691475719E-4]\t0.0\n4.9E-324\t0\t[8.863614082336426,-8.863614082336426]\t[0.999858577051782,1.4142294821795076E-4]\t0.0\n1.0E-323\t0\t[8.85793399810791,-8.85793399810791]\t[0.9998577715887222,1.422284112777561E-4]\t0.0\n4.9E-324\t0\t[7.265506744384766,-7.265506744384766]\t[0.9993012417689897,6.98758231010288E-4]\t0.0\n4.9E-324\t0\t[5.615269184112549,-5.615269184112549]\t[0.9963713854085654,0.003628614591434598]\t0.0\n4.9E-324\t0\t[6.023037910461426,-6.023037910461426]\t[0.997583553660661,0.002416446339339018]\t0.0\n" + }, + { + "type": "TEXT", + "data": "\n------Accuracy of Evaluation------\n0.9982550045083602\n\u001b[1m\u001b[34mresults\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.Dataset[org.apache.spark.sql.Row]\u001b[0m = [orig_channel: double, first_home_buyer: double ... 29 more fields]\n\u001b[1m\u001b[34mevaluator\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\u001b[0m = MulticlassClassificationEvaluator: uid=mcEval_62ee3ceb950d, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15\n\u001b[1m\u001b[34maccuracy\u001b[0m: \u001b[1m\u001b[32mDouble\u001b[0m = 0.9982550045083602\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-218421974", + "id": "20200712-043620_775095654", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:26:51+0000", + "dateFinished": "2020-07-13T02:27:20+0000", + "status": "FINISHED", + "$$hashKey": "object:11105" + }, + { + "title": "Example to save/load the model, predict with the model", + "text": "xgbClassificationModel.write.overwrite.save(modelPath)\n\nval modelFromDisk = XGBoostClassificationModel.load(modelPath)\n\nval (results2, _) = benchmark(\"transform2\") {\n modelFromDisk.transform(transSet)\n}\nz.show(results2.limit(5))", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:27:20+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": { + "1": { + "graph": { + "mode": "table", + "height": 300, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "orig_channel": "string", + "first_home_buyer": "string", + "loan_purpose": "string", + "property_type": "string", + "occupancy_status": "string", + "property_state": "string", + "product_type": "string", + "relocation_mortgage_indicator": "string", + "seller_name": "string", + "mod_flag": "string", + "orig_interest_rate": "string", + "orig_upb": "string", + "orig_loan_term": "string", + "orig_ltv": "string", + "orig_cltv": "string", + "num_borrowers": "string", + "dti": "string", + "borrower_credit_score": "string", + "num_units": "string", + "zip": "string", + "mortgage_insurance_percent": "string", + "current_loan_delinquency_status": "string", + "current_actual_upb": "string", + "interest_rate": "string", + "loan_age": "string", + "msa": "string", + 
"non_interest_bearing_upb": "string", + "delinquency_12": "string", + "rawPrediction": "string", + "probability": "string", + "prediction": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + }, + "title": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Elapsed time [transform2]: 0.058s\n" + }, + { + "type": "TABLE", + "data": "orig_channel\tfirst_home_buyer\tloan_purpose\tproperty_type\toccupancy_status\tproperty_state\tproduct_type\trelocation_mortgage_indicator\tseller_name\tmod_flag\torig_interest_rate\torig_upb\torig_loan_term\torig_ltv\torig_cltv\tnum_borrowers\tdti\tborrower_credit_score\tnum_units\tzip\tmortgage_insurance_percent\tcurrent_loan_delinquency_status\tcurrent_actual_upb\tinterest_rate\tloan_age\tmsa\tnon_interest_bearing_upb\tdelinquency_12\trawPrediction\tprobability\tprediction\n4.9E-324\t4.9E-324\t1.0E-323\t4.9E-324\t1.0E-323\t1.24E-322\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t2.75\t278000\t120\t56.0\t56.0\t1.0\t46.0\t624.0\t1\t295\t0.0\t0\t148441.15\t2.75\t13.0\t34820.0\t0.0\t0\t[5.001231670379639,-5.001231670379639]\t[0.9933153325691819,0.006684667430818081]\t0.0\n1.0E-323\t4.9E-324\t1.5E-323\t4.9E-324\t4.9E-324\t4.9E-324\tnull\t4.9E-324\t6.9E-323\t4.9E-324\t4.25\t579000\t360\t72.0\t72.0\t2.0\t44.0\t714.0\t1\t949\t0.0\t0\t568406.57\t4.25\t13.0\t41860.0\t0.0\t0\t[6.777693748474121,-6.777693748474121]\t[0.9988623971585184,0.0011376028414815664]\t0.0\n4.9E-324\t4.9E-324\t1.5E-323\t4.9E-324\t4.9E-324\t4.4E-323\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t4.0\t240000\t360\t80.0\t80.0\t1.0\t18.0\t820.0\t1\t282\t0.0\t0\t236132.18\t4.0\t10.0\t16740.0\t0.0\t0\t[7.609184741973877,-7.609184741973877]\t[0.999504369799979,4.956302000209689E-4]\t0.0\n1.0E-323\t4.9E-324\t1.0E-323\t4.9E-324\t4.9E-324\t1.04E-322\tnull\t4.9E-324\t3.0E-323\t4.9E-324\t3.0\t241000\t180\t44.0\t44.0\t2.0\t44.0\t787.0\t1\t650\t0.0\t0\t230092.59\t3.0\t9.0\t0.0\t0.0\t0\t[8.442628860473633,-8.442628860473633]\t[0.9997845634934492,2.1543650655075908E-4]\t0.0\n1.0E-323\t4.9E-324\t4.9E-324\t1.5E-323\t4.9E-324\t1.0E-323\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t4.25\t177000\t360\t75.0\t75.0\t2.0\t26.0\t792.0\t1\t787\t0.0\t0\t172387.22\t4.25\t18.0\t12420.0\t0.0\t0\t[8.08891773223877,-8.08891773223877]\t[0.9996931724308524,3.068275691475719E-4]\t0.0\n" + }, + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mmodelFromDisk\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel\u001b[0m = xgbc_2ce07ee0b6cb\n\u001b[1m\u001b[34mresults2\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 
29 more fields]\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_1907963406", + "id": "20200712-043620_1435219490", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:27:20+0000", + "dateFinished": "2020-07-13T02:27:23+0000", + "status": "FINISHED", + "$$hashKey": "object:11106" + }, + { + "user": "anonymous", + "dateUpdated": "2020-07-12T04:50:45+0000", + "config": { + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528930033_-558128424", + "id": "paragraph_1594528930033_-558128424", + "dateCreated": "2020-07-12T04:42:10+0000", + "status": "FINISHED", + "$$hashKey": "object:11107" + } + ], + "name": "mortgage-gpu-scala", + "id": "2FCHJHDT3", + "defaultInterpreterGroup": "spark", + "version": "0.9.0-preview1", + "noteParams": {}, + "noteForms": {}, + "angularObjects": {}, + "config": { + "isZeppelinNotebookCronEnable": false, + "looknfeel": "default", + "personalizedMode": "false" + }, + "info": { + "isRunning": true + }, + "path": "/mortgage-gpu-scala" +} \ No newline at end of file diff --git a/docs/demo/gpu-mortgage_accelerated.ipynb b/docs/demo/gpu-mortgage_accelerated.ipynb new file mode 100644 index 00000000000..2ce911b3a6b --- /dev/null +++ b/docs/demo/gpu-mortgage_accelerated.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","source":["%sh\n \nwget http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000.tgz -P /Users//\n \nmkdir -p /dbfs/FileStore/tables/mortgage\nmkdir -p /dbfs/FileStore/tables/mortgage_parquet_gpu/perf\nmkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/acq\nmkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/output\n \ntar xfvz /Users//mortgage_2000.tgz --directory /dbfs/FileStore/tables/mortgage\n"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["import time\nfrom pyspark import broadcast\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\nfrom pyspark.sql.types import *\n\ndef _get_quarter_from_csv_file_name():\n return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)\n\n_csv_perf_schema = StructType([\n StructField('loan_id', LongType()),\n StructField('monthly_reporting_period', StringType()),\n StructField('servicer', StringType()),\n StructField('interest_rate', DoubleType()),\n StructField('current_actual_upb', DoubleType()),\n StructField('loan_age', DoubleType()),\n StructField('remaining_months_to_legal_maturity', DoubleType()),\n StructField('adj_remaining_months_to_maturity', DoubleType()),\n StructField('maturity_date', StringType()),\n StructField('msa', DoubleType()),\n StructField('current_loan_delinquency_status', IntegerType()),\n StructField('mod_flag', StringType()),\n StructField('zero_balance_code', StringType()),\n StructField('zero_balance_effective_date', StringType()),\n StructField('last_paid_installment_date', StringType()),\n StructField('foreclosed_after', StringType()),\n StructField('disposition_date', StringType()),\n StructField('foreclosure_costs', DoubleType()),\n StructField('prop_preservation_and_repair_costs', DoubleType()),\n StructField('asset_recovery_costs', DoubleType()),\n 
StructField('misc_holding_expenses', DoubleType()),\n StructField('holding_taxes', DoubleType()),\n StructField('net_sale_proceeds', DoubleType()),\n StructField('credit_enhancement_proceeds', DoubleType()),\n StructField('repurchase_make_whole_proceeds', StringType()),\n StructField('other_foreclosure_proceeds', DoubleType()),\n StructField('non_interest_bearing_upb', DoubleType()),\n StructField('principal_forgiveness_upb', StringType()),\n StructField('repurchase_make_whole_proceeds_flag', StringType()),\n StructField('foreclosure_principal_write_off_amount', StringType()),\n StructField('servicing_activity_indicator', StringType())])\n_csv_acq_schema = StructType([\n StructField('loan_id', LongType()),\n StructField('orig_channel', StringType()),\n StructField('seller_name', StringType()),\n StructField('orig_interest_rate', DoubleType()),\n StructField('orig_upb', IntegerType()),\n StructField('orig_loan_term', IntegerType()),\n StructField('orig_date', StringType()),\n StructField('first_pay_date', StringType()),\n StructField('orig_ltv', DoubleType()),\n StructField('orig_cltv', DoubleType()),\n StructField('num_borrowers', DoubleType()),\n StructField('dti', DoubleType()),\n StructField('borrower_credit_score', DoubleType()),\n StructField('first_home_buyer', StringType()),\n StructField('loan_purpose', StringType()),\n StructField('property_type', StringType()),\n StructField('num_units', IntegerType()),\n StructField('occupancy_status', StringType()),\n StructField('property_state', StringType()),\n StructField('zip', IntegerType()),\n StructField('mortgage_insurance_percent', DoubleType()),\n StructField('product_type', StringType()),\n StructField('coborrow_credit_score', DoubleType()),\n StructField('mortgage_insurance_type', DoubleType()),\n StructField('relocation_mortgage_indicator', StringType())])\n\ndef read_perf_csv(spark, path):\n return spark.read.format('csv') \\\n .option('nullValue', '') \\\n .option('header', 'false') \\\n .option('delimiter', '|') \\\n .schema(_csv_perf_schema) \\\n .load(path) \\\n .withColumn('quarter', _get_quarter_from_csv_file_name())\n\ndef read_acq_csv(spark, path):\n return spark.read.format('csv') \\\n .option('nullValue', '') \\\n .option('header', 'false') \\\n .option('delimiter', '|') \\\n .schema(_csv_acq_schema) \\\n .load(path) \\\n .withColumn('quarter', _get_quarter_from_csv_file_name())\n\ndef _parse_dates(perf):\n return perf \\\n .withColumn('monthly_reporting_period', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy')) \\\n .withColumn('monthly_reporting_period_month', month(col('monthly_reporting_period'))) \\\n .withColumn('monthly_reporting_period_year', year(col('monthly_reporting_period'))) \\\n .withColumn('monthly_reporting_period_day', dayofmonth(col('monthly_reporting_period'))) \\\n .withColumn('last_paid_installment_date', to_date(col('last_paid_installment_date'), 'MM/dd/yyyy')) \\\n .withColumn('foreclosed_after', to_date(col('foreclosed_after'), 'MM/dd/yyyy')) \\\n .withColumn('disposition_date', to_date(col('disposition_date'), 'MM/dd/yyyy')) \\\n .withColumn('maturity_date', to_date(col('maturity_date'), 'MM/yyyy')) \\\n .withColumn('zero_balance_effective_date', to_date(col('zero_balance_effective_date'), 'MM/yyyy'))\n\ndef _create_perf_deliquency(spark, perf):\n aggDF = perf.select(\n col(\"quarter\"),\n col(\"loan_id\"),\n col(\"current_loan_delinquency_status\"),\n when(col(\"current_loan_delinquency_status\") >= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n 
when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n .groupBy(\"quarter\", \"loan_id\") \\\n .agg(\n max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n min(\"delinquency_30\").alias(\"delinquency_30\"),\n min(\"delinquency_90\").alias(\"delinquency_90\"),\n min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n .select(\n col(\"quarter\"),\n col(\"loan_id\"),\n (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n col(\"delinquency_30\"),\n col(\"delinquency_90\"),\n col(\"delinquency_180\"))\n joinedDf = perf \\\n .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n .join(aggDF, [\"loan_id\", \"quarter\"], \"left_outer\")\n\n # calculate the 12 month delinquency and upb values\n months = 12\n monthArray = [lit(x) for x in range(0, 12)]\n # explode on a small amount of data is actually slightly more efficient than a cross join\n testDf = joinedDf \\\n .withColumn(\"month_y\", explode(array(monthArray))) \\\n .select(\n col(\"quarter\"),\n floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n col(\"ever_30\"),\n col(\"ever_90\"),\n col(\"ever_180\"),\n col(\"delinquency_30\"),\n col(\"delinquency_90\"),\n col(\"delinquency_180\"),\n col(\"loan_id\"),\n col(\"month_y\"),\n col(\"delinquency_12\"),\n col(\"upb_12\")) \\\n .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n .drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n\n return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n .drop(\"timestamp_year\", \"timestamp_month\")\n\n_name_mapping = [\n (\"WITMER FUNDING, LLC\", \"Witmer\"),\n (\"WELLS FARGO CREDIT RISK TRANSFER SECURITIES TRUST 2015\", \"Wells Fargo\"),\n (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n (\"WELLS 
FARGO BANK, N.A.\" , \"Wells Fargo\"),\n (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n (\"USAA FEDERAL SAVINGS BANK\" , \"USAA\"),\n (\"UNITED SHORE FINANCIAL SERVICES, LLC D\\\\/B\\\\/A UNITED WHOLESALE MORTGAGE\" , \"United Seq(e\"),\n (\"U.S. BANK N.A.\" , \"US Bank\"),\n (\"SUNTRUST MORTGAGE INC.\" , \"Suntrust\"),\n (\"STONEGATE MORTGAGE CORPORATION\" , \"Stonegate Mortgage\"),\n (\"STEARNS LENDING, LLC\" , \"Stearns Lending\"),\n (\"STEARNS LENDING, INC.\" , \"Stearns Lending\"),\n (\"SIERRA PACIFIC MORTGAGE COMPANY, INC.\" , \"Sierra Pacific Mortgage\"),\n (\"REGIONS BANK\" , \"Regions\"),\n (\"RBC MORTGAGE COMPANY\" , \"RBC\"),\n (\"QUICKEN LOANS INC.\" , \"Quicken Loans\"),\n (\"PULTE MORTGAGE, L.L.C.\" , \"Pulte Mortgage\"),\n (\"PROVIDENT FUNDING ASSOCIATES, L.P.\" , \"Provident Funding\"),\n (\"PROSPECT MORTGAGE, LLC\" , \"Prospect Mortgage\"),\n (\"PRINCIPAL RESIDENTIAL MORTGAGE CAPITAL RESOURCES, LLC\" , \"Principal Residential\"),\n (\"PNC BANK, N.A.\" , \"PNC\"),\n (\"PMT CREDIT RISK TRANSFER TRUST 2015-2\" , \"PennyMac\"),\n (\"PHH MORTGAGE CORPORATION\" , \"PHH Mortgage\"),\n (\"PENNYMAC CORP.\" , \"PennyMac\"),\n (\"PACIFIC UNION FINANCIAL, LLC\" , \"Other\"),\n (\"OTHER\" , \"Other\"),\n (\"NYCB MORTGAGE COMPANY, LLC\" , \"NYCB\"),\n (\"NEW YORK COMMUNITY BANK\" , \"NYCB\"),\n (\"NETBANK FUNDING SERVICES\" , \"Netbank\"),\n (\"NATIONSTAR MORTGAGE, LLC\" , \"Nationstar Mortgage\"),\n (\"METLIFE BANK, NA\" , \"Metlife\"),\n (\"LOANDEPOT.COM, LLC\" , \"LoanDepot.com\"),\n (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2015-1\" , \"JP Morgan Chase\"),\n (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2014-1\" , \"JP Morgan Chase\"),\n (\"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION\" , \"JP Morgan Chase\"),\n (\"JPMORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n (\"JP MORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n (\"IRWIN MORTGAGE, CORPORATION\" , \"Irwin Mortgage\"),\n (\"IMPAC MORTGAGE CORP.\" , \"Impac Mortgage\"),\n (\"HSBC BANK USA, NATIONAL ASSOCIATION\" , \"HSBC\"),\n (\"HOMEWARD RESIDENTIAL, INC.\" , \"Homeward Mortgage\"),\n (\"HOMESTREET BANK\" , \"Other\"),\n (\"HOMEBRIDGE FINANCIAL SERVICES, INC.\" , \"HomeBridge\"),\n (\"HARWOOD STREET FUNDING I, LLC\" , \"Harwood Mortgage\"),\n (\"GUILD MORTGAGE COMPANY\" , \"Guild Mortgage\"),\n (\"GMAC MORTGAGE, LLC (USAA FEDERAL SAVINGS BANK)\" , \"GMAC\"),\n (\"GMAC MORTGAGE, LLC\" , \"GMAC\"),\n (\"GMAC (USAA)\" , \"GMAC\"),\n (\"FREMONT BANK\" , \"Fremont Bank\"),\n (\"FREEDOM MORTGAGE CORP.\" , \"Freedom Mortgage\"),\n (\"FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"Franklin America\"),\n (\"FLEET NATIONAL BANK\" , \"Fleet National\"),\n (\"FLAGSTAR CAPITAL MARKETS CORPORATION\" , \"Flagstar Bank\"),\n (\"FLAGSTAR BANK, FSB\" , \"Flagstar Bank\"),\n (\"FIRST TENNESSEE BANK NATIONAL ASSOCIATION\" , \"Other\"),\n (\"FIFTH THIRD BANK\" , \"Fifth Third Bank\"),\n (\"FEDERAL HOME LOAN BANK OF CHICAGO\" , \"Fedral Home of Chicago\"),\n (\"FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB\" , \"FDIC\"),\n (\"DOWNEY SAVINGS AND LOAN ASSOCIATION, F.A.\" , \"Downey Mortgage\"),\n (\"DITECH FINANCIAL LLC\" , \"Ditech\"),\n (\"CITIMORTGAGE, INC.\" , \"Citi\"),\n (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERFIRST MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERBANK MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n (\"CHASE HOME FINANCE, LLC\" , \"JP Morgan Chase\"),\n (\"CHASE HOME FINANCE FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"JP Morgan Chase\"),\n (\"CHASE HOME FINANCE (CIE 1)\" , \"JP Morgan 
Chase\"),\n (\"CHASE HOME FINANCE\" , \"JP Morgan Chase\"),\n (\"CASHCALL, INC.\" , \"CashCall\"),\n (\"CAPITAL ONE, NATIONAL ASSOCIATION\" , \"Capital One\"),\n (\"CALIBER HOME LOANS, INC.\" , \"Caliber Funding\"),\n (\"BISHOPS GATE RESIDENTIAL MORTGAGE TRUST\" , \"Bishops Gate Mortgage\"),\n (\"BANK OF AMERICA, N.A.\" , \"Bank of America\"),\n (\"AMTRUST BANK\" , \"AmTrust\"),\n (\"AMERISAVE MORTGAGE CORPORATION\" , \"Amerisave\"),\n (\"AMERIHOME MORTGAGE COMPANY, LLC\" , \"AmeriHome Mortgage\"),\n (\"ALLY BANK\" , \"Ally Bank\"),\n (\"ACADEMY MORTGAGE CORPORATION\" , \"Academy Mortgage\"),\n (\"NO CASH-OUT REFINANCE\" , \"OTHER REFINANCE\"),\n (\"REFINANCE - NOT SPECIFIED\" , \"OTHER REFINANCE\"),\n (\"Other REFINANCE\" , \"OTHER REFINANCE\")]\n\ndef _create_acquisition(spark, acq):\n nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n .drop(\"from_seller_name\") \\\n .withColumn(\"old_name\", col(\"seller_name\")) \\\n .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n .drop(\"to_seller_name\") \\\n .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n\ndef run_mortgage(spark, perf, acq):\n parsed_perf = _parse_dates(perf)\n perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n cleaned_acq = _create_acquisition(spark, acq)\n return perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\").drop(\"quarter\")"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["orig_perf_path='dbfs:///FileStore/tables/mortgage/perf/*'\norig_acq_path='dbfs:///FileStore/tables/mortgage/acq/*'\ntmp_perf_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/perf/'\ntmp_acq_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/acq/'\noutput_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/output/'\n\nspark.conf.set('spark.rapids.sql.enabled','true')\nspark.conf.set('spark.rapids.sql.explain', 'ALL')\nspark.conf.set('spark.rapids.sql.incompatibleOps.enabled', 'true')\nspark.conf.set('spark.rapids.sql.batchSizeBytes', '512M')\nspark.conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["# Lets transcode the data first\nstart = time.time()\n# we want a few big files instead of lots of small files\nspark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\nacq = read_acq_csv(spark, orig_acq_path)\nacq.repartition(12).write.parquet(tmp_acq_path, mode='overwrite')\nperf = read_perf_csv(spark, orig_perf_path)\nperf.coalesce(96).write.parquet(tmp_perf_path, mode='overwrite')\nend = time.time()\nprint(end - start)"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["# Now lets actually process the data\\n\",\nstart = time.time()\nspark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\nspark.conf.set('spark.sql.shuffle.partitions', '192')\nperf = spark.read.parquet(tmp_perf_path)\nacq = spark.read.parquet(tmp_acq_path)\nout = run_mortgage(spark, perf, acq)\nout.write.parquet(output_path, mode='overwrite')\nend = time.time()\nprint(end - start)\n"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":6}],"metadata":{"name":"gpu-mortgage_kr","notebookId":2710846968050572},"nbformat":4,"nbformat_minor":0} 
diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 00000000000..8cdc8d3c033 --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,15 @@ +--- +layout: page +title: Demos +nav_order: 4 +--- +# Demos + +Example notebooks allow users to test drive "RAPIDS Accelerator for Apache Spark" with public datasets. + +##### [Mortgage ETL Notebook](demo/gpu-mortgage_accelerated.ipynb) [(Dataset)](https://docs.rapids.ai/datasets/mortgage-data) + +##### About the Mortgage Dataset: +Dataset is derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. + +For the full raw dataset visit [Fannie Mae](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) to register for an account and to download. diff --git a/docs/get-started/getting-started-gcp.md b/docs/get-started/getting-started-gcp.md new file mode 100644 index 00000000000..f210134a55e --- /dev/null +++ b/docs/get-started/getting-started-gcp.md @@ -0,0 +1,115 @@ +--- +layout: page +title: GCP Dataproc +nav_order: 2 +parent: Getting-Started +--- + +# Getting started with RAPIDS Accelerator on GCP Dataproc + [Google Cloud Dataproc](https://cloud.google.com/dataproc) is Google Cloud's fully managed Apache Spark and Hadoop service. This guide will walk through the steps to show: + +* [How to spin up a Dataproc Cluster Accelerated by GPU](getting-started-gcp#how-to-spin-up-a-dataproc-cluster-accelerated-by-gpu) +* [Run a sample Pyspark or Scala ETL and XGBoost training Notebooks on a Dataproc Cluster Accelerated by GPU](getting-started-gcp#run-pyspark-and-scala-notebook-a-dataproc-cluster-accelerated-by-gpu) +* [Submit the same sample ETL application as a Spark job to a Dataproc Cluster Accelerated by GPU](getting-started-gcp#submit-spark-jobs-to-a-dataproc-cluster-accelerated-by-gpu) + + + +## How to spin up a Dataproc Cluster Accelerated by GPU + + You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud SDK](https://cloud.google.com/sdk/install) on your laptop. From the Cloud Shell, users will need to enable services within your project. Enable the Compute and Dataproc APIs in order to access Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your data. This may take several minutes. +```bash +gcloud services enable compute.googleapis.com +gcloud services enable dataproc.googleapis.com +gcloud services enable storage-api.googleapis.com +``` + +After command line environment is setup, log in to your GCP account. We can now create a Dataproc cluster with configuration mentioned below. +The configuration will allow users to run any of the [notebooks demo](../demo/GCP) on GCP. Alternatively, users can also start a 2*2T4 worker nodes. 
+* [GPU Driver](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/gpu) and [RAPIDS Acclerator for Apache Spark](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids) through initialization actions +* One 8-core master node and 5 32-core worker nodes +* Four NVIDIA T4 to each worker nodes +* [Local SSDs](https://cloud.google.com/dataproc/docs/concepts/compute/dataproc-local-ssds) is recommended to improve IO for Spark scratch places +* Component gateway enabled for accessing Web UIs hosted on the cluster +* Configuration for [GPU scheduling and isolation](/get-started/yarn-gpu.html) + + +```bash + export REGION=[Your Prefer GCP Region] + export GCS_BUCKET=[Your GCS Bucket] + export CLUSTER_NAME=[Your Cluster Name] + export NUM_GPUS=4 + export NUM_WORKERS=5 + +gcloud dataproc clusters create $CLUSTER_NAME \ + --region $REGION \ + --image-version=preview-ubuntu \ + --master-machine-type n1-standard-16 \ + --num-workers $NUM_WORKERS \ + --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \ + --worker-machine-type n1-highmem-32\ + --num-worker-local-ssds 4 \ + --initialization-actions gs://dataproc-initialization-actions/gpu/install_gpu_driver.sh,gs://dataproc-initialization-actions/rapids/rapids.sh \ + --optional-components=ANACONDA,JUPYTER,ZEPPELIN \ + --metadata gpu-driver-provider="NVIDIA" \ + --metadata rapids-runtime=SPARK \ + --bucket $GCS_BUCKET \ + --enable-component-gateway \ + --properties="^#^spark:spark.yarn.unmanagedAM.enabled=false"` +``` +This may take around 5-15 minutes to complete. You can navigate to Dataproc clusters tab in the Google Cloud Console to see the progress. + +![Dataproc Cluster](../img/dataproc-cluster.png) + +## Run Pyspark and Scala Notebook a Dataproc Cluster Accelerated by GPU +To use notebooks with Dataproc cluster, click on the cluster name under Dataproc cluster tab and navigate to the "Web Interfaces" Tab. Under the "Web Interfaces", click on JupyterLab or Jupyter link to start to use sample [Mortgage ETL on GPU Jupyter Notebook](../demo/GCP/Mortgage-ETL-GPU.ipynb) to process full 17 years [Mortgage data](https://rapidsai.github.io/demos/datasets/mortgage-data). + +![Dataproc Web Interfaces](../img/dataproc-service.png) + +The notebook will first transcode CSV files into Parquet Files and then run a ETL query to prepare the dataset for Training. In the sample notebook, we use 2016 data as evaluation set and the rest as training set, saving to respective GCS location. +First stage with default configuration in notebook should take ~110 seconds (1/3 of CPU execution time with same config) whereas second stage takes ~170 seconds (1/7 of CPU execution time with same config). The notebook depends on pre-compiled [Spark RAPIDS SQL plugin](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark-parent) and [cuDF](https://mvnrepository.com/artifact/ai.rapids/cudf/0.14), which pre-downloaded by GCP Dataproc [RAPIDS init script](). + +Once data is prepared, we use [Mortgage XGBoost4j Scala Notebook](../demo/GCP/mortgage-xgboost4j-gpu-scala.zpln) in Dataproc Zeppelin service to execute the training job on GPU. NVIDIA Spark team also ship [Spark XGBoost4j](https://github.com/NVIDIA/spark-xgboost) which is based on [DMLC xgboost](https://github.com/dmlc/xgboost). 
Precompiled [XGBoost4j]() and [XGBoost4j Spark](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/) library could be downloaded from maven, it is pre downloaded by GCP [RAPIDS init action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids). Since GITHUB cannot render zeppelin notebook, we prepared a [Jupyter Notebook with Scala code](../demo/GCP/mortgage-xgboost4j-gpu-scala.ipynb) for you to view code content. + +The training time should be around 480 seconds (1/10 of CPU execution time with same config). Which is shown under cell: +```scala +// Start training +println("\n------ Training ------") +val (xgbClassificationModel, _) = benchmark("train") { + xgbClassifier.fit(trainSet) +} +``` + +## Submit Spark jobs to a Dataproc Cluster Accelerated by GPU +Similar to spark-submit for on-prem clusters, Dataproc supports a Spark applicaton job to be submitted as a dataproc job. The mortgage examples we use above is also available as [spark application](https://github.com/NVIDIA/spark-xgboost-examples/tree/spark-3/examples/apps/scala). After [build the jar files](https://github.com/NVIDIA/spark-xgboost-examples/blob/spark-3/getting-started-guides/building-sample-apps/scala.md) through maven `mvn package -Dcuda.classifier=cuda10-2` + +Then place the jar file `sample_xgboost_apps-0.2.2.jar` under the `gs://$GCS_BUCKET/scala/` folder by `gsutil cp target/sample_xgboost_apps-0.2.2.jar gs://$GCS_BUCKET/scala/`. To do this you can either drag and drop files from your local machine into the GCP storage browser, or use the gsutil cp as shown before to do this from a command line. In the end, we can thereby submit the jar by: +```bash +export GCS_BUCKET= +export CLUSTER_NAME= +export REGION= +export SPARK_NUM_EXECUTORS=20 +export SPARK_EXECUTOR_MEMORY=20G +export SPARK_EXECUTOR_MEMORYOVERHEAD=16G +export SPARK_NUM_CORES_PER_EXECUTOR=7 +export DATA_PATH=gs://${GCS_BUCKET}/mortgage_full + +gcloud dataproc jobs submit spark \ + --cluster=$CLUSTER_NAME \ + --region=$REGION \ + --class=com.nvidia.spark.examples.mortgage.GPUMain \ + --jars=gs://${GCS_BUCKET}/scala/sample_xgboost_apps-0.2.2.jar \ + --properties=spark.executor.cores=${SPARK_NUM_CORES_PER_EXECUTOR},spark.task.cpus=${SPARK_NUM_CORES_PER_EXECUTOR},spark.executor.memory=${SPARK_EXECUTOR_MEMORY},spark.executor.memoryOverhead=${SPARK_EXECUTOR_MEMORYOVERHEAD},spark.executor.resource.gpu.amount=1,spark.task.resource.gpu.amount=1,spark.rapids.sql.hasNans=false,spark.rapids.sql.batchSizeBytes=512M,spark.rapids.sql.reader.batchSizeBytes=768M,spark.rapids.sql.variableFloatAgg.enabled=true,spark.rapids.memory.gpu.pooling.enabled=false \ + -- \ + -dataPath=train::${DATA_PATH}/train \ + -dataPath=trans::${DATA_PATH}/test \ + -format=parquet \ + -numWorkers=${SPARK_NUM_EXECUTORS} \ + -treeMethod=gpu_hist \ + -numRound=100 \ + -maxDepth=8 +``` + +## Dataproc Hub in AI Platform Notebook to Dataproc cluster +With the integration between AI Platform Notebooks and Dataproc. Users can create a [Dataproc Hub notebook](https://cloud.google.com/blog/products/data-analytics/administering-jupyter-notebooks-for-spark-workloads-on-dataproc) from AI platform will can connect to Dataproc cluster through a yaml configuration. + +In future, user will be able to provision a dataproc cluster through DataprocHub notebook. Please use example [pyspark notebooks](../demo/GCP/Mortgage-ETL-GPU.ipynb) to experiment. 
diff --git a/docs/get-started/getting-started-menu.md b/docs/get-started/getting-started-menu.md new file mode 100644 index 00000000000..b1e60faf3a0 --- /dev/null +++ b/docs/get-started/getting-started-menu.md @@ -0,0 +1,57 @@ +--- +layout: page +title: Getting-Started +nav_order: 2 +has_children: true +permalink: /Getting-Started/ +--- +# Getting Started with the RAPIDS Accelerator for Apache Spark + +Apache Spark 3.0+ lets users provide a plugin that can replace the backend for SQL and DataFrame +operations. This requires no API changes from the user. The plugin will replace SQL operations it +supports with GPU accelerated versions. If an operation is not supported it will fall back to using +the Spark CPU version. Note that the plugin cannot accelerate operations that manipulate RDDs +directly. + +The accelerator library also provides an implementation of Spark's shuffle that can leverage +[UCX](https://www.openucx.org/) to optimize GPU data transfers keeping as much data on the GPU as +possible and bypassing the CPU to do GPU to GPU transfers. + +The GPU accelerated processing plugin does not require the accelerated shuffle implementation. +However, if accelerated SQL processing is not enabled, the shuffle implementation falls back to the +default `SortShuffleManager`. + +To enable GPU processing acceleration you will need: +- Apache Spark 3.0+ +- A spark cluster configured with GPUs that comply with the requirements for the version of + [cudf](https://github.com/rapidsai/cudf). + - One GPU per executor. +- Add the following jars: + - A cudf jar that corresponds to the version of CUDA available on your cluster. + - RAPIDS Spark accelerator plugin jar. +- Set the config `spark.plugins` to `com.nvidia.spark.SQLPlugin` + +## Spark GPU Scheduling Overview +Apache Spark 3.0 now supports GPU scheduling as long as you are using a cluster manager that +supports it. You can have Spark request GPUs and assign them to tasks. The exact configs you use +will vary depending on your cluster manager. Here are a few of the configs: +- Request your executor to have GPUs: + - `--conf spark.executor.resource.gpu.amount=1` +- Specify the number of GPUs per task: + - `--conf spark.task.resource.gpu.amount=1` +- Specify a GPU discovery script (required on YARN and K8S): + - `--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh` + +See the deployment specific sections for more details and restrictions. Note that +`spark.task.resource.gpu.amount` can be a decimal amount, so if you want multiple tasks to be run +on an executor at the same time and assigned to the same GPU you can set this to a decimal value +less than 1. You would want this setting to correspond to the `spark.executor.cores` setting. For +instance, if you have `spark.executor.cores=2` which would allow 2 tasks to run on each executor +and you want those 2 tasks to run on the same GPU then you would set +`spark.task.resource.gpu.amount=0.5`. + +You can also refer to the official Apache Spark documentation. 
+- [Overview](https://github.com/apache/spark/blob/master/docs/configuration.md#custom-resource-scheduling-and-configuration-overview) +- [Kubernetes specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-kubernetes.md#resource-allocation-and-configuration-overview) +- [Yarn specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-yarn.md#resource-allocation-and-configuration-overview) +- [Standalone specific documentation](https://github.com/apache/spark/blob/master/docs/spark-standalone.md#resource-allocation-and-configuration-overview) \ No newline at end of file diff --git a/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md b/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md new file mode 100644 index 00000000000..18bf7b0f8ca --- /dev/null +++ b/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md @@ -0,0 +1,80 @@ +--- +layout: page +title: Databricks +nav_order: 3 +parent: Getting-Started +--- + +# Getting started with RAPIDS Accelerator on Databricks +This guide will run through how to set up the RAPIDS Accelerator for Apache Spark 3.0 on Databricks. At the end of this guide, the reader will be able to run a sample Apache Spark application that runs on NVIDIA GPUs on Databricks. + +## Prerequisites +* Apache Spark 3.0 running in DataBricks Runtime 7.0 ML with GPU + * AWS: 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12) + * Azure: 7.0 ML (GPU, Scala 2.12, Spark 3.0.0) + +The number of GPUs per node dictates the number of Spark executors that can run in that node. + +## Start a Databricks Cluster +Create a Databricks cluster by going to Clusters, then clicking “+ Create Cluster”. Ensure the cluster meets the prerequisites above by configuring it as follows: +1. On AWS, make sure to use 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12), or for Azure, choose 7.0 ML (GPU, Scala 2.12, Spark 3.0.0). +2. Under Autopilot Options, disable auto scaling. +3. Choose the number of workers that matches the number of GPUs you want to use. +4. Select a worker type. On AWS, use nodes with 1 GPU each such as p3.xlarge or g4dn.xlarge. p2 nodes do not meet the architecture requirements for the Spark worker (although they can be used for the driver node). For Azure, choose GPU nodes such as Standard_NC6s_v3 +5. Select the driver type. Generally this can be set to be the same as the worker. + +## Advance Cluster Configuration + +We will need to create an initialization script for the cluster that installs the RAPIDS jars to the cluster. + +1. To create the initialization script, import the initialization script notebook from the repo [generate-init-script.ipynb](../demo/Databricks/) to your workspace. See [Managing Notebooks](https://docs.databricks.com/user-guide/notebooks/notebook-manage.html) on how to import a notebook, then open the notebook. +2. Once you are in the notebook, click the “Run All” button. +3. Ensure that the newly created init.sh script is present in the output from cell 2 and that the contents of the script are correct.. +4. Go back and edit your cluster to configure it to use the init script. To do this, click the “Clusters” button on the left panel, then select your cluster. +5. Click the “Edit” button, then navigate down to the “Advanced Options” section. 
+Select the “Init Scripts” tab in the advanced options section, and paste the initialization script: `dbfs:/databricks/init_scripts/init.sh`, then click “Add” + +![Init Script](../img/initscript.png) + +6. Now select the “Spark” tab, and paste the following config options into the Spark Config section: + ```bash + spark.plugins com.nvidia.spark.SQLPlugin + spark.sql.parquet.filterPushdown false + spark.rapids.sql.incompatibleOps.enabled true + spark.rapids.memory.pinnedPool.size 2G + spark.locality.wait 0s + spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2 + spark.executor.extraJavaOptions "-Dai.rapids.cudf.prefer-pinned=true" + ``` + +![Spark Config](../img/sparkconfig.png) + +7. Once you’ve added the Spark config, click “Confirm and Restart”. +8. Once the cluster comes back up, it is now enabled for GPU-accelerated Spark with RAPIDS and cuDF. + +## Import the GPU Mortgage Example Notebook +Import the example [notebook](../demo/gpu-mortgage_accelerated.ipynb) from the repo into your workspace, then open the notebook. +Modify the first cell to point to your workspace, and download a larger dataset if needed. You can find the links to the datasets at [docs.rapids.ai](https://docs.rapids.ai/datasets/mortgage-data) + +```bash +%sh + +wget http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000.tgz -P /Users// + +mkdir -p /dbfs/FileStore/tables/mortgage +mkdir -p /dbfs/FileStore/tables/mortgage_parquet_gpu/perf +mkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/acq +mkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/output + +tar xfvz /Users//mortgage_2000.tgz --directory /dbfs/FileStore/tables/mortgage +``` + +In Cell 3, update the data paths if necessary. The example notebook merges the columns and prepares the data for XGoost training. The temp and final output results are written back to the dbfs +```bash +orig_perf_path='dbfs:///FileStore/tables/mortgage/perf/*' +orig_acq_path='dbfs:///FileStore/tables/mortgage/acq/*' +tmp_perf_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/perf/' +tmp_acq_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/acq/' +output_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/output/' +``` +Run the notebook by clicking “Run All” diff --git a/docs/getting-started.md b/docs/get-started/getting-started.md similarity index 83% rename from docs/getting-started.md rename to docs/get-started/getting-started.md index 0da71c9c7a9..43bf17ef79d 100644 --- a/docs/getting-started.md +++ b/docs/get-started/getting-started.md @@ -1,42 +1,21 @@ --- -layout: default -title: Getting Started -nav_order: 2 +layout: page +title: On-Prem +nav_order: 1 +parent: Getting-Started --- +# Getting Started with RAPIDS Accelerator with on premise cluster or local mode +## Spark Deployment Methods +The way you decide to deploy Spark affects the steps you must take to install and setup Spark and +the RAPIDS Accelerator for Apache Spark. The primary methods of deploy Spark are: +- Local mode - this is for dev/testing only, not for production +- Standalone Mode +- On a YARN cluster +- On a Kubernetes cluster -# Getting Started with the RAPIDS Accelerator for Apache Spark - -## Overview -The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the -[RAPIDS libraries](http://rapids.ai). - -Apache Spark 3.0+ lets users provide a plugin that can replace the backend for SQL and DataFrame -operations. This requires no API changes from the user. 
The plugin will replace SQL operations it -supports with GPU accelerated versions. If an operation is not supported it will fall back to using -the Spark CPU version. Note that the plugin cannot accelerate operations that manipulate RDDs -directly. - -The accelerator library also provides an implementation of Spark's shuffle that can leverage -[UCX](https://www.openucx.org/) to optimize GPU data transfers keeping as much data on the GPU as -possible and bypassing the CPU to do GPU to GPU transfers. - -The GPU accelerated processing plugin does not require the accelerated shuffle implementation. -However, if accelerated SQL processing is not enabled, the shuffle implementation falls back to the -default `SortShuffleManager`. - -To enable GPU processing acceleration you will need: -- Apache Spark 3.0+ -- A spark cluster configured with GPUs that comply with the requirements for the version of - [cudf](https://github.com/rapidsai/cudf). - - One GPU per executor. -- Add the following jars: - - A cudf jar that corresponds to the version of CUDA available on your cluster. - - RAPIDS Spark accelerator plugin jar. -- Set the config `spark.plugins` to `com.nvidia.spark.SQLPlugin` - -## Prerequisites -Each node where you are running Spark needs to have the following installed. If you are running +## Apache Spark Setup for GPU +Each GPU node where you are running Spark needs to have the following installed. If you are running with Docker on Kubernetes then skip these as you will do this as part of the docker build. - Install Java 8 - note jdk11 is supported by Spark, but we have been building and testing with jdk8, so we suggest using that for now. @@ -53,39 +32,6 @@ with Docker on Kubernetes then skip these as you will do this as part of the doc - `sudo apt-get update` - `sudo apt-get -y install cuda` -## Spark GPU Scheduling Overview -Apache Spark 3.0 now supports GPU scheduling as long as you are using a cluster manager that -supports it. You can have Spark request GPUs and assign them to tasks. The exact configs you use -will vary depending on your cluster manager. Here are a few of the configs: -- Request your executor to have GPUs: - - `--conf spark.executor.resource.gpu.amount=1` -- Specify the number of GPUs per task: - - `--conf spark.task.resource.gpu.amount=1` -- Specify a GPU discovery script (required on YARN and K8S): - - `--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh` - -See the deployment specific sections for more details and restrictions. Note that -`spark.task.resource.gpu.amount` can be a decimal amount, so if you want multiple tasks to be run -on an executor at the same time and assigned to the same GPU you can set this to a decimal value -less than 1. You would want this setting to correspond to the `spark.executor.cores` setting. For -instance, if you have `spark.executor.cores=2` which would allow 2 tasks to run on each executor -and you want those 2 tasks to run on the same GPU then you would set -`spark.task.resource.gpu.amount=0.5`. - -You can also refer to the official Apache Spark documentation. 
-- [Overview](https://github.com/apache/spark/blob/master/docs/configuration.md#custom-resource-scheduling-and-configuration-overview) -- [Kubernetes specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-kubernetes.md#resource-allocation-and-configuration-overview) -- [Yarn specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-yarn.md#resource-allocation-and-configuration-overview) -- [Standalone specific documentation](https://github.com/apache/spark/blob/master/docs/spark-standalone.md#resource-allocation-and-configuration-overview) - -## Spark Deployment Methods -The way you decide to deploy Spark affects the steps you must take to install and setup Spark and -the RAPIDS Accelerator for Apache Spark. The primary methods of deploy Spark are: -- Local mode - this is for dev/testing only, not for production -- Standalone Mode -- On a YARN cluster -- On a Kubernetes cluster - Below are sections on installing Spark and the RAPIDS Accelerator on a single node, you may want to read the deployment method sections before doing any installations. @@ -96,8 +42,8 @@ scala version 2.12 is currently supported by the accelerator. ## Download the RAPIDS jars The [accelerator](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark_2.12) and -[cudf](https://mvnrepository.com/artifact/ai.rapids/cudf) jars are available in -[maven central](https://mvnrepository.com/search?q=ai.rapids) +[cudf](https://mvnrepository.com/artifact/ai.rapids/cudf) jars are available in the +[download](/docs/version/stable-release#download) section. Download the RAPIDS Accelerator for Apache Spark plugin jar. Then download the version of the cudf jar that your version of the accelerator depends on. Each cudf jar is for a specific version of @@ -132,7 +78,7 @@ directory as the plugin jars (`/opt/sparkRapidsPlugin` in the example). This is for testing/dev setup only. It is not to be used in production. In this mode Spark runs everything in a single process on a single node. - [Install Spark](#install-spark) -- [Install the RAPIDS jars](#install-the-rapids-jars) +- [Install the RAPIDS jars](#download-the-rapids-jars) - Launch your Spark shell session Default configs usually work fine in local mode. The required changes are setting the config @@ -164,7 +110,7 @@ Spark Standalone mode requires starting the Spark master and worker(s). You can machine or multiple machines for distributed setup. The first step is to [Install Spark](#install-spark), the -[RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the +[RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on all the nodes you want to use. After that choose one of the nodes to be your master node and start the master. Note that the master process does **not** need a GPU to function properly. @@ -227,7 +173,7 @@ $SPARK_HOME/bin/spark-shell \ ## Running on YARN YARN requires you to [Install Spark](#install-spark), the -[RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the +[RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on a launcher node. YARN handles shipping them to the cluster nodes as needed. If you want to use the GPU scheduling feature in Spark it requires YARN version >= 2.10 or >= 3.1.1 and ideally you would use >= 3.1.3 in order to @@ -249,7 +195,7 @@ use - either 3.x or 2.x. 
- Configure YARN to support [GPU scheduling and isolation](https://hadoop.apache.org/docs/r3.1.3/hadoop-yarn/hadoop-yarn-site/UsingGpus.html). - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configuration settings when running Spark on YARN, changing the amounts as @@ -278,7 +224,7 @@ $SPARK_HOME/bin/spark-shell \ - Configure YARN to support [GPU scheduling and isolation](https://hadoop.apache.org/docs/r2.10.0/hadoop-yarn/hadoop-yarn-site/ResourceProfiles.html) - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configs when running Spark on YARN, changing the amounts as necessary: @@ -311,7 +257,7 @@ accessing a GPU at once. Note it does not matter if GPU scheduling support is en - For each GPU index set it to `EXCLUSIVE_PROCESS` mode: - `nvidia-smi -c EXCLUSIVE_PROCESS -i $index` - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configs when running Spark on YARN. Note that we are configuring a resource @@ -349,7 +295,7 @@ This assumes you have Kubernetes already installed and setup. These instruction to setup a Kubernetes cluster. - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are going to build your Docker image. Note that you can download these into a local directory and untar the Spark `.tar.gz` rather than installing into a location on the machine. @@ -382,7 +328,7 @@ $SPARK_HOME/bin/spark-shell \ ``` ## RAPIDS Accelerator Configuration and Tuning -Most of what you need you can get from [tuning guide](./tuning-guide.md). +Most of what you need you can get from [tuning guide](../tuning-guide). The following configs will help you to get started but must be configured based on your cluster and application. @@ -427,7 +373,7 @@ operation “count at ...”, you should see the graph of Spark Execs and some o the label Gpu... For instance, in the screenshot below you will see `GpuRowToColumn`, `GpuFilter`, and `GpuColumnarExchange`. Those correspond to operations that run on the GPU. -![Join Example on Spark SQL UI](img/join-sql-ui-example.png) +![Join Example on Spark SQL UI](../img/join-sql-ui-example.png) ## Advanced Configuration diff --git a/docs/get-started/yarn-gpu.md b/docs/get-started/yarn-gpu.md new file mode 100644 index 00000000000..462b9dc593b --- /dev/null +++ b/docs/get-started/yarn-gpu.md @@ -0,0 +1,120 @@ +--- +layout: page +title: yarn-gpu +nav_exclude: true +--- + +## Spark3 GPU Configuration Guide on Yarn 3.2.1 + +The following files are recommended to be configured to enable GPU scheduling on YARN 3.2.1 and later.
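+ +Spark discovers GPUs on a node by running the discovery script configured below, which is expected to print the available GPU addresses as JSON. Before editing the files, it can help to confirm that the node actually exposes its GPUs; the snippet below is a minimal sanity check, assuming the NVIDIA driver is installed and `nvidia-smi` is on the PATH, and is not part of the official configuration steps. + +```bash +# Hypothetical sanity check (not part of the configuration below): list the +# GPU indexes this node exposes. The discovery script configured below should +# report the same addresses in its JSON output. +nvidia-smi --query-gpu=index --format=csv,noheader +```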
+ +GPU resource discovery script - `/usr/lib/spark/scripts/gpu/getGpusResources.sh`: +```bash +mkdir -p /usr/lib/spark/scripts/gpu/ +cd /usr/lib/spark/scripts/gpu/ +wget https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh +chmod a+rwx -R /usr/lib/spark/scripts/gpu/ +``` + +Spark config - `/etc/spark/conf/spark-defaults.conf`: +```bash +spark.rapids.sql.concurrentGpuTasks=2 +spark.executor.resource.gpu.amount=1 +spark.executor.cores=8 +spark.task.cpus=1 +spark.task.resource.gpu.amount=0.125 +spark.rapids.memory.pinnedPool.size=2G +spark.executor.memoryOverhead=2G +spark.plugins=com.nvidia.spark.SQLPlugin +spark.executor.extraJavaOptions='-Dai.rapids.cudf.prefer-pinned=true' +spark.locality.wait=0s +spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh # this must match the location of the discovery script +spark.sql.shuffle.partitions=40 +spark.sql.files.maxPartitionBytes=512m +``` + +Yarn Scheduler config - `/etc/hadoop/conf/capacity-scheduler.xml`: +```xml +<configuration> + <property> + <name>yarn.scheduler.capacity.resource-calculator</name> + <value>org.apache.hadoop.yarn.util.resource.DominantResourceCalculator</value> + </property> +</configuration> +``` + +Yarn config - `/etc/hadoop/conf/yarn-site.xml`: +```xml +<configuration> + <property> + <name>yarn.nodemanager.resource-plugins</name> + <value>yarn.io/gpu</value> + </property> + <property> + <name>yarn.resource-types</name> + <value>yarn.io/gpu</value> + </property> + <property> + <name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name> + <value>auto</value> + </property> + <property> + <name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name> + <value>/usr/bin</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.mount</name> + <value>true</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.mount-path</name> + <value>/sys/fs/cgroup</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.hierarchy</name> + <value>yarn</value> + </property> + <property> + <name>yarn.nodemanager.container-executor.class</name> + <value>org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.group</name> + <value>yarn</value> + </property> +</configuration> +``` + +`/etc/hadoop/conf/container-executor.cfg` - use `yarn` as the service account: +```bash +yarn.nodemanager.linux-container-executor.group=yarn + +#--Original container-executor.cfg Content-- + +[gpu] +module.enabled=true +[cgroups] +root=/sys/fs/cgroup +yarn-hierarchy=yarn +``` + +The node manager local directories need to be accessible to all users; run the following in bash: +```bash +chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct +chmod a+rwx -R /sys/fs/cgroup/devices +local_dirs=$(bdconfig get_property_value \ + --configuration_file /etc/hadoop/conf/yarn-site.xml \ + --name yarn.nodemanager.local-dirs 2>/dev/null) +mod_local_dirs=${local_dirs//\,/ } +chmod a+rwx -R ${mod_local_dirs} +``` + +Finally, restart the node manager and resource manager services: +On all workers: +```bash +sudo systemctl restart hadoop-yarn-nodemanager.service +``` +On all masters: +```bash +sudo systemctl restart hadoop-yarn-resourcemanager.service +``` diff --git a/docs/img/dataproc-cluster.png b/docs/img/dataproc-cluster.png new file mode 100644 index 00000000000..87f3ea40913 Binary files /dev/null and b/docs/img/dataproc-cluster.png differ diff --git a/docs/img/dataproc-service.png b/docs/img/dataproc-service.png new file mode 100644 index 00000000000..79899dc848f Binary files /dev/null and b/docs/img/dataproc-service.png differ diff --git a/docs/img/ease-of-use.png b/docs/img/ease-of-use.png new file mode 100644 index 00000000000..e39c4e4728c Binary files /dev/null and b/docs/img/ease-of-use.png differ diff --git a/docs/img/initscript.png b/docs/img/initscript.png new file mode 100644 index 00000000000..01111a5235a Binary files /dev/null and b/docs/img/initscript.png differ diff --git
a/docs/img/perf-cost.png b/docs/img/perf-cost.png new file mode 100644 index 00000000000..1b004d10fb9 Binary files /dev/null and b/docs/img/perf-cost.png differ diff --git a/docs/img/spark3cluster.png b/docs/img/spark3cluster.png new file mode 100644 index 00000000000..73050c63451 Binary files /dev/null and b/docs/img/spark3cluster.png differ diff --git a/docs/img/sparkconfig.png b/docs/img/sparkconfig.png new file mode 100644 index 00000000000..fe1344eb1e4 Binary files /dev/null and b/docs/img/sparkconfig.png differ diff --git a/docs/index.md b/docs/index.md index aed4afce6d9..e4c30e20749 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,29 @@ nav_order: 1 permalink: / description: This site serves as a collection of documentation about the RAPIDS accelerator for Apache Spark --- +# Overview +The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the +[RAPIDS libraries](http://rapids.ai). + As data scientists shift from using traditional analytics to leveraging AI applications that better model complex market demands, traditional CPU-based processing can no longer keep up without compromising either speed or cost. The growing adoption of AI in analytics has created the need for a new framework to process data quickly and cost efficiently with GPUs. The RAPIDS Accelerator for Apache Spark combines the power of the RAPIDS cuDF library and the scale of the Spark distributed computing framework. The RAPIDS Accelerator library also has a built-in accelerated shuffle based on UCX that can be configured to leverage GPU-to-GPU communication and RDMA capabilities. +## Performance & Cost Benefits +The RAPIDS Accelerator for Apache Spark reaps the benefits of GPU performance while saving infrastructure costs. +![Perf-cost](/img/perf-cost.png) +*ETL for the Fannie Mae Mortgage Dataset (~200GB) as shown in our [demo](https://databricks.com/session_na20/deep-dive-into-gpu-support-in-apache-spark-3-x). Costs based on Cloud T4 GPU instance market price & V100 GPU price on Databricks Standard edition.* + + +## Ease of Use +Run your existing Apache Spark applications with no code changes. Learn more on how to [get started](/Getting-Started/). + +`spark.conf.set('spark.rapids.sql.enabled','true')` + +![ease-of-use](/img/ease-of-use.png) + +## A unified AI framework for ETL + ML/DL +A single pipeline, from ingest to data preparation to model training. +![spark3cluster](/img/spark3cluster.png) + + diff --git a/docs/version/stable-release.md b/docs/version/stable-release.md new file mode 100644 index 00000000000..19726294cd5 --- /dev/null +++ b/docs/version/stable-release.md @@ -0,0 +1,36 @@ +--- +layout: page +title: Stable Version +nav_order: 1 +parent: Version +--- + +## Stable Release - v0.1.0 +This is the first public release of the RAPIDS Accelerator for Apache Spark.
+The list of supported operations is provided [here](../configs.html#supported-gpu-operators-and-fine-tuning). + +Hardware Requirements: + + GPU Architecture: NVIDIA Pascal™ or better (tested on V100 and T4 GPUs) + +Software Requirements: + + OS: Ubuntu 16.04 & gcc 5.4 OR Ubuntu 18.04/CentOS 7 & gcc 7.3 + (RHEL 7 support is provided through CentOS 7 builds/installs) + + CUDA & NVIDIA Drivers: 10.1.2 & v418.87+ or 10.2 & v440.33+ + + Apache Spark 3.0 + + Apache Hadoop 2.10+ or 3.1.1+ (3.1.1 for nvidia-docker version 2) + + Python 3.x, Scala 2.12, Java 8 + + +## Download +* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0/rapids-4-spark_2.12-0.1.0.jar) +* [cuDF 0.14 Package for CUDA 10.2](https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-2.jar) +* [cuDF 0.14 Package for CUDA 10.1](https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar) + + + diff --git a/docs/version/version.md b/docs/version/version.md new file mode 100644 index 00000000000..fb5c5b62d7c --- /dev/null +++ b/docs/version/version.md @@ -0,0 +1,7 @@ +--- +layout: page +title: Version +nav_order: 10 +has_children: true +permalink: /Version/ +---