diff --git a/docs/demo/Databricks/generate-init-script.ipynb b/docs/demo/Databricks/generate-init-script.ipynb new file mode 100644 index 00000000000..bad856791ce --- /dev/null +++ b/docs/demo/Databricks/generate-init-script.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","source":["dbutils.fs.mkdirs(\"dbfs:/databricks/init_scripts/\")\n \ndbutils.fs.put(\"/databricks/init_scripts/init.sh\",\"\"\"\n#!/bin/bash\nsudo wget -O /databricks/jars/rapids-4-spark_2.12-0.1.0-databricks.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0-databricks/rapids-4-spark_2.12-0.1.0-databricks.jar\nsudo wget -O /databricks/jars/cudf-0.14-cuda10-1.jar https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar\"\"\", True)"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["%sh\ncd ../../dbfs/databricks/init_scripts\npwd\nls -ltr\ncat init.sh"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":3}],"metadata":{"name":"generate-init-script","notebookId":2645746662301564},"nbformat":4,"nbformat_minor":0} diff --git a/docs/demo/GCP/Mortgage-ETL-CPU.ipynb b/docs/demo/GCP/Mortgage-ETL-CPU.ipynb new file mode 100644 index 00000000000..8618ed1f072 --- /dev/null +++ b/docs/demo/GCP/Mortgage-ETL-CPU.ipynb @@ -0,0 +1,1174 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Source\n", + "\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. For the full raw dataset visit [Fannie Mae]() to register for an account and to download\n", + "\n", + "Instruction is available at NVIDIA [RAPIDS demo site](https://rapidsai.github.io/demos/datasets/mortgage-data).\n", + "\n", + "### Prerequisite\n", + "\n", + "This notebook runs in a Dataproc cluster with GPU nodes, with [Spark RAPIDS](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids) set up.\n", + "\n", + "### Define ETL Process\n", + "\n", + "Define data schema and steps to do the ETL process:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from pyspark import broadcast\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import *\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql.window import Window\n", + "\n", + "def _get_quarter_from_csv_file_name():\n", + " return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)\n", + "\n", + "_csv_perf_schema = StructType([\n", + " StructField('loan_id', LongType()),\n", + " StructField('monthly_reporting_period', StringType()),\n", + " StructField('servicer', StringType()),\n", + " StructField('interest_rate', DoubleType()),\n", + " StructField('current_actual_upb', DoubleType()),\n", + " StructField('loan_age', DoubleType()),\n", + " StructField('remaining_months_to_legal_maturity', DoubleType()),\n", + " StructField('adj_remaining_months_to_maturity', DoubleType()),\n", + " StructField('maturity_date', StringType()),\n", + " StructField('msa', DoubleType()),\n", + " StructField('current_loan_delinquency_status', IntegerType()),\n", + " StructField('mod_flag', StringType()),\n", + " StructField('zero_balance_code', StringType()),\n", + " 
StructField('zero_balance_effective_date', StringType()),\n", + " StructField('last_paid_installment_date', StringType()),\n", + " StructField('foreclosed_after', StringType()),\n", + " StructField('disposition_date', StringType()),\n", + " StructField('foreclosure_costs', DoubleType()),\n", + " StructField('prop_preservation_and_repair_costs', DoubleType()),\n", + " StructField('asset_recovery_costs', DoubleType()),\n", + " StructField('misc_holding_expenses', DoubleType()),\n", + " StructField('holding_taxes', DoubleType()),\n", + " StructField('net_sale_proceeds', DoubleType()),\n", + " StructField('credit_enhancement_proceeds', DoubleType()),\n", + " StructField('repurchase_make_whole_proceeds', StringType()),\n", + " StructField('other_foreclosure_proceeds', DoubleType()),\n", + " StructField('non_interest_bearing_upb', DoubleType()),\n", + " StructField('principal_forgiveness_upb', StringType()),\n", + " StructField('repurchase_make_whole_proceeds_flag', StringType()),\n", + " StructField('foreclosure_principal_write_off_amount', StringType()),\n", + " StructField('servicing_activity_indicator', StringType())])\n", + "_csv_acq_schema = StructType([\n", + " StructField('loan_id', LongType()),\n", + " StructField('orig_channel', StringType()),\n", + " StructField('seller_name', StringType()),\n", + " StructField('orig_interest_rate', DoubleType()),\n", + " StructField('orig_upb', IntegerType()),\n", + " StructField('orig_loan_term', IntegerType()),\n", + " StructField('orig_date', StringType()),\n", + " StructField('first_pay_date', StringType()),\n", + " StructField('orig_ltv', DoubleType()),\n", + " StructField('orig_cltv', DoubleType()),\n", + " StructField('num_borrowers', DoubleType()),\n", + " StructField('dti', DoubleType()),\n", + " StructField('borrower_credit_score', DoubleType()),\n", + " StructField('first_home_buyer', StringType()),\n", + " StructField('loan_purpose', StringType()),\n", + " StructField('property_type', StringType()),\n", + " StructField('num_units', IntegerType()),\n", + " StructField('occupancy_status', StringType()),\n", + " StructField('property_state', StringType()),\n", + " StructField('zip', IntegerType()),\n", + " StructField('mortgage_insurance_percent', DoubleType()),\n", + " StructField('product_type', StringType()),\n", + " StructField('coborrow_credit_score', DoubleType()),\n", + " StructField('mortgage_insurance_type', DoubleType()),\n", + " StructField('relocation_mortgage_indicator', StringType())])\n", + "_name_mapping = [\n", + " (\"WITMER FUNDING, LLC\", \"Witmer\"),\n", + " (\"WELLS FARGO CREDIT RISK TRANSFER SECURITIES TRUST 2015\", \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, N.A.\" , \"Wells Fargo\"),\n", + " (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n", + " (\"USAA FEDERAL SAVINGS BANK\" , \"USAA\"),\n", + " (\"UNITED SHORE FINANCIAL SERVICES, LLC D\\\\/B\\\\/A UNITED WHOLESALE MORTGAGE\" , \"United Seq(e\"),\n", + " (\"U.S. 
BANK N.A.\" , \"US Bank\"),\n", + " (\"SUNTRUST MORTGAGE INC.\" , \"Suntrust\"),\n", + " (\"STONEGATE MORTGAGE CORPORATION\" , \"Stonegate Mortgage\"),\n", + " (\"STEARNS LENDING, LLC\" , \"Stearns Lending\"),\n", + " (\"STEARNS LENDING, INC.\" , \"Stearns Lending\"),\n", + " (\"SIERRA PACIFIC MORTGAGE COMPANY, INC.\" , \"Sierra Pacific Mortgage\"),\n", + " (\"REGIONS BANK\" , \"Regions\"),\n", + " (\"RBC MORTGAGE COMPANY\" , \"RBC\"),\n", + " (\"QUICKEN LOANS INC.\" , \"Quicken Loans\"),\n", + " (\"PULTE MORTGAGE, L.L.C.\" , \"Pulte Mortgage\"),\n", + " (\"PROVIDENT FUNDING ASSOCIATES, L.P.\" , \"Provident Funding\"),\n", + " (\"PROSPECT MORTGAGE, LLC\" , \"Prospect Mortgage\"),\n", + " (\"PRINCIPAL RESIDENTIAL MORTGAGE CAPITAL RESOURCES, LLC\" , \"Principal Residential\"),\n", + " (\"PNC BANK, N.A.\" , \"PNC\"),\n", + " (\"PMT CREDIT RISK TRANSFER TRUST 2015-2\" , \"PennyMac\"),\n", + " (\"PHH MORTGAGE CORPORATION\" , \"PHH Mortgage\"),\n", + " (\"PENNYMAC CORP.\" , \"PennyMac\"),\n", + " (\"PACIFIC UNION FINANCIAL, LLC\" , \"Other\"),\n", + " (\"OTHER\" , \"Other\"),\n", + " (\"NYCB MORTGAGE COMPANY, LLC\" , \"NYCB\"),\n", + " (\"NEW YORK COMMUNITY BANK\" , \"NYCB\"),\n", + " (\"NETBANK FUNDING SERVICES\" , \"Netbank\"),\n", + " (\"NATIONSTAR MORTGAGE, LLC\" , \"Nationstar Mortgage\"),\n", + " (\"METLIFE BANK, NA\" , \"Metlife\"),\n", + " (\"LOANDEPOT.COM, LLC\" , \"LoanDepot.com\"),\n", + " (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2015-1\" , \"JP Morgan Chase\"),\n", + " (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2014-1\" , \"JP Morgan Chase\"),\n", + " (\"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION\" , \"JP Morgan Chase\"),\n", + " (\"JPMORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n", + " (\"JP MORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n", + " (\"IRWIN MORTGAGE, CORPORATION\" , \"Irwin Mortgage\"),\n", + " (\"IMPAC MORTGAGE CORP.\" , \"Impac Mortgage\"),\n", + " (\"HSBC BANK USA, NATIONAL ASSOCIATION\" , \"HSBC\"),\n", + " (\"HOMEWARD RESIDENTIAL, INC.\" , \"Homeward Mortgage\"),\n", + " (\"HOMESTREET BANK\" , \"Other\"),\n", + " (\"HOMEBRIDGE FINANCIAL SERVICES, INC.\" , \"HomeBridge\"),\n", + " (\"HARWOOD STREET FUNDING I, LLC\" , \"Harwood Mortgage\"),\n", + " (\"GUILD MORTGAGE COMPANY\" , \"Guild Mortgage\"),\n", + " (\"GMAC MORTGAGE, LLC (USAA FEDERAL SAVINGS BANK)\" , \"GMAC\"),\n", + " (\"GMAC MORTGAGE, LLC\" , \"GMAC\"),\n", + " (\"GMAC (USAA)\" , \"GMAC\"),\n", + " (\"FREMONT BANK\" , \"Fremont Bank\"),\n", + " (\"FREEDOM MORTGAGE CORP.\" , \"Freedom Mortgage\"),\n", + " (\"FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"Franklin America\"),\n", + " (\"FLEET NATIONAL BANK\" , \"Fleet National\"),\n", + " (\"FLAGSTAR CAPITAL MARKETS CORPORATION\" , \"Flagstar Bank\"),\n", + " (\"FLAGSTAR BANK, FSB\" , \"Flagstar Bank\"),\n", + " (\"FIRST TENNESSEE BANK NATIONAL ASSOCIATION\" , \"Other\"),\n", + " (\"FIFTH THIRD BANK\" , \"Fifth Third Bank\"),\n", + " (\"FEDERAL HOME LOAN BANK OF CHICAGO\" , \"Fedral Home of Chicago\"),\n", + " (\"FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB\" , \"FDIC\"),\n", + " (\"DOWNEY SAVINGS AND LOAN ASSOCIATION, F.A.\" , \"Downey Mortgage\"),\n", + " (\"DITECH FINANCIAL LLC\" , \"Ditech\"),\n", + " (\"CITIMORTGAGE, INC.\" , \"Citi\"),\n", + " (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERFIRST MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n", + " (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERBANK MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n", + " (\"CHASE HOME FINANCE, LLC\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE FRANKLIN AMERICAN 
MORTGAGE COMPANY\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE (CIE 1)\" , \"JP Morgan Chase\"),\n", + " (\"CHASE HOME FINANCE\" , \"JP Morgan Chase\"),\n", + " (\"CASHCALL, INC.\" , \"CashCall\"),\n", + " (\"CAPITAL ONE, NATIONAL ASSOCIATION\" , \"Capital One\"),\n", + " (\"CALIBER HOME LOANS, INC.\" , \"Caliber Funding\"),\n", + " (\"BISHOPS GATE RESIDENTIAL MORTGAGE TRUST\" , \"Bishops Gate Mortgage\"),\n", + " (\"BANK OF AMERICA, N.A.\" , \"Bank of America\"),\n", + " (\"AMTRUST BANK\" , \"AmTrust\"),\n", + " (\"AMERISAVE MORTGAGE CORPORATION\" , \"Amerisave\"),\n", + " (\"AMERIHOME MORTGAGE COMPANY, LLC\" , \"AmeriHome Mortgage\"),\n", + " (\"ALLY BANK\" , \"Ally Bank\"),\n", + " (\"ACADEMY MORTGAGE CORPORATION\" , \"Academy Mortgage\"),\n", + " (\"NO CASH-OUT REFINANCE\" , \"OTHER REFINANCE\"),\n", + " (\"REFINANCE - NOT SPECIFIED\" , \"OTHER REFINANCE\"),\n", + " (\"Other REFINANCE\" , \"OTHER REFINANCE\")]\n", + "\n", + "cate_col_names = [\n", + " \"orig_channel\",\n", + " \"first_home_buyer\",\n", + " \"loan_purpose\",\n", + " \"property_type\",\n", + " \"occupancy_status\",\n", + " \"property_state\",\n", + " \"relocation_mortgage_indicator\",\n", + " \"seller_name\",\n", + " \"mod_flag\"\n", + "]\n", + "# Numberic columns\n", + "label_col_name = \"delinquency_12\"\n", + "numeric_col_names = [\n", + " \"orig_interest_rate\",\n", + " \"orig_upb\",\n", + " \"orig_loan_term\",\n", + " \"orig_ltv\",\n", + " \"orig_cltv\",\n", + " \"num_borrowers\",\n", + " \"dti\",\n", + " \"borrower_credit_score\",\n", + " \"num_units\",\n", + " \"zip\",\n", + " \"mortgage_insurance_percent\",\n", + " \"current_loan_delinquency_status\",\n", + " \"current_actual_upb\",\n", + " \"interest_rate\",\n", + " \"loan_age\",\n", + " \"msa\",\n", + " \"non_interest_bearing_upb\",\n", + " label_col_name\n", + "]\n", + "all_col_names = cate_col_names + numeric_col_names\n", + "\n", + "def read_perf_csv(spark, path):\n", + " return spark.read.format('csv') \\\n", + " .option('nullValue', '') \\\n", + " .option('header', 'false') \\\n", + " .option('delimiter', '|') \\\n", + " .schema(_csv_perf_schema) \\\n", + " .load(path) \\\n", + " .withColumn('quarter', _get_quarter_from_csv_file_name())\n", + "\n", + "def read_acq_csv(spark, path):\n", + " return spark.read.format('csv') \\\n", + " .option('nullValue', '') \\\n", + " .option('header', 'false') \\\n", + " .option('delimiter', '|') \\\n", + " .schema(_csv_acq_schema) \\\n", + " .load(path) \\\n", + " .withColumn('quarter', _get_quarter_from_csv_file_name())\n", + "\n", + "def _parse_dates(perf):\n", + " return perf \\\n", + " .withColumn('monthly_reporting_period', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy')) \\\n", + " .withColumn('monthly_reporting_period_month', month(col('monthly_reporting_period'))) \\\n", + " .withColumn('monthly_reporting_period_year', year(col('monthly_reporting_period'))) \\\n", + " .withColumn('monthly_reporting_period_day', dayofmonth(col('monthly_reporting_period'))) \\\n", + " .withColumn('last_paid_installment_date', to_date(col('last_paid_installment_date'), 'MM/dd/yyyy')) \\\n", + " .withColumn('foreclosed_after', to_date(col('foreclosed_after'), 'MM/dd/yyyy')) \\\n", + " .withColumn('disposition_date', to_date(col('disposition_date'), 'MM/dd/yyyy')) \\\n", + " .withColumn('maturity_date', to_date(col('maturity_date'), 'MM/yyyy')) \\\n", + " .withColumn('zero_balance_effective_date', to_date(col('zero_balance_effective_date'), 'MM/yyyy'))\n", + "\n", + "def _create_perf_deliquency(spark, perf):\n", + " 
aggDF = perf.select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " col(\"current_loan_delinquency_status\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\") \\\n", + " .agg(\n", + " max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n", + " min(\"delinquency_30\").alias(\"delinquency_30\"),\n", + " min(\"delinquency_90\").alias(\"delinquency_90\"),\n", + " min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n", + " (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n", + " (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"))\n", + " joinedDf = perf \\\n", + " .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n", + " .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n", + " .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n", + " .join(aggDF, [\"loan_id\", \"quarter\"], \"left_outer\")\n", + "\n", + " # calculate the 12 month delinquency and upb values\n", + " months = 12\n", + " monthArray = [lit(x) for x in range(0, 12)]\n", + " # explode on a small amount of data is actually slightly more efficient than a cross join\n", + " testDf = joinedDf \\\n", + " .withColumn(\"month_y\", explode(array(monthArray))) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n", + " col(\"ever_30\"),\n", + " col(\"ever_90\"),\n", + " col(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"),\n", + " col(\"loan_id\"),\n", + " col(\"month_y\"),\n", + " col(\"delinquency_12\"),\n", + " col(\"upb_12\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n", + " .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n", + " .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n", + " .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n", + " .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n", + " .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n", + " 
.drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n", + "\n", + " return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n", + " .drop(\"timestamp_year\", \"timestamp_month\")\n", + "\n", + "def _create_acquisition(spark, acq):\n", + " nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n", + " return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n", + " .drop(\"from_seller_name\") \\\n", + " .withColumn(\"old_name\", col(\"seller_name\")) \\\n", + " .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n", + " .drop(\"to_seller_name\") \\\n", + " .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n", + " .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n", + "\n", + "def _gen_dictionary(etl_df, col_names):\n", + " cnt_table = etl_df.select(posexplode(array([col(i) for i in col_names])))\\\n", + " .withColumnRenamed(\"pos\", \"column_id\")\\\n", + " .withColumnRenamed(\"col\", \"data\")\\\n", + " .filter(\"data is not null\")\\\n", + " .groupBy(\"column_id\", \"data\")\\\n", + " .count()\n", + " windowed = Window.partitionBy(\"column_id\").orderBy(desc(\"count\"))\n", + " return cnt_table.withColumn(\"id\", row_number().over(windowed)).drop(\"count\")\n", + "\n", + "\n", + "def _cast_string_columns_to_numeric(spark, input_df):\n", + " cached_dict_df = _gen_dictionary(input_df, cate_col_names).cache()\n", + " output_df = input_df\n", + " # Generate the final table with all columns being numeric.\n", + " for col_pos, col_name in enumerate(cate_col_names):\n", + " col_dict_df = cached_dict_df.filter(col(\"column_id\") == col_pos)\\\n", + " .drop(\"column_id\")\\\n", + " .withColumnRenamed(\"data\", col_name)\n", + " \n", + " output_df = output_df.join(broadcast(col_dict_df), col_name, \"left\")\\\n", + " .drop(col_name)\\\n", + " .withColumnRenamed(\"id\", col_name)\n", + " return output_df\n", + "\n", + "def run_mortgage(spark, perf, acq):\n", + " parsed_perf = _parse_dates(perf)\n", + " perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n", + " cleaned_acq = _create_acquisition(spark, acq)\n", + " df = perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\")\n", + " test_quarters = ['2016Q1','2016Q2','2016Q3','2016Q4']\n", + " train_df = df.filter(~df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " test_df = df.filter(df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " casted_train_df = _cast_string_columns_to_numeric(spark, train_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " casted_test_df = _cast_string_columns_to_numeric(spark, test_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " return casted_train_df, casted_test_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Spark conf and Create Spark Session\n", + "For details explanation for spark conf, please go to Spark RAPIDS [config guide](https://nvidia.github.io/spark-rapids/docs/configs.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sc.stop()\n", + "\n", + "conf = SparkConf().setAppName(\"MortgageETL-CPU\")\n", + "conf.set(\"spark.executor.instances\", \"20\")\n", + "conf.set(\"spark.executor.cores\", \"7\") # spark.executor.cores times spark.executor.instances should equal total cores.\n", + "conf.set(\"spark.task.cpus\", \"1\")\n", + "conf.set(\"spark.executor.memory\", \"36g\")\n", + "conf.set(\"spark.locality.wait\", \"0s\")\n", + "conf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"0\")\n", + "conf.set(\"spark.task.resource.gpu.amount\", \"0\")\n", + "conf.set(\"spark.plugins\", \" \")\n", + "conf.set(\"spark.sql.broadcastTimeout\", \"7200\")\n", + "spark = SparkSession.builder \\\n", + " .config(conf=conf) \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Data Input/Output location" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "orig_perf_path = 'gs://dataproc-nv-demo/mortgage_full/perf/*'\n", + "orig_acq_path = 'gs://dataproc-nv-demo/mortgage_full/acq/*'\n", + "train_path = 'gs://dataproc-nv-demo/mortgage_cpu/train/'\n", + "test_path = 'gs://dataproc-nv-demo/mortgage_cpu/test/'\n", + "tmp_perf_path = 'gs://dataproc-nv-demo/mortgage_parquet_cpu/perf/'\n", + "tmp_acq_path = 'gs://dataproc-nv-demo/mortgage_parquet_cpu/acq/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read CSV data and Transcode to Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Lets transcode the data first\n", + "start = time.time()\n", + "# we want a few big files instead of lots of small files\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\n", + "acq = read_acq_csv(spark, orig_acq_path)\n", + "acq.repartition(20).write.parquet(tmp_acq_path, mode='overwrite')\n", + "perf = read_perf_csv(spark, orig_perf_path)\n", + "perf.coalesce(80).write.parquet(tmp_perf_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Execute ETL Code Defined in 1st Cell" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1194.4553289413452\n", + "1813.5378119945526\n" + ] + } + ], + "source": [ + "# Now lets actually process the data\\n\",\n", + "start = time.time()\n", + "spark.conf.set('spark.sql.shuffle.partitions', '160')\n", + "perf = spark.read.parquet(tmp_perf_path)\n", + "acq = spark.read.parquet(tmp_acq_path)\n", + "train_out, test_out = run_mortgage(spark, perf, acq)\n", + "train_out.write.parquet(train_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)\n", + "test_out.write.parquet(test_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Physical Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(27) Project [coalesce(orig_channel#1675, 0) AS orig_channel#3439, coalesce(first_home_buyer#1877, 0) AS first_home_buyer#3440, coalesce(loan_purpose#2079, 0) AS 
loan_purpose#3441, coalesce(property_type#2281, 0) AS property_type#3442, coalesce(occupancy_status#2483, 0) AS occupancy_status#3443, coalesce(property_state#2685, 0) AS property_state#3444, coalesce(relocation_mortgage_indicator#2887, 0) AS relocation_mortgage_indicator#3445, coalesce(seller_name#3089, 0) AS seller_name#3446, coalesce(id#1498, 0) AS mod_flag#3447, coalesce(nanvl(orig_interest_rate#67, null), 0.0) AS orig_interest_rate#3448, coalesce(orig_upb#68, 0) AS orig_upb#3449, coalesce(orig_loan_term#69, 0) AS orig_loan_term#3450, coalesce(nanvl(orig_ltv#72, null), 0.0) AS orig_ltv#3451, coalesce(nanvl(orig_cltv#73, null), 0.0) AS orig_cltv#3452, coalesce(nanvl(num_borrowers#74, null), 0.0) AS num_borrowers#3453, coalesce(nanvl(dti#75, null), 0.0) AS dti#3454, coalesce(nanvl(borrower_credit_score#76, null), 0.0) AS borrower_credit_score#3455, coalesce(num_units#80, 0) AS num_units#3456, coalesce(zip#83, 0) AS zip#3457, coalesce(nanvl(mortgage_insurance_percent#84, null), 0.0) AS mortgage_insurance_percent#3458, coalesce(current_loan_delinquency_status#10, 0) AS current_loan_delinquency_status#3459, coalesce(nanvl(current_actual_upb#4, null), 0.0) AS current_actual_upb#3460, coalesce(nanvl(interest_rate#3, null), 0.0) AS interest_rate#3461, coalesce(nanvl(loan_age#5, null), 0.0) AS loan_age#3462, ... 3 more fields]\n", + "+- *(27) BroadcastHashJoin [mod_flag#11], [mod_flag#3157], LeftOuter, BuildRight\n", + " :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, property_type#2281, occupancy_status#2483, ... 3 more fields]\n", + " : +- *(27) BroadcastHashJoin [seller_name#1172], [seller_name#2955], LeftOuter, BuildRight\n", + " : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, property_type#2281, ... 3 more fields]\n", + " : : +- *(27) BroadcastHashJoin [relocation_mortgage_indicator#88], [relocation_mortgage_indicator#2753], LeftOuter, BuildRight\n", + " : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, first_home_buyer#1877, loan_purpose#2079, ... 
3 more fields]\n", + " : : : +- *(27) BroadcastHashJoin [property_state#82], [property_state#2551], LeftOuter, BuildRight\n", + " : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, first_home_buyer#1877, ... 3 more fields]\n", + " : : : : +- *(27) BroadcastHashJoin [occupancy_status#81], [occupancy_status#2349], LeftOuter, BuildRight\n", + " : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, orig_channel#1675, ... 3 more fields]\n", + " : : : : : +- *(27) BroadcastHashJoin [property_type#79], [property_type#2147], LeftOuter, BuildRight\n", + " : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, relocation_mortgage_indicator#88, ... 3 more fields]\n", + " : : : : : : +- *(27) BroadcastHashJoin [loan_purpose#78], [loan_purpose#1945], LeftOuter, BuildRight\n", + " : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, mortgage_insurance_percent#84, ... 3 more fields]\n", + " : : : : : : : +- *(27) BroadcastHashJoin [first_home_buyer#77], [first_home_buyer#1743], LeftOuter, BuildRight\n", + " : : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, first_home_buyer#77, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, zip#83, ... 
3 more fields]\n", + " : : : : : : : : +- *(27) BroadcastHashJoin [orig_channel#65], [orig_channel#1541], LeftOuter, BuildRight\n", + " : : : : : : : : :- *(27) Project [interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812, orig_channel#65, seller_name#1172, orig_interest_rate#67, orig_upb#68, orig_loan_term#69, orig_ltv#72, orig_cltv#73, num_borrowers#74, dti#75, borrower_credit_score#76, first_home_buyer#77, loan_purpose#78, property_type#79, num_units#80, occupancy_status#81, property_state#82, ... 3 more fields]\n", + " : : : : : : : : : +- *(27) SortMergeJoin [loan_id#0L, quarter#31], [loan_id#64L, quarter#89], Inner\n", + " : : : : : : : : : :- *(11) Sort [loan_id#0L ASC NULLS FIRST, quarter#31 ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : +- Exchange hashpartitioning(loan_id#0L, quarter#31, 160), true, [id=#2011]\n", + " : : : : : : : : : : +- *(10) Project [quarter#31, loan_id#0L, interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, delinquency_12#812]\n", + " : : : : : : : : : : +- SortMergeJoin [quarter#31, loan_id#0L, cast(timestamp_year#876 as bigint), cast(timestamp_month#840 as bigint)], [quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L], LeftOuter\n", + " : : : : : : : : : : :- *(2) Sort [quarter#31 ASC NULLS FIRST, loan_id#0L ASC NULLS FIRST, cast(timestamp_year#876 as bigint) ASC NULLS FIRST, cast(timestamp_month#840 as bigint) ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : : +- Exchange hashpartitioning(quarter#31, loan_id#0L, cast(timestamp_year#876 as bigint), cast(timestamp_month#840 as bigint), 160), true, [id=#1968]\n", + " : : : : : : : : : : : +- *(1) Project [loan_id#0L, interest_rate#3, current_actual_upb#4, loan_age#5, msa#9, current_loan_delinquency_status#10, mod_flag#11, non_interest_bearing_upb#26, quarter#31, month(cast(cast(unix_timestamp(monthly_reporting_period#1, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#840, year(cast(cast(unix_timestamp(monthly_reporting_period#1, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#876]\n", + " : : : : : : : : : : : +- *(1) Filter ((NOT quarter#31 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(loan_id#0L)) AND isnotnull(quarter#31))\n", + " : : : : : : : : : : : +- *(1) ColumnarToRow\n", + " : : : : : : : : : : : +- FileScan parquet [loan_id#0L,monthly_reporting_period#1,interest_rate#3,current_actual_upb#4,loan_age#5,msa#9,current_loan_delinquency_status#10,mod_flag#11,non_interest_bearing_upb#26,quarter#31] Batched: true, DataFilters: [NOT quarter#31 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#0L), isnotnull(quarter#31)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) 
END AS delinquency_180#436]\n", + " : : : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN 
(2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, 
delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, 
MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, 
year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - 
month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN 
(current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : : +- *(5) ColumnarToRow\n", + " : : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, 
year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : : +- *(3) ColumnarToRow\n", + " : : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : : +- *(5) ColumnarToRow\n", + " : : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, 
ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : : +- *(3) ColumnarToRow\n", + " : : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) 
as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : : +- *(5) ColumnarToRow\n", + " : : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : : +- *(3) Filter ((NOT quarter#943 IN 
(2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : : +- *(3) ColumnarToRow\n", + " : : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : : +- *(5) ColumnarToRow\n", + " : : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 
24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : : +- *(3) ColumnarToRow\n", + " : : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : : 
+- *(5) ColumnarToRow\n", + " : : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- *(9) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST, timestamp_year#766L ASC NULLS FIRST, timestamp_month#795L ASC NULLS FIRST], false, 0\n", + " : +- Exchange hashpartitioning(quarter#943, loan_id#912L, timestamp_year#766L, timestamp_month#795L, 160), true, [id=#264]\n", + " : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : +- *(8) HashAggregate(keys=[quarter#943, loan_id#912L, josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, month_y#707], functions=[])\n", + " : +- *(8) Project [quarter#943, FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) AS josh_mody_n#723L, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456, loan_id#912L, month_y#707]\n", + " : +- *(8) Filter (isnotnull(CASE WHEN (pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) = 0) THEN 12 ELSE pmod(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast(month_y#707 as bigint)), 12) END) AND isnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#546 * 12) + timestamp_month#510) - 24000) - month_y#707) as double) / 12.0)) * 12)) + cast((month_y#707 - 1) as bigint)) as double) / 12.0))))\n", + " : +- Generate explode([0,1,2,3,4,5,6,7,8,9,10,11]), [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456], false, [month_y#707]\n", + " : +- *(7) Project [loan_id#912L, quarter#943, timestamp_month#510, timestamp_year#546, ever_30#463, ever_90#464, ever_180#465, delinquency_30#452, delinquency_90#454, delinquency_180#456]\n", + " : +- SortMergeJoin [quarter#943, loan_id#912L], [quarter#692, loan_id#661L], LeftOuter\n", + " : :- *(4) Sort [quarter#943 ASC NULLS FIRST, loan_id#912L ASC NULLS FIRST], false, 0\n", + " : : +- Exchange hashpartitioning(quarter#943, loan_id#912L, 160), true, [id=#238]\n", + " : : +- *(3) Project [quarter#943, loan_id#912L, month(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_month#510, year(cast(cast(unix_timestamp(monthly_reporting_period#913, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date)) AS timestamp_year#546]\n", + " : : +- *(3) Filter ((NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4) AND isnotnull(quarter#943)) AND isnotnull(loan_id#912L))\n", + " : : +- *(3) ColumnarToRow\n", + " : : +- FileScan parquet [loan_id#912L,monthly_reporting_period#913,quarter#943] Batched: true, DataFilters: [NOT quarter#943 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(quarter#943), isnotnull(loan_id#912L)], Format: Parquet, Location: 
InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(quarter), IsNotNull(loan_id)], ReadSchema: struct\n", + " : +- *(6) Sort [quarter#692 ASC NULLS FIRST, loan_id#661L ASC NULLS FIRST], false, 0\n", + " : +- *(6) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[max(current_loan_delinquency_status#671), min(delinquency_30#434), min(delinquency_90#435), min(delinquency_180#436)])\n", + " : +- Exchange hashpartitioning(quarter#692, loan_id#661L, 160), true, [id=#248]\n", + " : +- *(5) HashAggregate(keys=[quarter#692, loan_id#661L], functions=[partial_max(current_loan_delinquency_status#671), partial_min(delinquency_30#434), partial_min(delinquency_90#435), partial_min(delinquency_180#436)])\n", + " : +- *(5) Project [quarter#692, loan_id#661L, current_loan_delinquency_status#671, CASE WHEN (current_loan_delinquency_status#671 >= 1) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_30#434, CASE WHEN (current_loan_delinquency_status#671 >= 3) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_90#435, CASE WHEN (current_loan_delinquency_status#671 >= 6) THEN cast(cast(unix_timestamp(monthly_reporting_period#662, MM/dd/yyyy, Some(Etc/UTC)) as timestamp) as date) END AS delinquency_180#436]\n", + " : +- *(5) Filter (isnotnull(loan_id#661L) AND isnotnull(quarter#692))\n", + " : +- *(5) ColumnarToRow\n", + " : +- FileScan parquet [loan_id#661L,monthly_reporting_period#662,current_loan_delinquency_status#671,quarter#692] Batched: true, DataFilters: [isnotnull(loan_id#661L), isnotnull(quarter#692)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_cpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n", + " when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\") \\\n", + " .agg(\n", + " max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n", + " min(\"delinquency_30\").alias(\"delinquency_30\"),\n", + " min(\"delinquency_90\").alias(\"delinquency_90\"),\n", + " min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " col(\"loan_id\"),\n", + " (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n", + " (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n", + " (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"))\n", + " joinedDf = perf \\\n", + " .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n", + " .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n", + " .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n", + " .join(aggDF, 
[\"loan_id\", \"quarter\"], \"left_outer\")\n", + "\n", + " # calculate the 12 month delinquency and upb values\n", + " months = 12\n", + " monthArray = [lit(x) for x in range(0, 12)]\n", + " # explode on a small amount of data is actually slightly more efficient than a cross join\n", + " testDf = joinedDf \\\n", + " .withColumn(\"month_y\", explode(array(monthArray))) \\\n", + " .select(\n", + " col(\"quarter\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n", + " floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n", + " col(\"ever_30\"),\n", + " col(\"ever_90\"),\n", + " col(\"ever_180\"),\n", + " col(\"delinquency_30\"),\n", + " col(\"delinquency_90\"),\n", + " col(\"delinquency_180\"),\n", + " col(\"loan_id\"),\n", + " col(\"month_y\"),\n", + " col(\"delinquency_12\"),\n", + " col(\"upb_12\")) \\\n", + " .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n", + " .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n", + " .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n", + " .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n", + " .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n", + " .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n", + " .drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n", + "\n", + " return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n", + " .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n", + " .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n", + " .drop(\"timestamp_year\", \"timestamp_month\")\n", + "\n", + "def _create_acquisition(spark, acq):\n", + " nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n", + " return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n", + " .drop(\"from_seller_name\") \\\n", + " .withColumn(\"old_name\", col(\"seller_name\")) \\\n", + " .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n", + " .drop(\"to_seller_name\") \\\n", + " .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n", + " .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n", + "\n", + "def _gen_dictionary(etl_df, col_names):\n", + " cnt_table = etl_df.select(posexplode(array([col(i) for i in col_names])))\\\n", + " .withColumnRenamed(\"pos\", \"column_id\")\\\n", + " .withColumnRenamed(\"col\", \"data\")\\\n", + " .filter(\"data is not null\")\\\n", + " .groupBy(\"column_id\", \"data\")\\\n", + " .count()\n", + " windowed = Window.partitionBy(\"column_id\").orderBy(desc(\"count\"))\n", + " return cnt_table.withColumn(\"id\", row_number().over(windowed)).drop(\"count\")\n", + "\n", + "\n", + "def _cast_string_columns_to_numeric(spark, input_df):\n", + " cached_dict_df = _gen_dictionary(input_df, cate_col_names).cache()\n", + " output_df = 
input_df\n", + " # Generate the final table with all columns being numeric.\n", + " for col_pos, col_name in enumerate(cate_col_names):\n", + " col_dict_df = cached_dict_df.filter(col(\"column_id\") == col_pos)\\\n", + " .drop(\"column_id\")\\\n", + " .withColumnRenamed(\"data\", col_name)\n", + " \n", + " output_df = output_df.join(broadcast(col_dict_df), col_name, \"left\")\\\n", + " .drop(col_name)\\\n", + " .withColumnRenamed(\"id\", col_name)\n", + " return output_df\n", + "\n", + "def run_mortgage(spark, perf, acq):\n", + " parsed_perf = _parse_dates(perf)\n", + " perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n", + " cleaned_acq = _create_acquisition(spark, acq)\n", + " df = perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\")\n", + " test_quarters = ['2016Q1','2016Q2','2016Q3','2016Q4']\n", + " train_df = df.filter(~df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " test_df = df.filter(df.quarter.isin(test_quarters)).drop(\"quarter\")\n", + " casted_train_df = _cast_string_columns_to_numeric(spark, train_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " casted_test_df = _cast_string_columns_to_numeric(spark, test_df)\\\n", + " .select(all_col_names)\\\n", + " .withColumn(label_col_name, when(col(label_col_name) > 0, 1).otherwise(0))\\\n", + " .fillna(float(0))\n", + " return casted_train_df, casted_test_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Spark conf and Create Spark Session\n", + "For details explanation for spark conf, please go to Spark RAPIDS [config guide](https://nvidia.github.io/spark-rapids/docs/configs.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "sc.stop()\n", + "\n", + "conf = SparkConf().setAppName(\"MortgageETL\")\n", + "conf.set('spark.rapids.sql.explain', 'ALL')\n", + "conf.set(\"spark.executor.instances\", \"20\")\n", + "conf.set(\"spark.executor.cores\", \"7\")\n", + "conf.set(\"spark.task.cpus\", \"1\")\n", + "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", \"2\")\n", + "conf.set(\"spark.executor.memory\", \"4g\")\n", + "conf.set(\"spark.rapids.memory.pinnedPool.size\", \"2G\")\n", + "conf.set(\"spark.executor.memoryOverhead\", \"2G\")\n", + "conf.set(\"spark.executor.extraJavaOptions\", \"-Dai.rapids.cudf.prefer-pinned=true\")\n", + "conf.set(\"spark.locality.wait\", \"0s\")\n", + "conf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\n", + "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n", + "conf.set(\"spark.task.resource.gpu.amount\", \"0.142\")\n", + "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n", + "conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n", + "conf.set('spark.rapids.sql.batchSizeBytes', '512M')\n", + "conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')\n", + "conf.set('spark.rapids.sql.variableFloatAgg.enabled', 'true')\n", + "\n", + "spark = SparkSession.builder \\\n", + " .config(conf=conf) \\\n", + " .getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Data Input/Output location" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "orig_perf_path = 'gs://dataproc-nv-demo/mortgage_full/perf/*'\n", + "orig_acq_path = 'gs://dataproc-nv-demo/mortgage_full/acq/*'\n", + "\n", + "train_path = 
'gs://dataproc-nv-demo/mortgage_full/train/'\n", + "test_path = 'gs://dataproc-nv-demo/mortgage_full/test/'\n", + "tmp_perf_path = 'gs://dataproc-nv-demo/mortgage_parquet_gpu/perf/'\n", + "tmp_acq_path = 'gs://dataproc-nv-demo/mortgage_parquet_gpu/acq/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read CSV data and Transcode to Parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "108.28529238700867\n" + ] + } + ], + "source": [ + "# Lets transcode the data first\n", + "start = time.time()\n", + "# we want a few big files instead of lots of small files\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\n", + "acq = read_acq_csv(spark, orig_acq_path)\n", + "acq.repartition(20).write.parquet(tmp_acq_path, mode='overwrite')\n", + "perf = read_perf_csv(spark, orig_perf_path)\n", + "perf.coalesce(80).write.parquet(tmp_perf_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Execute ETL Code Defined in 1st Cell" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "137.99262690544128\n", + "171.97584056854248\n" + ] + } + ], + "source": [ + "# Now lets actually process the data\\n\",\n", + "start = time.time()\n", + "spark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\n", + "spark.conf.set('spark.sql.shuffle.partitions', '160')\n", + "perf = spark.read.parquet(tmp_perf_path)\n", + "acq = spark.read.parquet(tmp_acq_path)\n", + "train_out, test_out = run_mortgage(spark, perf, acq)\n", + "train_out.write.parquet(train_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)\n", + "test_out.write.parquet(test_path, mode='overwrite')\n", + "end = time.time()\n", + "print(end - start)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Print Physical Plan" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "== Physical Plan ==\n", + "*(2) GpuColumnarToRow false\n", + "+- GpuProject [gpucoalesce(orig_channel#1922, 0) AS orig_channel#3686, gpucoalesce(first_home_buyer#2124, 0) AS first_home_buyer#3687, gpucoalesce(loan_purpose#2326, 0) AS loan_purpose#3688, gpucoalesce(property_type#2528, 0) AS property_type#3689, gpucoalesce(occupancy_status#2730, 0) AS occupancy_status#3690, gpucoalesce(property_state#2932, 0) AS property_state#3691, gpucoalesce(relocation_mortgage_indicator#3134, 0) AS relocation_mortgage_indicator#3692, gpucoalesce(seller_name#3336, 0) AS seller_name#3693, gpucoalesce(id#1728, 0) AS mod_flag#3694, gpucoalesce(gpunanvl(orig_interest_rate#297, null), 0.0) AS orig_interest_rate#3695, gpucoalesce(orig_upb#298, 0) AS orig_upb#3696, gpucoalesce(orig_loan_term#299, 0) AS orig_loan_term#3697, gpucoalesce(gpunanvl(orig_ltv#302, null), 0.0) AS orig_ltv#3698, gpucoalesce(gpunanvl(orig_cltv#303, null), 0.0) AS orig_cltv#3699, gpucoalesce(gpunanvl(num_borrowers#304, null), 0.0) AS num_borrowers#3700, gpucoalesce(gpunanvl(dti#305, null), 0.0) AS dti#3701, gpucoalesce(gpunanvl(borrower_credit_score#306, null), 0.0) AS borrower_credit_score#3702, gpucoalesce(num_units#310, 0) AS num_units#3703, gpucoalesce(zip#313, 0) AS zip#3704, 
gpucoalesce(gpunanvl(mortgage_insurance_percent#314, null), 0.0) AS mortgage_insurance_percent#3705, gpucoalesce(current_loan_delinquency_status#240, 0) AS current_loan_delinquency_status#3706, gpucoalesce(gpunanvl(current_actual_upb#234, null), 0.0) AS current_actual_upb#3707, gpucoalesce(gpunanvl(interest_rate#233, null), 0.0) AS interest_rate#3708, gpucoalesce(gpunanvl(loan_age#235, null), 0.0) AS loan_age#3709, ... 3 more fields]\n", + " +- GpuBroadcastHashJoin [mod_flag#241], [mod_flag#3404], LeftOuter, BuildRight\n", + " :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, property_type#2528, occupancy_status#2730, ... 3 more fields]\n", + " : +- GpuBroadcastHashJoin [seller_name#1402], [seller_name#3202], LeftOuter, BuildRight\n", + " : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, property_type#2528, ... 3 more fields]\n", + " : : +- GpuBroadcastHashJoin [relocation_mortgage_indicator#318], [relocation_mortgage_indicator#3000], LeftOuter, BuildRight\n", + " : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, first_home_buyer#2124, loan_purpose#2326, ... 3 more fields]\n", + " : : : +- GpuBroadcastHashJoin [property_state#312], [property_state#2798], LeftOuter, BuildRight\n", + " : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, first_home_buyer#2124, ... 
3 more fields]\n", + " : : : : +- GpuBroadcastHashJoin [occupancy_status#311], [occupancy_status#2596], LeftOuter, BuildRight\n", + " : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, orig_channel#1922, ... 3 more fields]\n", + " : : : : : +- GpuBroadcastHashJoin [property_type#309], [property_type#2394], LeftOuter, BuildRight\n", + " : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, relocation_mortgage_indicator#318, ... 3 more fields]\n", + " : : : : : : +- GpuBroadcastHashJoin [loan_purpose#308], [loan_purpose#2192], LeftOuter, BuildRight\n", + " : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, mortgage_insurance_percent#314, ... 3 more fields]\n", + " : : : : : : : +- GpuBroadcastHashJoin [first_home_buyer#307], [first_home_buyer#1990], LeftOuter, BuildRight\n", + " : : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, first_home_buyer#307, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, zip#313, ... 3 more fields]\n", + " : : : : : : : : +- GpuBroadcastHashJoin [orig_channel#295], [orig_channel#1788], LeftOuter, BuildRight\n", + " : : : : : : : : :- GpuProject [interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042, orig_channel#295, seller_name#1402, orig_interest_rate#297, orig_upb#298, orig_loan_term#299, orig_ltv#302, orig_cltv#303, num_borrowers#304, dti#305, borrower_credit_score#306, first_home_buyer#307, loan_purpose#308, property_type#309, num_units#310, occupancy_status#311, property_state#312, ... 
3 more fields]\n", + " : : : : : : : : : +- GpuShuffledHashJoin [loan_id#230L, quarter#261], [loan_id#294L, quarter#319], Inner, BuildRight\n", + " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(loan_id#230L, quarter#261, 160), true, [id=#3294]\n", + " : : : : : : : : : : +- GpuProject [quarter#261, loan_id#230L, interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, delinquency_12#1042]\n", + " : : : : : : : : : : +- GpuShuffledHashJoin [quarter#261, loan_id#230L, cast(timestamp_year#1106 as bigint), cast(timestamp_month#1070 as bigint)], [quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L], LeftOuter, BuildRight\n", + " : : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#261, loan_id#230L, cast(timestamp_year#1106 as bigint), cast(timestamp_month#1070 as bigint), 160), true, [id=#3124]\n", + " : : : : : : : : : : : +- GpuProject [loan_id#230L, interest_rate#233, current_actual_upb#234, loan_age#235, msa#239, current_loan_delinquency_status#240, mod_flag#241, non_interest_bearing_upb#256, quarter#261, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#231, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#1070, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#231, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#1106]\n", + " : : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : : +- GpuFilter ((NOT quarter#261 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#230L)) AND gpuisnotnull(quarter#261))\n", + " : : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#230L,monthly_reporting_period#231,interest_rate#233,current_actual_upb#234,loan_age#235,msa#239,current_loan_delinquency_status#240,mod_flag#241,non_interest_bearing_upb#256,quarter#261] Batched: true, DataFilters: [NOT quarter#261 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#230L), isnotnull(quarter#261)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], 
ReadSchema: struct\n", + " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: 
[Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE 
(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, 
CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, 
gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, 
delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), 
partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, 
delinquency_180#686]\n", + " : : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, 
timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), 
gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, 
timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: 
InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), 
IsNotNull(quarter)], ReadSchema: struct\n", + " : : +- GpuCoalesceBatches RequireSingleBatch\n", + " : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: [isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- GpuCoalesceBatches RequireSingleBatch\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, timestamp_year#996L, timestamp_month#1025L, 160), true, [id=#339]\n", + " : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : +- GpuHashAggregate(keys=[quarter#1173, loan_id#1142L, josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, month_y#937], functions=[]), filters=ArrayBuffer())\n", + " : +- GpuProject [quarter#1173, FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) AS josh_mody_n#953L, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686, loan_id#1142L, month_y#937]\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuFilter (gpuisnotnull(CASE WHEN ((((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) = 0) THEN 12 ELSE (((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - month_y#937) as double) / 12.0)) * 12)) + cast(month_y#937 as bigint)) pmod 12) END) AND gpuisnotnull(FLOOR((cast(((24000 + (FLOOR((cast(((((timestamp_year#776 * 12) + timestamp_month#740) - 24000) - 
month_y#937) as double) / 12.0)) * 12)) + cast((month_y#937 - 1) as bigint)) as double) / 12.0))))\n", + " : +- GpuGenerate false, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686], [month_y#937]\n", + " : +- GpuProject [loan_id#1142L, quarter#1173, timestamp_month#740, timestamp_year#776, ever_30#693, ever_90#694, ever_180#695, delinquency_30#682, delinquency_90#684, delinquency_180#686]\n", + " : +- GpuShuffledHashJoin [quarter#1173, loan_id#1142L], [quarter#922, loan_id#891L], LeftOuter, BuildRight\n", + " : :- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuColumnarExchange gpuhashpartitioning(quarter#1173, loan_id#1142L, 160), true, [id=#322]\n", + " : : +- GpuProject [quarter#1173, loan_id#1142L, gpumonth(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_month#740, gpuyear(cast(cast(gpuunixtimestamp(monthly_reporting_period#1143, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date)) AS timestamp_year#776]\n", + " : : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : : +- GpuFilter ((NOT quarter#1173 INSET (2016Q1,2016Q2,2016Q3,2016Q4) AND gpuisnotnull(loan_id#1142L)) AND gpuisnotnull(quarter#1173))\n", + " : : +- GpuFileGpuScan parquet [loan_id#1142L,monthly_reporting_period#1143,quarter#1173] Batched: true, DataFilters: [NOT quarter#1173 IN (2016Q1,2016Q2,2016Q3,2016Q4), isnotnull(loan_id#1142L), isnotnull(quarter#1..., Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [Not(In(quarter, [2016Q1,2016Q2,2016Q3,2016Q4])), IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n", + " : +- GpuCoalesceBatches RequireSingleBatch\n", + " : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[gpumax(current_loan_delinquency_status#901), gpumin(delinquency_30#664), gpumin(delinquency_90#665), gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuColumnarExchange gpuhashpartitioning(quarter#922, loan_id#891L, 160), true, [id=#327]\n", + " : +- GpuHashAggregate(keys=[quarter#922, loan_id#891L], functions=[partial_gpumax(current_loan_delinquency_status#901), partial_gpumin(delinquency_30#664), partial_gpumin(delinquency_90#665), partial_gpumin(delinquency_180#666)]), filters=ArrayBuffer(None, None, None, None))\n", + " : +- GpuProject [quarter#922, loan_id#891L, current_loan_delinquency_status#901, CASE WHEN (current_loan_delinquency_status#901 >= 1) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_30#664, CASE WHEN (current_loan_delinquency_status#901 >= 3) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_90#665, CASE WHEN (current_loan_delinquency_status#901 >= 6) THEN cast(cast(gpuunixtimestamp(monthly_reporting_period#892, MM/dd/yyyy, %m/%d/%Y, None) as timestamp) as date) END AS delinquency_180#666]\n", + " : +- GpuCoalesceBatches TargetSize(536870912)\n", + " : +- GpuFilter (gpuisnotnull(loan_id#891L) AND gpuisnotnull(quarter#922))\n", + " : +- GpuFileGpuScan parquet [loan_id#891L,monthly_reporting_period#892,current_loan_delinquency_status#901,quarter#922] Batched: true, DataFilters: 
[isnotnull(loan_id#891L), isnotnull(quarter#922)], Format: Parquet, Location: InMemoryFileIndex[gs://dataproc-nv-demo/mortgage_parquet_gpu/perf], PartitionFilters: [], PushedFilters: [IsNotNull(loan_id), IsNotNull(quarter)], ReadSchema: struct\n

Introduction to XGBoost Spark with GPU

\n

Mortgage is an example of using the XGBoost classifier for binary classification. This notebook shows how to load data, train the XGBoost model, and use the model to predict whether a loan will become delinquent. Compared to the original XGBoost Spark code, there is only one API difference.

\n

Load libraries

\n

First, load some common libraries that are used by both the GPU and CPU versions of XGBoost.

\n\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580281_1080045385", + "id": "20200712-043620_382811823", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:45+0000", + "dateFinished": "2020-07-13T02:18:45+0000", + "status": "FINISHED", + "focus": true, + "$$hashKey": "object:11086" + }, + { + "text": "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\nimport org.apache.spark.SparkConf", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:45+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostClassificationModel}\nimport org.apache.spark.sql.SparkSession\nimport org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\nimport org.apache.spark.sql.types.{DoubleType, IntegerType, StructField, StructType}\nimport org.apache.spark.SparkConf\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_314340064", + "id": "20200712-043620_1400821320", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:45+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11087" + }, + { + "text": "%md\nBesides CPU version requires some extra libraries, such as:\n\n```scala\nimport org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.sql.DataFrame\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.sql.types.FloatType\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

In addition, the CPU version requires some extra libraries, such as:

\n
import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.sql.DataFrame\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.sql.types.FloatType\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_1068889472", + "id": "20200712-043620_1625961573", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11088" + }, + { + "title": "Set the dataset path", + "text": "// Update all path with your Dataproc Environment\nval trainPath = \"gs://dataproc-nv-demo/mortgage_full/train/\"\nval evalPath = \"gs://dataproc-nv-demo/mortgage_full/test/\"\nval transPath = \"gs://dataproc-nv-demo/mortgage_full/test/\"\nval modelPath = \"gs://dataproc-nv-demo/mortgage_full/model/mortgage\"", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + }, + "title": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mtrainPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/train/\n\u001b[1m\u001b[34mevalPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/test/\n\u001b[1m\u001b[34mtransPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/test/\n\u001b[1m\u001b[34mmodelPath\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = gs://dataproc-nv-demo/mortgage_full/model/mortgage\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_1437224612", + "id": "20200712-043620_1955827407", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11089" + }, + { + "text": "%md\n## Build the schema and parameters\nThe mortgage data has 27 columns: 26 features and 1 label. \"deinquency_12\" is the label column. The schema will be used to load data in the future.\n\nThe next block also defines some key parameters used in xgboost training process.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Build the schema and parameters

\n

The mortgage data has 27 columns: 26 features and 1 label. “delinquency_12” is the label column. The schema will be used to load the data later.

\n

The next block also defines some key parameters used in the XGBoost training process.

\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_433144999", + "id": "20200712-043620_2043825692", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11090" + }, + { + "text": "val labelColName = \"delinquency_12\"\nval schema = StructType(List(\n StructField(\"orig_channel\", DoubleType),\n StructField(\"first_home_buyer\", DoubleType),\n StructField(\"loan_purpose\", DoubleType),\n StructField(\"property_type\", DoubleType),\n StructField(\"occupancy_status\", DoubleType),\n StructField(\"property_state\", DoubleType),\n StructField(\"product_type\", DoubleType),\n StructField(\"relocation_mortgage_indicator\", DoubleType),\n StructField(\"seller_name\", DoubleType),\n StructField(\"mod_flag\", DoubleType),\n StructField(\"orig_interest_rate\", DoubleType),\n StructField(\"orig_upb\", IntegerType),\n StructField(\"orig_loan_term\", IntegerType),\n StructField(\"orig_ltv\", DoubleType),\n StructField(\"orig_cltv\", DoubleType),\n StructField(\"num_borrowers\", DoubleType),\n StructField(\"dti\", DoubleType),\n StructField(\"borrower_credit_score\", DoubleType),\n StructField(\"num_units\", IntegerType),\n StructField(\"zip\", IntegerType),\n StructField(\"mortgage_insurance_percent\", DoubleType),\n StructField(\"current_loan_delinquency_status\", IntegerType),\n StructField(\"current_actual_upb\", DoubleType),\n StructField(\"interest_rate\", DoubleType),\n StructField(\"loan_age\", DoubleType),\n StructField(\"msa\", DoubleType),\n StructField(\"non_interest_bearing_upb\", DoubleType),\n StructField(labelColName, IntegerType)))\n\nval featureNames = schema.filter(_.name != labelColName).map(_.name)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:46+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mlabelColName\u001b[0m: \u001b[1m\u001b[32mString\u001b[0m = delinquency_12\n\u001b[1m\u001b[34mschema\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.types.StructType\u001b[0m = StructType(StructField(orig_channel,DoubleType,true), StructField(first_home_buyer,DoubleType,true), StructField(loan_purpose,DoubleType,true), StructField(property_type,DoubleType,true), StructField(occupancy_status,DoubleType,true), StructField(property_state,DoubleType,true), StructField(product_type,DoubleType,true), StructField(relocation_mortgage_indicator,DoubleType,true), StructField(seller_name,DoubleType,true), StructField(mod_flag,DoubleType,true), StructField(orig_interest_rate,DoubleType,true), StructField(orig_upb,IntegerType,true), StructField(orig_loan_term,IntegerType,true), StructField(orig_ltv,DoubleType,true), StructField(orig_cltv...\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580282_-318188050", + "id": "20200712-043620_542099397", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:46+0000", + "dateFinished": "2020-07-13T02:18:46+0000", + "status": "FINISHED", + "$$hashKey": "object:11091" + }, + { + 
"text": "%md\n## Create a new spark session and load data\n\nA new spark session should be created to continue all the following spark operations.\n\nNOTE: in this notebook, the dependency jars have been loaded when installing toree kernel. Alternatively the jars can be loaded into notebook by [%AddJar magic](https://toree.incubator.apache.org/docs/current/user/faq/). However, there's one restriction for `%AddJar`: the jar uploaded can only be available when `AddJar` is called just after a new spark session is created. Do it as below:\n\n```scala\nimport org.apache.spark.sql.SparkSession\nval spark = SparkSession.builder().appName(\"mortgage-GPU\").getOrCreate\n%AddJar file:/data/libs/cudf-XXX-cuda10.jar\n%AddJar file:/data/libs/rapids-4-spark-XXX.jar\n%AddJar file:/data/libs/xgboost4j_3.0-XXX.jar\n%AddJar file:/data/libs/xgboost4j-spark_3.0-XXX.jar\n// ...\n```\n\n##### Please note the new jar \"rapids-4-spark-XXX.jar\" is only needed for GPU version, you can not add it to dependence list for CPU version.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:47+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Create a new spark session and load data

\n

A new Spark session should be created to run all of the following Spark operations.

\n

NOTE: in this notebook, the dependency jars were loaded when the Toree kernel was installed. Alternatively, the jars can be loaded into the notebook with the %AddJar magic. However, there is one restriction for %AddJar: an uploaded jar is only available when AddJar is called right after a new Spark session is created, as shown below:

\n
import org.apache.spark.sql.SparkSession\nval spark = SparkSession.builder().appName("mortgage-GPU").getOrCreate\n%AddJar file:/data/libs/cudf-XXX-cuda10.jar\n%AddJar file:/data/libs/rapids-4-spark-XXX.jar\n%AddJar file:/data/libs/xgboost4j_3.0-XXX.jar\n%AddJar file:/data/libs/xgboost4j-spark_3.0-XXX.jar\n// ...\n
\n
Please note the new jar “rapids-4-spark-XXX.jar” is only needed for the GPU version; do not add it to the dependency list for the CPU version.
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1107372761", + "id": "20200712-043620_889594738", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:47+0000", + "dateFinished": "2020-07-13T02:18:47+0000", + "status": "FINISHED", + "$$hashKey": "object:11092" + }, + { + "text": "// Build the spark session and data reader as usual\nval conf = new SparkConf()\nconf.set(\"spark.executor.instances\", \"20\")\nconf.set(\"spark.executor.cores\", \"7\")\nconf.set(\"spark.task.cpus\", \"7\")\nconf.set(\"spark.executor.memory\", \"24g\")\nconf.set(\"spark.rapids.memory.pinnedPool.size\", \"2G\")\nconf.set(\"spark.executor.memoryOverhead\", \"16G\")\nconf.set(\"spark.executor.extraJavaOptions\", \"-Dai.rapids.cudf.prefer-pinned=true\")\nconf.set(\"spark.locality.wait\", \"0s\")\nconf.set(\"spark.sql.files.maxPartitionBytes\", \"512m\")\nconf.set(\"spark.executor.resource.gpu.amount\", \"1\")\nconf.set(\"spark.task.resource.gpu.amount\", \"1\")\nconf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\nconf.set(\"spark.rapids.sql.hasNans\", \"false\")\nconf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\nconf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\nconf.set(\"spark.rapids.sql.variableFloatAgg.enabled\", \"true\")\nconf.set(\"spark.rapids.memory.gpu.pooling.enabled\", \"false\")\n// conf.set(\"spark.rapids.memory.gpu.allocFraction\", \"0.1\")\nval spark = SparkSession.builder.appName(\"mortgage-gpu\")\n .enableHiveSupport()\n .config(conf)\n .getOrCreate\nval reader = spark.read.option(\"header\", true).schema(schema)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:47+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mconf\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.SparkConf\u001b[0m = org.apache.spark.SparkConf@1aab0102\n\u001b[1m\u001b[34mspark\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.SparkSession\u001b[0m = org.apache.spark.sql.SparkSession@1239890f\n\u001b[1m\u001b[34mreader\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrameReader\u001b[0m = org.apache.spark.sql.DataFrameReader@7a9bb956\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-892064929", + "id": "20200712-043620_622739089", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:47+0000", + "dateFinished": "2020-07-13T02:18:53+0000", + "status": "FINISHED", + "$$hashKey": "object:11093" + }, + { + "text": "val trainSet = reader.parquet(trainPath)\nval evalSet = reader.parquet(evalPath)\nval transSet = reader.parquet(transPath)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:53+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": 
"\u001b[1m\u001b[34mtrainSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n\u001b[1m\u001b[34mevalSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n\u001b[1m\u001b[34mtransSet\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 26 more fields]\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_1108385932", + "id": "20200712-043620_562533619", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:53+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11094" + }, + { + "text": "%md\n## Set xgboost parameters and build a XGBoostClassifier\n\nFor CPU version, `num_workers` is recommended being equal to the number of CPU cores, while for GPU version, it should be set to the number of GPUs in Spark cluster.\n\nBesides the `tree_method` for CPU version is also different from that for GPU version. Now only \"gpu_hist\" is supported for training on GPU.\n\n```scala\n// difference in parameters\n \"num_workers\" -> 12,\n \"tree_method\" -> \"hist\",\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Set xgboost parameters and build a XGBoostClassifier

\n

For the CPU version, num_workers is recommended to equal the number of CPU cores, while for the GPU version it should be set to the number of GPUs in the Spark cluster.

\n

The tree_method for the CPU version also differs from the GPU version; currently only “gpu_hist” is supported for training on GPU.

\n
// difference in parameters\n  "num_workers" -> 12,\n  "tree_method" -> "hist",\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-880026833", + "id": "20200712-043620_1948369426", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11095" + }, + { + "text": "val commParamMap = Map(\n \"eta\" -> 0.1,\n \"gamma\" -> 0.1,\n \"missing\" -> 0.0,\n \"max_depth\" -> 10,\n \"max_leaves\" -> 256,\n \"objective\" -> \"binary:logistic\",\n \"grow_policy\" -> \"depthwise\",\n \"min_child_weight\" -> 30,\n \"lambda\" -> 1,\n \"scale_pos_weight\" -> 2,\n \"subsample\" -> 1,\n \"num_round\" -> 100)\n \nval xgbParamFinal = commParamMap ++ Map(\"tree_method\" -> \"gpu_hist\", \"num_workers\" -> 20, \"nthread\" -> 7)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mcommParamMap\u001b[0m: \u001b[1m\u001b[32mscala.collection.immutable.Map[String,Any]\u001b[0m = Map(min_child_weight -> 30, grow_policy -> depthwise, scale_pos_weight -> 2, subsample -> 1, lambda -> 1, max_depth -> 10, objective -> binary:logistic, num_round -> 100, missing -> 0.0, eta -> 0.1, max_leaves -> 256, gamma -> 0.1)\n\u001b[1m\u001b[34mxgbParamFinal\u001b[0m: \u001b[1m\u001b[32mscala.collection.immutable.Map[String,Any]\u001b[0m = Map(min_child_weight -> 30, grow_policy -> depthwise, scale_pos_weight -> 2, num_workers -> 20, subsample -> 1, lambda -> 1, max_depth -> 10, objective -> binary:logistic, num_round -> 100, missing -> 0.0, tree_method -> gpu_hist, eta -> 0.1, max_leaves -> 256, gamma -> 0.1, nthread -> 7)\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_312126552", + "id": "20200712-043620_726034129", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11096" + }, + { + "text": "%md\nHere comes the only API difference,`setFeaturesCol` in CPU version vs `setFeaturesCols` in GPU version.\n\nIn previous block, it said that CPU version needs `VectorAssembler` to assemble multiple feature columns into one column, because `setFeaturesCol` only accepts one feature column with the type of `vector`.\n\nBut `setFeaturesCols` supports multiple columns directly, so set the feautres column names directly to `XGBoostClassifier`. \n\nCPU version:\n\n```scala\nval xgbClassifier = new XGBoostClassifier(paramMap)\n .setLabelCol(labelName)\n .setFeaturesCol(\"features\")\n```", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:54+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Here comes the only API difference: setFeaturesCol in the CPU version vs setFeaturesCols in the GPU version.

\n

As noted in the previous block, the CPU version needs VectorAssembler to assemble multiple feature columns into a single column, because setFeaturesCol accepts only one feature column of type vector.

\n

But setFeaturesCols supports multiple columns directly, so the feature column names can be set directly on XGBoostClassifier.

\n

CPU version:

\n
val xgbClassifier  = new XGBoostClassifier(paramMap)\n  .setLabelCol(labelName)\n  .setFeaturesCol("features")\n
\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_1889609272", + "id": "20200712-043620_531120952", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:54+0000", + "dateFinished": "2020-07-13T02:18:54+0000", + "status": "FINISHED", + "$$hashKey": "object:11097" + }, + { + "text": "val xgbClassifier = new XGBoostClassifier(xgbParamFinal)\n .setLabelCol(labelColName)\n // === diff ===\n .setFeaturesCols(featureNames)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mxgbClassifier\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassifier\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1143522441", + "id": "20200712-043620_427072123", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11098" + }, + { + "text": "%md\n## Benchmark and train\nThe object `benchmark` is used to compute the elapsed time of some operations.\n\nTraining with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n\n* Call API `setEvalSets` after initializing an XGBoostClassifier\n\n```scala\nxgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n\n```\n\n* Use parameter `eval_sets` when initializing an XGBoostClassifier\n\n```scala\nval paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\nval xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n```\n\nHere chooses the API way to set evaluation sets.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Benchmark and train

\n

The object benchmark is used to compute the elapsed time of some operations.

\n
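As a quick usage sketch (not part of the original notebook), the benchmark helper defined a few cells below takes a block by name and returns both the block's result and the elapsed seconds; the count here is only a placeholder operation:

```scala
// benchmark("phase") { block } times the block and returns (result, seconds).
val (rowCount, elapsedSeconds) = benchmark("count trainSet") {
  trainSet.count()
}
println(s"Counted $rowCount rows in $elapsedSeconds s")
```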

Training with evaluation sets is also supported, in the same two ways as the CPU version:

\n
    \n
  • Call API setEvalSets after initializing an XGBoostClassifier
  • \n
\n
xgbClassifier.setEvalSets(Map("eval" -> evalSet))\n\n
\n
    \n
  • Use parameter eval_sets when initializing an XGBoostClassifier
  • \n
\n
val paramMapWithEval = paramMap + ("eval_sets" -> Map("eval" -> evalSet))\nval xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n
\n

Here the API approach is used to set the evaluation sets.

\n\n
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-268123036", + "id": "20200712-043620_1915241764", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11099" + }, + { + "text": "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mres86\u001b[0m: \u001b[1m\u001b[32mxgbClassifier.type\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580283_-1163292247", + "id": "20200712-043620_324775014", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11100" + }, + { + "text": "def benchmark[R](phase: String)(block: => R): (R, Float) = {\n val t0 = System.currentTimeMillis\n val result = block // call-by-name\n val t1 = System.currentTimeMillis\n println(\"Elapsed time [\" + phase + \"]: \" + ((t1 - t0).toFloat / 1000) + \"s\")\n (result, (t1 - t0).toFloat / 1000)\n}", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:55+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mbenchmark\u001b[0m: \u001b[1m\u001b[32m[R](phase: String)(block: => R)(R, Float)\u001b[0m\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-196014933", + "id": "20200712-043620_1233757982", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:55+0000", + "dateFinished": "2020-07-13T02:18:55+0000", + "status": "FINISHED", + "$$hashKey": "object:11101" + }, + { + "text": "%md\nCPU version reqires an extra step before fitting data to classifier, using `VectorAssembler` to assemble all feature columns into one column. The following code snip shows how to do the vectorizing.\n\n```scala\nobject Vectorize {\n def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {\n val toFloat = df.schema.map(f => col(f.name).cast(FloatType))\n new VectorAssembler()\n .setInputCols(featureNames.toArray)\n .setOutputCol(\"features\")\n .transform(df.select(toFloat:_*))\n .select(col(\"features\"), col(labelName))\n }\n}\n\ntrainSet = Vectorize(trainSet, featureCols, labelName)\nevalSet = Vectorize(evalSet, featureCols, labelName)\ntransSet = Vectorize(transSet, featureCols, labelName)\n\n```\n\n`VectorAssembler` is not needed for GPU version. 
Just fit the loaded data directly to XGBoostClassifier.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:56+0000", + "config": { + "editorMode": "ace/mode/text", + "editorHide": false, + "editorSetting": { + "language": "text", + "editOnDblClick": false + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "tableHide": false + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

The CPU version requires an extra step before fitting the data to the classifier: VectorAssembler is used to assemble all feature columns into one column. The following code snippet shows how to do the vectorizing.

\n
object Vectorize {\n  def apply(df: DataFrame, featureNames: Seq[String], labelName: String): DataFrame = {\n    val toFloat = df.schema.map(f => col(f.name).cast(FloatType))\n    new VectorAssembler()\n      .setInputCols(featureNames.toArray)\n      .setOutputCol("features")\n      .transform(df.select(toFloat:_*))\n      .select(col("features"), col(labelName))\n  }\n}\n\ntrainSet = Vectorize(trainSet, featureCols, labelName)\nevalSet = Vectorize(evalSet, featureCols, labelName)\ntransSet = Vectorize(transSet, featureCols, labelName)\n\n
\n

VectorAssembler is not needed for the GPU version; just fit the loaded data directly to XGBoostClassifier.

\n\n
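For contrast, a minimal sketch of the GPU path (assuming the trainSet loaded earlier still carries the individual feature columns); the actual training cell below wraps the same fit call in the benchmark helper:

```scala
// GPU path sketch: no VectorAssembler step, feature columns are passed by name.
val gpuModelSketch = new XGBoostClassifier(xgbParamFinal)
  .setLabelCol(labelColName)
  .setFeaturesCols(featureNames)
  .fit(trainSet)
```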
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-1513881670", + "id": "20200712-043620_618156060", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:56+0000", + "dateFinished": "2020-07-13T02:18:56+0000", + "status": "FINISHED", + "$$hashKey": "object:11102" + }, + { + "text": "// Start training\nprintln(\"\\n------ Training ------\")\nval (xgbClassificationModel, _) = benchmark(\"train\") {\n xgbClassifier.fit(trainSet)\n}", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:18:56+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-695049679", + "id": "20200712-043620_1418358219", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:18:56+0000", + "dateFinished": "2020-07-13T02:26:51+0000", + "status": "FINISHED", + "$$hashKey": "object:11103", + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\n------ Training ------\nTracker started, with env={DMLC_NUM_SERVER=0, DMLC_TRACKER_URI=10.164.0.17, DMLC_TRACKER_PORT=9091, DMLC_NUM_WORKER=20}\nElapsed time [train]: 475.008s\n\u001b[1m\u001b[34mxgbClassificationModel\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel\u001b[0m = xgbc_2ce07ee0b6cb\n" + } + ] + } + }, + { + "text": "%md\n## Transformation and evaluation\nHere uses `transSet` to evaluate our model and prints some useful columns to show our prediction result. After that `MulticlassClassificationEvaluator` is used to calculate an overall accuracy of our predictions.", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:26:51+0000", + "config": { + "editorMode": "ace/mode/markdown", + "editorHide": true, + "editorSetting": { + "language": "markdown", + "editOnDblClick": true + }, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {} + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "HTML", + "data": "
\n

Transformation and evaluation

\n

Here transSet is used to evaluate our model, and a few useful columns are printed to show the prediction results. After that, MulticlassClassificationEvaluator is used to calculate an overall accuracy for our predictions.

\n\n
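One caveat: MulticlassClassificationEvaluator defaults to the f1 metric (visible as metricName=f1 in the cell output below), so the value printed as accuracy is, strictly speaking, an F1 score. A hedged sketch of requesting plain accuracy instead, reusing the results DataFrame produced in the next cell:

```scala
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

// Explicitly request the "accuracy" metric instead of the default "f1".
val accuracyEvaluator = new MulticlassClassificationEvaluator()
  .setLabelCol(labelColName)
  .setMetricName("accuracy")
val plainAccuracy = accuracyEvaluator.evaluate(results)
```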
" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_1090201866", + "id": "20200712-043620_470610364", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:26:51+0000", + "dateFinished": "2020-07-13T02:26:51+0000", + "status": "FINISHED", + "$$hashKey": "object:11104" + }, + { + "text": "println(\"\\n------ Transforming ------\")\nval (results, _) = benchmark(\"transform\") {\n val ret = xgbClassificationModel.transform(transSet).cache()\n ret\n}\nz.show(results.select(\"orig_channel\", labelColName,\"rawPrediction\",\"probability\",\"prediction\").limit(10))\n\nprintln(\"\\n------Accuracy of Evaluation------\")\nval evaluator = new MulticlassClassificationEvaluator().setLabelCol(labelColName)\nval accuracy = evaluator.evaluate(results)\nprintln(accuracy)", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:26:51+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": { + "1": { + "graph": { + "mode": "table", + "height": 300, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "orig_channel": "string", + "delinquency_12": "string", + "rawPrediction": "string", + "probability": "string", + "prediction": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + } + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "\n------ Transforming ------\nElapsed time [transform]: 0.143s\n" + }, + { + "type": "TABLE", + "data": 
"orig_channel\tdelinquency_12\trawPrediction\tprobability\tprediction\n4.9E-324\t0\t[5.001231670379639,-5.001231670379639]\t[0.9933153325691819,0.006684667430818081]\t0.0\n1.0E-323\t0\t[6.777693748474121,-6.777693748474121]\t[0.9988623971585184,0.0011376028414815664]\t0.0\n4.9E-324\t0\t[7.609184741973877,-7.609184741973877]\t[0.999504369799979,4.956302000209689E-4]\t0.0\n1.0E-323\t0\t[8.442628860473633,-8.442628860473633]\t[0.9997845634934492,2.1543650655075908E-4]\t0.0\n1.0E-323\t0\t[8.08891773223877,-8.08891773223877]\t[0.9996931724308524,3.068275691475719E-4]\t0.0\n4.9E-324\t0\t[8.863614082336426,-8.863614082336426]\t[0.999858577051782,1.4142294821795076E-4]\t0.0\n1.0E-323\t0\t[8.85793399810791,-8.85793399810791]\t[0.9998577715887222,1.422284112777561E-4]\t0.0\n4.9E-324\t0\t[7.265506744384766,-7.265506744384766]\t[0.9993012417689897,6.98758231010288E-4]\t0.0\n4.9E-324\t0\t[5.615269184112549,-5.615269184112549]\t[0.9963713854085654,0.003628614591434598]\t0.0\n4.9E-324\t0\t[6.023037910461426,-6.023037910461426]\t[0.997583553660661,0.002416446339339018]\t0.0\n" + }, + { + "type": "TEXT", + "data": "\n------Accuracy of Evaluation------\n0.9982550045083602\n\u001b[1m\u001b[34mresults\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.Dataset[org.apache.spark.sql.Row]\u001b[0m = [orig_channel: double, first_home_buyer: double ... 29 more fields]\n\u001b[1m\u001b[34mevaluator\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.ml.evaluation.MulticlassClassificationEvaluator\u001b[0m = MulticlassClassificationEvaluator: uid=mcEval_62ee3ceb950d, metricName=f1, metricLabel=0.0, beta=1.0, eps=1.0E-15\n\u001b[1m\u001b[34maccuracy\u001b[0m: \u001b[1m\u001b[32mDouble\u001b[0m = 0.9982550045083602\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_-218421974", + "id": "20200712-043620_775095654", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:26:51+0000", + "dateFinished": "2020-07-13T02:27:20+0000", + "status": "FINISHED", + "$$hashKey": "object:11105" + }, + { + "title": "Example to save/load the model, predict with the model", + "text": "xgbClassificationModel.write.overwrite.save(modelPath)\n\nval modelFromDisk = XGBoostClassificationModel.load(modelPath)\n\nval (results2, _) = benchmark(\"transform2\") {\n modelFromDisk.transform(transSet)\n}\nz.show(results2.limit(5))", + "user": "anonymous", + "dateUpdated": "2020-07-13T02:27:20+0000", + "config": { + "editorMode": "ace/mode/scala", + "editorHide": false, + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": { + "1": { + "graph": { + "mode": "table", + "height": 300, + "optionOpen": false, + "setting": { + "table": { + "tableGridState": {}, + "tableColumnTypeState": { + "names": { + "orig_channel": "string", + "first_home_buyer": "string", + "loan_purpose": "string", + "property_type": "string", + "occupancy_status": "string", + "property_state": "string", + "product_type": "string", + "relocation_mortgage_indicator": "string", + "seller_name": "string", + "mod_flag": "string", + "orig_interest_rate": "string", + "orig_upb": "string", + "orig_loan_term": "string", + "orig_ltv": "string", + "orig_cltv": "string", + "num_borrowers": "string", + "dti": "string", + "borrower_credit_score": "string", + "num_units": "string", + "zip": "string", + "mortgage_insurance_percent": "string", + "current_loan_delinquency_status": "string", + "current_actual_upb": "string", + "interest_rate": "string", + "loan_age": "string", + "msa": "string", + 
"non_interest_bearing_upb": "string", + "delinquency_12": "string", + "rawPrediction": "string", + "probability": "string", + "prediction": "string" + }, + "updated": false + }, + "tableOptionSpecHash": "[{\"name\":\"useFilter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable filter for columns\"},{\"name\":\"showPagination\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable pagination for better navigation\"},{\"name\":\"showAggregationFooter\",\"valueType\":\"boolean\",\"defaultValue\":false,\"widget\":\"checkbox\",\"description\":\"Enable a footer for displaying aggregated values\"}]", + "tableOptionValue": { + "useFilter": false, + "showPagination": false, + "showAggregationFooter": false + }, + "updated": false, + "initialized": false + } + }, + "commonSetting": {} + } + } + }, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionSupport": true, + "completionKey": "TAB" + }, + "title": true + }, + "settings": { + "params": {}, + "forms": {} + }, + "results": { + "code": "SUCCESS", + "msg": [ + { + "type": "TEXT", + "data": "Elapsed time [transform2]: 0.058s\n" + }, + { + "type": "TABLE", + "data": "orig_channel\tfirst_home_buyer\tloan_purpose\tproperty_type\toccupancy_status\tproperty_state\tproduct_type\trelocation_mortgage_indicator\tseller_name\tmod_flag\torig_interest_rate\torig_upb\torig_loan_term\torig_ltv\torig_cltv\tnum_borrowers\tdti\tborrower_credit_score\tnum_units\tzip\tmortgage_insurance_percent\tcurrent_loan_delinquency_status\tcurrent_actual_upb\tinterest_rate\tloan_age\tmsa\tnon_interest_bearing_upb\tdelinquency_12\trawPrediction\tprobability\tprediction\n4.9E-324\t4.9E-324\t1.0E-323\t4.9E-324\t1.0E-323\t1.24E-322\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t2.75\t278000\t120\t56.0\t56.0\t1.0\t46.0\t624.0\t1\t295\t0.0\t0\t148441.15\t2.75\t13.0\t34820.0\t0.0\t0\t[5.001231670379639,-5.001231670379639]\t[0.9933153325691819,0.006684667430818081]\t0.0\n1.0E-323\t4.9E-324\t1.5E-323\t4.9E-324\t4.9E-324\t4.9E-324\tnull\t4.9E-324\t6.9E-323\t4.9E-324\t4.25\t579000\t360\t72.0\t72.0\t2.0\t44.0\t714.0\t1\t949\t0.0\t0\t568406.57\t4.25\t13.0\t41860.0\t0.0\t0\t[6.777693748474121,-6.777693748474121]\t[0.9988623971585184,0.0011376028414815664]\t0.0\n4.9E-324\t4.9E-324\t1.5E-323\t4.9E-324\t4.9E-324\t4.4E-323\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t4.0\t240000\t360\t80.0\t80.0\t1.0\t18.0\t820.0\t1\t282\t0.0\t0\t236132.18\t4.0\t10.0\t16740.0\t0.0\t0\t[7.609184741973877,-7.609184741973877]\t[0.999504369799979,4.956302000209689E-4]\t0.0\n1.0E-323\t4.9E-324\t1.0E-323\t4.9E-324\t4.9E-324\t1.04E-322\tnull\t4.9E-324\t3.0E-323\t4.9E-324\t3.0\t241000\t180\t44.0\t44.0\t2.0\t44.0\t787.0\t1\t650\t0.0\t0\t230092.59\t3.0\t9.0\t0.0\t0.0\t0\t[8.442628860473633,-8.442628860473633]\t[0.9997845634934492,2.1543650655075908E-4]\t0.0\n1.0E-323\t4.9E-324\t4.9E-324\t1.5E-323\t4.9E-324\t1.0E-323\tnull\t4.9E-324\t4.9E-324\t4.9E-324\t4.25\t177000\t360\t75.0\t75.0\t2.0\t26.0\t792.0\t1\t787\t0.0\t0\t172387.22\t4.25\t18.0\t12420.0\t0.0\t0\t[8.08891773223877,-8.08891773223877]\t[0.9996931724308524,3.068275691475719E-4]\t0.0\n" + }, + { + "type": "TEXT", + "data": "\u001b[1m\u001b[34mmodelFromDisk\u001b[0m: \u001b[1m\u001b[32mml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel\u001b[0m = xgbc_2ce07ee0b6cb\n\u001b[1m\u001b[34mresults2\u001b[0m: \u001b[1m\u001b[32morg.apache.spark.sql.DataFrame\u001b[0m = [orig_channel: double, first_home_buyer: double ... 
29 more fields]\n" + } + ] + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528580284_1907963406", + "id": "20200712-043620_1435219490", + "dateCreated": "2020-07-12T04:36:20+0000", + "dateStarted": "2020-07-13T02:27:20+0000", + "dateFinished": "2020-07-13T02:27:23+0000", + "status": "FINISHED", + "$$hashKey": "object:11106" + }, + { + "user": "anonymous", + "dateUpdated": "2020-07-12T04:50:45+0000", + "config": { + "colWidth": 12, + "fontSize": 9, + "enabled": true, + "results": {}, + "editorSetting": { + "language": "scala", + "editOnDblClick": false, + "completionKey": "TAB", + "completionSupport": true + }, + "editorMode": "ace/mode/scala" + }, + "settings": { + "params": {}, + "forms": {} + }, + "apps": [], + "runtimeInfos": {}, + "progressUpdateIntervalMs": 500, + "jobName": "paragraph_1594528930033_-558128424", + "id": "paragraph_1594528930033_-558128424", + "dateCreated": "2020-07-12T04:42:10+0000", + "status": "FINISHED", + "$$hashKey": "object:11107" + } + ], + "name": "mortgage-gpu-scala", + "id": "2FCHJHDT3", + "defaultInterpreterGroup": "spark", + "version": "0.9.0-preview1", + "noteParams": {}, + "noteForms": {}, + "angularObjects": {}, + "config": { + "isZeppelinNotebookCronEnable": false, + "looknfeel": "default", + "personalizedMode": "false" + }, + "info": { + "isRunning": true + }, + "path": "/mortgage-gpu-scala" +} \ No newline at end of file diff --git a/docs/demo/gpu-mortgage_accelerated.ipynb b/docs/demo/gpu-mortgage_accelerated.ipynb new file mode 100644 index 00000000000..2ce911b3a6b --- /dev/null +++ b/docs/demo/gpu-mortgage_accelerated.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"code","source":["%sh\n \nwget http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000.tgz -P /Users//\n \nmkdir -p /dbfs/FileStore/tables/mortgage\nmkdir -p /dbfs/FileStore/tables/mortgage_parquet_gpu/perf\nmkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/acq\nmkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/output\n \ntar xfvz /Users//mortgage_2000.tgz --directory /dbfs/FileStore/tables/mortgage\n"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["import time\nfrom pyspark import broadcast\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import *\nfrom pyspark.sql.types import *\n\ndef _get_quarter_from_csv_file_name():\n return substring_index(substring_index(input_file_name(), '.', 1), '_', -1)\n\n_csv_perf_schema = StructType([\n StructField('loan_id', LongType()),\n StructField('monthly_reporting_period', StringType()),\n StructField('servicer', StringType()),\n StructField('interest_rate', DoubleType()),\n StructField('current_actual_upb', DoubleType()),\n StructField('loan_age', DoubleType()),\n StructField('remaining_months_to_legal_maturity', DoubleType()),\n StructField('adj_remaining_months_to_maturity', DoubleType()),\n StructField('maturity_date', StringType()),\n StructField('msa', DoubleType()),\n StructField('current_loan_delinquency_status', IntegerType()),\n StructField('mod_flag', StringType()),\n StructField('zero_balance_code', StringType()),\n StructField('zero_balance_effective_date', StringType()),\n StructField('last_paid_installment_date', StringType()),\n StructField('foreclosed_after', StringType()),\n StructField('disposition_date', StringType()),\n StructField('foreclosure_costs', DoubleType()),\n StructField('prop_preservation_and_repair_costs', DoubleType()),\n StructField('asset_recovery_costs', DoubleType()),\n 
StructField('misc_holding_expenses', DoubleType()),\n StructField('holding_taxes', DoubleType()),\n StructField('net_sale_proceeds', DoubleType()),\n StructField('credit_enhancement_proceeds', DoubleType()),\n StructField('repurchase_make_whole_proceeds', StringType()),\n StructField('other_foreclosure_proceeds', DoubleType()),\n StructField('non_interest_bearing_upb', DoubleType()),\n StructField('principal_forgiveness_upb', StringType()),\n StructField('repurchase_make_whole_proceeds_flag', StringType()),\n StructField('foreclosure_principal_write_off_amount', StringType()),\n StructField('servicing_activity_indicator', StringType())])\n_csv_acq_schema = StructType([\n StructField('loan_id', LongType()),\n StructField('orig_channel', StringType()),\n StructField('seller_name', StringType()),\n StructField('orig_interest_rate', DoubleType()),\n StructField('orig_upb', IntegerType()),\n StructField('orig_loan_term', IntegerType()),\n StructField('orig_date', StringType()),\n StructField('first_pay_date', StringType()),\n StructField('orig_ltv', DoubleType()),\n StructField('orig_cltv', DoubleType()),\n StructField('num_borrowers', DoubleType()),\n StructField('dti', DoubleType()),\n StructField('borrower_credit_score', DoubleType()),\n StructField('first_home_buyer', StringType()),\n StructField('loan_purpose', StringType()),\n StructField('property_type', StringType()),\n StructField('num_units', IntegerType()),\n StructField('occupancy_status', StringType()),\n StructField('property_state', StringType()),\n StructField('zip', IntegerType()),\n StructField('mortgage_insurance_percent', DoubleType()),\n StructField('product_type', StringType()),\n StructField('coborrow_credit_score', DoubleType()),\n StructField('mortgage_insurance_type', DoubleType()),\n StructField('relocation_mortgage_indicator', StringType())])\n\ndef read_perf_csv(spark, path):\n return spark.read.format('csv') \\\n .option('nullValue', '') \\\n .option('header', 'false') \\\n .option('delimiter', '|') \\\n .schema(_csv_perf_schema) \\\n .load(path) \\\n .withColumn('quarter', _get_quarter_from_csv_file_name())\n\ndef read_acq_csv(spark, path):\n return spark.read.format('csv') \\\n .option('nullValue', '') \\\n .option('header', 'false') \\\n .option('delimiter', '|') \\\n .schema(_csv_acq_schema) \\\n .load(path) \\\n .withColumn('quarter', _get_quarter_from_csv_file_name())\n\ndef _parse_dates(perf):\n return perf \\\n .withColumn('monthly_reporting_period', to_date(col('monthly_reporting_period'), 'MM/dd/yyyy')) \\\n .withColumn('monthly_reporting_period_month', month(col('monthly_reporting_period'))) \\\n .withColumn('monthly_reporting_period_year', year(col('monthly_reporting_period'))) \\\n .withColumn('monthly_reporting_period_day', dayofmonth(col('monthly_reporting_period'))) \\\n .withColumn('last_paid_installment_date', to_date(col('last_paid_installment_date'), 'MM/dd/yyyy')) \\\n .withColumn('foreclosed_after', to_date(col('foreclosed_after'), 'MM/dd/yyyy')) \\\n .withColumn('disposition_date', to_date(col('disposition_date'), 'MM/dd/yyyy')) \\\n .withColumn('maturity_date', to_date(col('maturity_date'), 'MM/yyyy')) \\\n .withColumn('zero_balance_effective_date', to_date(col('zero_balance_effective_date'), 'MM/yyyy'))\n\ndef _create_perf_deliquency(spark, perf):\n aggDF = perf.select(\n col(\"quarter\"),\n col(\"loan_id\"),\n col(\"current_loan_delinquency_status\"),\n when(col(\"current_loan_delinquency_status\") >= 1, col(\"monthly_reporting_period\")).alias(\"delinquency_30\"),\n 
when(col(\"current_loan_delinquency_status\") >= 3, col(\"monthly_reporting_period\")).alias(\"delinquency_90\"),\n when(col(\"current_loan_delinquency_status\") >= 6, col(\"monthly_reporting_period\")).alias(\"delinquency_180\")) \\\n .groupBy(\"quarter\", \"loan_id\") \\\n .agg(\n max(\"current_loan_delinquency_status\").alias(\"delinquency_12\"),\n min(\"delinquency_30\").alias(\"delinquency_30\"),\n min(\"delinquency_90\").alias(\"delinquency_90\"),\n min(\"delinquency_180\").alias(\"delinquency_180\")) \\\n .select(\n col(\"quarter\"),\n col(\"loan_id\"),\n (col(\"delinquency_12\") >= 1).alias(\"ever_30\"),\n (col(\"delinquency_12\") >= 3).alias(\"ever_90\"),\n (col(\"delinquency_12\") >= 6).alias(\"ever_180\"),\n col(\"delinquency_30\"),\n col(\"delinquency_90\"),\n col(\"delinquency_180\"))\n joinedDf = perf \\\n .withColumnRenamed(\"monthly_reporting_period\", \"timestamp\") \\\n .withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n .withColumnRenamed(\"current_loan_delinquency_status\", \"delinquency_12\") \\\n .withColumnRenamed(\"current_actual_upb\", \"upb_12\") \\\n .select(\"quarter\", \"loan_id\", \"timestamp\", \"delinquency_12\", \"upb_12\", \"timestamp_month\", \"timestamp_year\") \\\n .join(aggDF, [\"loan_id\", \"quarter\"], \"left_outer\")\n\n # calculate the 12 month delinquency and upb values\n months = 12\n monthArray = [lit(x) for x in range(0, 12)]\n # explode on a small amount of data is actually slightly more efficient than a cross join\n testDf = joinedDf \\\n .withColumn(\"month_y\", explode(array(monthArray))) \\\n .select(\n col(\"quarter\"),\n floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000) / months).alias(\"josh_mody\"),\n floor(((col(\"timestamp_year\") * 12 + col(\"timestamp_month\")) - 24000 - col(\"month_y\")) / months).alias(\"josh_mody_n\"),\n col(\"ever_30\"),\n col(\"ever_90\"),\n col(\"ever_180\"),\n col(\"delinquency_30\"),\n col(\"delinquency_90\"),\n col(\"delinquency_180\"),\n col(\"loan_id\"),\n col(\"month_y\"),\n col(\"delinquency_12\"),\n col(\"upb_12\")) \\\n .groupBy(\"quarter\", \"loan_id\", \"josh_mody_n\", \"ever_30\", \"ever_90\", \"ever_180\", \"delinquency_30\", \"delinquency_90\", \"delinquency_180\", \"month_y\") \\\n .agg(max(\"delinquency_12\").alias(\"delinquency_12\"), min(\"upb_12\").alias(\"upb_12\")) \\\n .withColumn(\"timestamp_year\", floor((lit(24000) + (col(\"josh_mody_n\") * lit(months)) + (col(\"month_y\") - 1)) / lit(12))) \\\n .selectExpr('*', 'pmod(24000 + (josh_mody_n * {}) + month_y, 12) as timestamp_month_tmp'.format(months)) \\\n .withColumn(\"timestamp_month\", when(col(\"timestamp_month_tmp\") == lit(0), lit(12)).otherwise(col(\"timestamp_month_tmp\"))) \\\n .withColumn(\"delinquency_12\", ((col(\"delinquency_12\") > 3).cast(\"int\") + (col(\"upb_12\") == 0).cast(\"int\")).alias(\"delinquency_12\")) \\\n .drop(\"timestamp_month_tmp\", \"josh_mody_n\", \"month_y\")\n\n return perf.withColumnRenamed(\"monthly_reporting_period_month\", \"timestamp_month\") \\\n .withColumnRenamed(\"monthly_reporting_period_year\", \"timestamp_year\") \\\n .join(testDf, [\"quarter\", \"loan_id\", \"timestamp_year\", \"timestamp_month\"], \"left\") \\\n .drop(\"timestamp_year\", \"timestamp_month\")\n\n_name_mapping = [\n (\"WITMER FUNDING, LLC\", \"Witmer\"),\n (\"WELLS FARGO CREDIT RISK TRANSFER SECURITIES TRUST 2015\", \"Wells Fargo\"),\n (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n (\"WELLS 
FARGO BANK, N.A.\" , \"Wells Fargo\"),\n (\"WELLS FARGO BANK, NA\" , \"Wells Fargo\"),\n (\"USAA FEDERAL SAVINGS BANK\" , \"USAA\"),\n (\"UNITED SHORE FINANCIAL SERVICES, LLC D\\\\/B\\\\/A UNITED WHOLESALE MORTGAGE\" , \"United Seq(e\"),\n (\"U.S. BANK N.A.\" , \"US Bank\"),\n (\"SUNTRUST MORTGAGE INC.\" , \"Suntrust\"),\n (\"STONEGATE MORTGAGE CORPORATION\" , \"Stonegate Mortgage\"),\n (\"STEARNS LENDING, LLC\" , \"Stearns Lending\"),\n (\"STEARNS LENDING, INC.\" , \"Stearns Lending\"),\n (\"SIERRA PACIFIC MORTGAGE COMPANY, INC.\" , \"Sierra Pacific Mortgage\"),\n (\"REGIONS BANK\" , \"Regions\"),\n (\"RBC MORTGAGE COMPANY\" , \"RBC\"),\n (\"QUICKEN LOANS INC.\" , \"Quicken Loans\"),\n (\"PULTE MORTGAGE, L.L.C.\" , \"Pulte Mortgage\"),\n (\"PROVIDENT FUNDING ASSOCIATES, L.P.\" , \"Provident Funding\"),\n (\"PROSPECT MORTGAGE, LLC\" , \"Prospect Mortgage\"),\n (\"PRINCIPAL RESIDENTIAL MORTGAGE CAPITAL RESOURCES, LLC\" , \"Principal Residential\"),\n (\"PNC BANK, N.A.\" , \"PNC\"),\n (\"PMT CREDIT RISK TRANSFER TRUST 2015-2\" , \"PennyMac\"),\n (\"PHH MORTGAGE CORPORATION\" , \"PHH Mortgage\"),\n (\"PENNYMAC CORP.\" , \"PennyMac\"),\n (\"PACIFIC UNION FINANCIAL, LLC\" , \"Other\"),\n (\"OTHER\" , \"Other\"),\n (\"NYCB MORTGAGE COMPANY, LLC\" , \"NYCB\"),\n (\"NEW YORK COMMUNITY BANK\" , \"NYCB\"),\n (\"NETBANK FUNDING SERVICES\" , \"Netbank\"),\n (\"NATIONSTAR MORTGAGE, LLC\" , \"Nationstar Mortgage\"),\n (\"METLIFE BANK, NA\" , \"Metlife\"),\n (\"LOANDEPOT.COM, LLC\" , \"LoanDepot.com\"),\n (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2015-1\" , \"JP Morgan Chase\"),\n (\"J.P. MORGAN MADISON AVENUE SECURITIES TRUST, SERIES 2014-1\" , \"JP Morgan Chase\"),\n (\"JPMORGAN CHASE BANK, NATIONAL ASSOCIATION\" , \"JP Morgan Chase\"),\n (\"JPMORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n (\"JP MORGAN CHASE BANK, NA\" , \"JP Morgan Chase\"),\n (\"IRWIN MORTGAGE, CORPORATION\" , \"Irwin Mortgage\"),\n (\"IMPAC MORTGAGE CORP.\" , \"Impac Mortgage\"),\n (\"HSBC BANK USA, NATIONAL ASSOCIATION\" , \"HSBC\"),\n (\"HOMEWARD RESIDENTIAL, INC.\" , \"Homeward Mortgage\"),\n (\"HOMESTREET BANK\" , \"Other\"),\n (\"HOMEBRIDGE FINANCIAL SERVICES, INC.\" , \"HomeBridge\"),\n (\"HARWOOD STREET FUNDING I, LLC\" , \"Harwood Mortgage\"),\n (\"GUILD MORTGAGE COMPANY\" , \"Guild Mortgage\"),\n (\"GMAC MORTGAGE, LLC (USAA FEDERAL SAVINGS BANK)\" , \"GMAC\"),\n (\"GMAC MORTGAGE, LLC\" , \"GMAC\"),\n (\"GMAC (USAA)\" , \"GMAC\"),\n (\"FREMONT BANK\" , \"Fremont Bank\"),\n (\"FREEDOM MORTGAGE CORP.\" , \"Freedom Mortgage\"),\n (\"FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"Franklin America\"),\n (\"FLEET NATIONAL BANK\" , \"Fleet National\"),\n (\"FLAGSTAR CAPITAL MARKETS CORPORATION\" , \"Flagstar Bank\"),\n (\"FLAGSTAR BANK, FSB\" , \"Flagstar Bank\"),\n (\"FIRST TENNESSEE BANK NATIONAL ASSOCIATION\" , \"Other\"),\n (\"FIFTH THIRD BANK\" , \"Fifth Third Bank\"),\n (\"FEDERAL HOME LOAN BANK OF CHICAGO\" , \"Fedral Home of Chicago\"),\n (\"FDIC, RECEIVER, INDYMAC FEDERAL BANK FSB\" , \"FDIC\"),\n (\"DOWNEY SAVINGS AND LOAN ASSOCIATION, F.A.\" , \"Downey Mortgage\"),\n (\"DITECH FINANCIAL LLC\" , \"Ditech\"),\n (\"CITIMORTGAGE, INC.\" , \"Citi\"),\n (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERFIRST MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n (\"CHICAGO MORTGAGE SOLUTIONS DBA INTERBANK MORTGAGE COMPANY\" , \"Chicago Mortgage\"),\n (\"CHASE HOME FINANCE, LLC\" , \"JP Morgan Chase\"),\n (\"CHASE HOME FINANCE FRANKLIN AMERICAN MORTGAGE COMPANY\" , \"JP Morgan Chase\"),\n (\"CHASE HOME FINANCE (CIE 1)\" , \"JP Morgan 
Chase\"),\n (\"CHASE HOME FINANCE\" , \"JP Morgan Chase\"),\n (\"CASHCALL, INC.\" , \"CashCall\"),\n (\"CAPITAL ONE, NATIONAL ASSOCIATION\" , \"Capital One\"),\n (\"CALIBER HOME LOANS, INC.\" , \"Caliber Funding\"),\n (\"BISHOPS GATE RESIDENTIAL MORTGAGE TRUST\" , \"Bishops Gate Mortgage\"),\n (\"BANK OF AMERICA, N.A.\" , \"Bank of America\"),\n (\"AMTRUST BANK\" , \"AmTrust\"),\n (\"AMERISAVE MORTGAGE CORPORATION\" , \"Amerisave\"),\n (\"AMERIHOME MORTGAGE COMPANY, LLC\" , \"AmeriHome Mortgage\"),\n (\"ALLY BANK\" , \"Ally Bank\"),\n (\"ACADEMY MORTGAGE CORPORATION\" , \"Academy Mortgage\"),\n (\"NO CASH-OUT REFINANCE\" , \"OTHER REFINANCE\"),\n (\"REFINANCE - NOT SPECIFIED\" , \"OTHER REFINANCE\"),\n (\"Other REFINANCE\" , \"OTHER REFINANCE\")]\n\ndef _create_acquisition(spark, acq):\n nameMapping = spark.createDataFrame(_name_mapping, [\"from_seller_name\", \"to_seller_name\"])\n return acq.join(nameMapping, col(\"seller_name\") == col(\"from_seller_name\"), \"left\") \\\n .drop(\"from_seller_name\") \\\n .withColumn(\"old_name\", col(\"seller_name\")) \\\n .withColumn(\"seller_name\", coalesce(col(\"to_seller_name\"), col(\"seller_name\"))) \\\n .drop(\"to_seller_name\") \\\n .withColumn(\"orig_date\", to_date(col(\"orig_date\"), \"MM/yyyy\")) \\\n .withColumn(\"first_pay_date\", to_date(col(\"first_pay_date\"), \"MM/yyyy\")) \\\n\ndef run_mortgage(spark, perf, acq):\n parsed_perf = _parse_dates(perf)\n perf_deliqency = _create_perf_deliquency(spark, parsed_perf)\n cleaned_acq = _create_acquisition(spark, acq)\n return perf_deliqency.join(cleaned_acq, [\"loan_id\", \"quarter\"], \"inner\").drop(\"quarter\")"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["orig_perf_path='dbfs:///FileStore/tables/mortgage/perf/*'\norig_acq_path='dbfs:///FileStore/tables/mortgage/acq/*'\ntmp_perf_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/perf/'\ntmp_acq_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/acq/'\noutput_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/output/'\n\nspark.conf.set('spark.rapids.sql.enabled','true')\nspark.conf.set('spark.rapids.sql.explain', 'ALL')\nspark.conf.set('spark.rapids.sql.incompatibleOps.enabled', 'true')\nspark.conf.set('spark.rapids.sql.batchSizeBytes', '512M')\nspark.conf.set('spark.rapids.sql.reader.batchSizeBytes', '768M')"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["# Lets transcode the data first\nstart = time.time()\n# we want a few big files instead of lots of small files\nspark.conf.set('spark.sql.files.maxPartitionBytes', '200G')\nacq = read_acq_csv(spark, orig_acq_path)\nacq.repartition(12).write.parquet(tmp_acq_path, mode='overwrite')\nperf = read_perf_csv(spark, orig_perf_path)\nperf.coalesce(96).write.parquet(tmp_perf_path, mode='overwrite')\nend = time.time()\nprint(end - start)"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["# Now lets actually process the data\\n\",\nstart = time.time()\nspark.conf.set('spark.sql.files.maxPartitionBytes', '1G')\nspark.conf.set('spark.sql.shuffle.partitions', '192')\nperf = spark.read.parquet(tmp_perf_path)\nacq = spark.read.parquet(tmp_acq_path)\nout = run_mortgage(spark, perf, acq)\nout.write.parquet(output_path, mode='overwrite')\nend = time.time()\nprint(end - start)\n"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":6}],"metadata":{"name":"gpu-mortgage_kr","notebookId":2710846968050572},"nbformat":4,"nbformat_minor":0} 
diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 00000000000..8cdc8d3c033 --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,15 @@ +--- +layout: page +title: Demos +nav_order: 4 +--- +# Demos + +Example notebooks allow users to test drive "RAPIDS Accelerator for Apache Spark" with public datasets. + +##### [Mortgage ETL Notebook](demo/gpu-mortgage_accelerated.ipynb) [(Dataset)](https://docs.rapids.ai/datasets/mortgage-data) + +##### About the Mortgage Dataset: +Dataset is derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. + +For the full raw dataset visit [Fannie Mae](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) to register for an account and to download. diff --git a/docs/get-started/getting-started-gcp.md b/docs/get-started/getting-started-gcp.md new file mode 100644 index 00000000000..f210134a55e --- /dev/null +++ b/docs/get-started/getting-started-gcp.md @@ -0,0 +1,115 @@ +--- +layout: page +title: GCP Dataproc +nav_order: 2 +parent: Getting-Started +--- + +# Getting started with RAPIDS Accelerator on GCP Dataproc + [Google Cloud Dataproc](https://cloud.google.com/dataproc) is Google Cloud's fully managed Apache Spark and Hadoop service. This guide will walk through the steps to show: + +* [How to spin up a Dataproc Cluster Accelerated by GPU](getting-started-gcp#how-to-spin-up-a-dataproc-cluster-accelerated-by-gpu) +* [Run a sample Pyspark or Scala ETL and XGBoost training Notebooks on a Dataproc Cluster Accelerated by GPU](getting-started-gcp#run-pyspark-and-scala-notebook-a-dataproc-cluster-accelerated-by-gpu) +* [Submit the same sample ETL application as a Spark job to a Dataproc Cluster Accelerated by GPU](getting-started-gcp#submit-spark-jobs-to-a-dataproc-cluster-accelerated-by-gpu) + + + +## How to spin up a Dataproc Cluster Accelerated by GPU + + You can use [Cloud Shell](https://cloud.google.com/shell) to execute shell commands that will create a Dataproc cluster. Cloud Shell contains command line tools for interacting with Google Cloud Platform, including gcloud and gsutil. Alternatively, you can install [GCloud SDK](https://cloud.google.com/sdk/install) on your laptop. From the Cloud Shell, users will need to enable services within your project. Enable the Compute and Dataproc APIs in order to access Dataproc, and enable the Storage API as you’ll need a Google Cloud Storage bucket to house your data. This may take several minutes. +```bash +gcloud services enable compute.googleapis.com +gcloud services enable dataproc.googleapis.com +gcloud services enable storage-api.googleapis.com +``` + +After command line environment is setup, log in to your GCP account. We can now create a Dataproc cluster with configuration mentioned below. +The configuration will allow users to run any of the [notebooks demo](../demo/GCP) on GCP. Alternatively, users can also start a 2*2T4 worker nodes. 
+* [GPU Driver](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/gpu) and [RAPIDS Acclerator for Apache Spark](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids) through initialization actions +* One 8-core master node and 5 32-core worker nodes +* Four NVIDIA T4 to each worker nodes +* [Local SSDs](https://cloud.google.com/dataproc/docs/concepts/compute/dataproc-local-ssds) is recommended to improve IO for Spark scratch places +* Component gateway enabled for accessing Web UIs hosted on the cluster +* Configuration for [GPU scheduling and isolation](/get-started/yarn-gpu.html) + + +```bash + export REGION=[Your Prefer GCP Region] + export GCS_BUCKET=[Your GCS Bucket] + export CLUSTER_NAME=[Your Cluster Name] + export NUM_GPUS=4 + export NUM_WORKERS=5 + +gcloud dataproc clusters create $CLUSTER_NAME \ + --region $REGION \ + --image-version=preview-ubuntu \ + --master-machine-type n1-standard-16 \ + --num-workers $NUM_WORKERS \ + --worker-accelerator type=nvidia-tesla-t4,count=$NUM_GPUS \ + --worker-machine-type n1-highmem-32\ + --num-worker-local-ssds 4 \ + --initialization-actions gs://dataproc-initialization-actions/gpu/install_gpu_driver.sh,gs://dataproc-initialization-actions/rapids/rapids.sh \ + --optional-components=ANACONDA,JUPYTER,ZEPPELIN \ + --metadata gpu-driver-provider="NVIDIA" \ + --metadata rapids-runtime=SPARK \ + --bucket $GCS_BUCKET \ + --enable-component-gateway \ + --properties="^#^spark:spark.yarn.unmanagedAM.enabled=false"` +``` +This may take around 5-15 minutes to complete. You can navigate to Dataproc clusters tab in the Google Cloud Console to see the progress. + +![Dataproc Cluster](../img/dataproc-cluster.png) + +## Run Pyspark and Scala Notebook a Dataproc Cluster Accelerated by GPU +To use notebooks with Dataproc cluster, click on the cluster name under Dataproc cluster tab and navigate to the "Web Interfaces" Tab. Under the "Web Interfaces", click on JupyterLab or Jupyter link to start to use sample [Mortgage ETL on GPU Jupyter Notebook](../demo/GCP/Mortgage-ETL-GPU.ipynb) to process full 17 years [Mortgage data](https://rapidsai.github.io/demos/datasets/mortgage-data). + +![Dataproc Web Interfaces](../img/dataproc-service.png) + +The notebook will first transcode CSV files into Parquet Files and then run a ETL query to prepare the dataset for Training. In the sample notebook, we use 2016 data as evaluation set and the rest as training set, saving to respective GCS location. +First stage with default configuration in notebook should take ~110 seconds (1/3 of CPU execution time with same config) whereas second stage takes ~170 seconds (1/7 of CPU execution time with same config). The notebook depends on pre-compiled [Spark RAPIDS SQL plugin](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark-parent) and [cuDF](https://mvnrepository.com/artifact/ai.rapids/cudf/0.14), which pre-downloaded by GCP Dataproc [RAPIDS init script](). + +Once data is prepared, we use [Mortgage XGBoost4j Scala Notebook](../demo/GCP/mortgage-xgboost4j-gpu-scala.zpln) in Dataproc Zeppelin service to execute the training job on GPU. NVIDIA Spark team also ship [Spark XGBoost4j](https://github.com/NVIDIA/spark-xgboost) which is based on [DMLC xgboost](https://github.com/dmlc/xgboost). 
Precompiled [XGBoost4j]() and [XGBoost4j Spark](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.0.0-0.1.0/) library could be downloaded from maven, it is pre downloaded by GCP [RAPIDS init action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/rapids). Since GITHUB cannot render zeppelin notebook, we prepared a [Jupyter Notebook with Scala code](../demo/GCP/mortgage-xgboost4j-gpu-scala.ipynb) for you to view code content. + +The training time should be around 480 seconds (1/10 of CPU execution time with same config). Which is shown under cell: +```scala +// Start training +println("\n------ Training ------") +val (xgbClassificationModel, _) = benchmark("train") { + xgbClassifier.fit(trainSet) +} +``` + +## Submit Spark jobs to a Dataproc Cluster Accelerated by GPU +Similar to spark-submit for on-prem clusters, Dataproc supports a Spark applicaton job to be submitted as a dataproc job. The mortgage examples we use above is also available as [spark application](https://github.com/NVIDIA/spark-xgboost-examples/tree/spark-3/examples/apps/scala). After [build the jar files](https://github.com/NVIDIA/spark-xgboost-examples/blob/spark-3/getting-started-guides/building-sample-apps/scala.md) through maven `mvn package -Dcuda.classifier=cuda10-2` + +Then place the jar file `sample_xgboost_apps-0.2.2.jar` under the `gs://$GCS_BUCKET/scala/` folder by `gsutil cp target/sample_xgboost_apps-0.2.2.jar gs://$GCS_BUCKET/scala/`. To do this you can either drag and drop files from your local machine into the GCP storage browser, or use the gsutil cp as shown before to do this from a command line. In the end, we can thereby submit the jar by: +```bash +export GCS_BUCKET= +export CLUSTER_NAME= +export REGION= +export SPARK_NUM_EXECUTORS=20 +export SPARK_EXECUTOR_MEMORY=20G +export SPARK_EXECUTOR_MEMORYOVERHEAD=16G +export SPARK_NUM_CORES_PER_EXECUTOR=7 +export DATA_PATH=gs://${GCS_BUCKET}/mortgage_full + +gcloud dataproc jobs submit spark \ + --cluster=$CLUSTER_NAME \ + --region=$REGION \ + --class=com.nvidia.spark.examples.mortgage.GPUMain \ + --jars=gs://${GCS_BUCKET}/scala/sample_xgboost_apps-0.2.2.jar \ + --properties=spark.executor.cores=${SPARK_NUM_CORES_PER_EXECUTOR},spark.task.cpus=${SPARK_NUM_CORES_PER_EXECUTOR},spark.executor.memory=${SPARK_EXECUTOR_MEMORY},spark.executor.memoryOverhead=${SPARK_EXECUTOR_MEMORYOVERHEAD},spark.executor.resource.gpu.amount=1,spark.task.resource.gpu.amount=1,spark.rapids.sql.hasNans=false,spark.rapids.sql.batchSizeBytes=512M,spark.rapids.sql.reader.batchSizeBytes=768M,spark.rapids.sql.variableFloatAgg.enabled=true,spark.rapids.memory.gpu.pooling.enabled=false \ + -- \ + -dataPath=train::${DATA_PATH}/train \ + -dataPath=trans::${DATA_PATH}/test \ + -format=parquet \ + -numWorkers=${SPARK_NUM_EXECUTORS} \ + -treeMethod=gpu_hist \ + -numRound=100 \ + -maxDepth=8 +``` + +## Dataproc Hub in AI Platform Notebook to Dataproc cluster +With the integration between AI Platform Notebooks and Dataproc. Users can create a [Dataproc Hub notebook](https://cloud.google.com/blog/products/data-analytics/administering-jupyter-notebooks-for-spark-workloads-on-dataproc) from AI platform will can connect to Dataproc cluster through a yaml configuration. + +In future, user will be able to provision a dataproc cluster through DataprocHub notebook. Please use example [pyspark notebooks](../demo/GCP/Mortgage-ETL-GPU.ipynb) to experiment. 
diff --git a/docs/get-started/getting-started-menu.md b/docs/get-started/getting-started-menu.md new file mode 100644 index 00000000000..b1e60faf3a0 --- /dev/null +++ b/docs/get-started/getting-started-menu.md @@ -0,0 +1,57 @@ +--- +layout: page +title: Getting-Started +nav_order: 2 +has_children: true +permalink: /Getting-Started/ +--- +# Getting Started with the RAPIDS Accelerator for Apache Spark + +Apache Spark 3.0+ lets users provide a plugin that can replace the backend for SQL and DataFrame +operations. This requires no API changes from the user. The plugin will replace SQL operations it +supports with GPU accelerated versions. If an operation is not supported it will fall back to using +the Spark CPU version. Note that the plugin cannot accelerate operations that manipulate RDDs +directly. + +The accelerator library also provides an implementation of Spark's shuffle that can leverage +[UCX](https://www.openucx.org/) to optimize GPU data transfers keeping as much data on the GPU as +possible and bypassing the CPU to do GPU to GPU transfers. + +The GPU accelerated processing plugin does not require the accelerated shuffle implementation. +However, if accelerated SQL processing is not enabled, the shuffle implementation falls back to the +default `SortShuffleManager`. + +To enable GPU processing acceleration you will need: +- Apache Spark 3.0+ +- A spark cluster configured with GPUs that comply with the requirements for the version of + [cudf](https://github.com/rapidsai/cudf). + - One GPU per executor. +- Add the following jars: + - A cudf jar that corresponds to the version of CUDA available on your cluster. + - RAPIDS Spark accelerator plugin jar. +- Set the config `spark.plugins` to `com.nvidia.spark.SQLPlugin` + +## Spark GPU Scheduling Overview +Apache Spark 3.0 now supports GPU scheduling as long as you are using a cluster manager that +supports it. You can have Spark request GPUs and assign them to tasks. The exact configs you use +will vary depending on your cluster manager. Here are a few of the configs: +- Request your executor to have GPUs: + - `--conf spark.executor.resource.gpu.amount=1` +- Specify the number of GPUs per task: + - `--conf spark.task.resource.gpu.amount=1` +- Specify a GPU discovery script (required on YARN and K8S): + - `--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh` + +See the deployment specific sections for more details and restrictions. Note that +`spark.task.resource.gpu.amount` can be a decimal amount, so if you want multiple tasks to be run +on an executor at the same time and assigned to the same GPU you can set this to a decimal value +less than 1. You would want this setting to correspond to the `spark.executor.cores` setting. For +instance, if you have `spark.executor.cores=2` which would allow 2 tasks to run on each executor +and you want those 2 tasks to run on the same GPU then you would set +`spark.task.resource.gpu.amount=0.5`. + +You can also refer to the official Apache Spark documentation. 
+- [Overview](https://github.com/apache/spark/blob/master/docs/configuration.md#custom-resource-scheduling-and-configuration-overview) +- [Kubernetes specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-kubernetes.md#resource-allocation-and-configuration-overview) +- [Yarn specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-yarn.md#resource-allocation-and-configuration-overview) +- [Standalone specific documentation](https://github.com/apache/spark/blob/master/docs/spark-standalone.md#resource-allocation-and-configuration-overview) \ No newline at end of file diff --git a/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md b/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md new file mode 100644 index 00000000000..18bf7b0f8ca --- /dev/null +++ b/docs/get-started/getting-started-with-rapids-accelerator-on-databricks.md @@ -0,0 +1,80 @@ +--- +layout: page +title: Databricks +nav_order: 3 +parent: Getting-Started +--- + +# Getting started with RAPIDS Accelerator on Databricks +This guide will run through how to set up the RAPIDS Accelerator for Apache Spark 3.0 on Databricks. At the end of this guide, the reader will be able to run a sample Apache Spark application that runs on NVIDIA GPUs on Databricks. + +## Prerequisites +* Apache Spark 3.0 running in DataBricks Runtime 7.0 ML with GPU + * AWS: 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12) + * Azure: 7.0 ML (GPU, Scala 2.12, Spark 3.0.0) + +The number of GPUs per node dictates the number of Spark executors that can run in that node. + +## Start a Databricks Cluster +Create a Databricks cluster by going to Clusters, then clicking “+ Create Cluster”. Ensure the cluster meets the prerequisites above by configuring it as follows: +1. On AWS, make sure to use 7.0 ML (includes Apache Spark 3.0.0, GPU, Scala 2.12), or for Azure, choose 7.0 ML (GPU, Scala 2.12, Spark 3.0.0). +2. Under Autopilot Options, disable auto scaling. +3. Choose the number of workers that matches the number of GPUs you want to use. +4. Select a worker type. On AWS, use nodes with 1 GPU each such as p3.xlarge or g4dn.xlarge. p2 nodes do not meet the architecture requirements for the Spark worker (although they can be used for the driver node). For Azure, choose GPU nodes such as Standard_NC6s_v3 +5. Select the driver type. Generally this can be set to be the same as the worker. + +## Advance Cluster Configuration + +We will need to create an initialization script for the cluster that installs the RAPIDS jars to the cluster. + +1. To create the initialization script, import the initialization script notebook from the repo [generate-init-script.ipynb](../demo/Databricks/) to your workspace. See [Managing Notebooks](https://docs.databricks.com/user-guide/notebooks/notebook-manage.html) on how to import a notebook, then open the notebook. +2. Once you are in the notebook, click the “Run All” button. +3. Ensure that the newly created init.sh script is present in the output from cell 2 and that the contents of the script are correct.. +4. Go back and edit your cluster to configure it to use the init script. To do this, click the “Clusters” button on the left panel, then select your cluster. +5. Click the “Edit” button, then navigate down to the “Advanced Options” section. 
+Select the “Init Scripts” tab in the advanced options section, and paste the initialization script: `dbfs:/databricks/init_scripts/init.sh`, then click “Add” + +![Init Script](../img/initscript.png) + +6. Now select the “Spark” tab, and paste the following config options into the Spark Config section: + ```bash + spark.plugins com.nvidia.spark.SQLPlugin + spark.sql.parquet.filterPushdown false + spark.rapids.sql.incompatibleOps.enabled true + spark.rapids.memory.pinnedPool.size 2G + spark.locality.wait 0s + spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2 + spark.executor.extraJavaOptions "-Dai.rapids.cudf.prefer-pinned=true" + ``` + +![Spark Config](../img/sparkconfig.png) + +7. Once you’ve added the Spark config, click “Confirm and Restart”. +8. Once the cluster comes back up, it is now enabled for GPU-accelerated Spark with RAPIDS and cuDF. + +## Import the GPU Mortgage Example Notebook +Import the example [notebook](../demo/gpu-mortgage_accelerated.ipynb) from the repo into your workspace, then open the notebook. +Modify the first cell to point to your workspace, and download a larger dataset if needed. You can find the links to the datasets at [docs.rapids.ai](https://docs.rapids.ai/datasets/mortgage-data) + +```bash +%sh + +wget http://rapidsai-data.s3-website.us-east-2.amazonaws.com/notebook-mortgage-data/mortgage_2000.tgz -P /Users// + +mkdir -p /dbfs/FileStore/tables/mortgage +mkdir -p /dbfs/FileStore/tables/mortgage_parquet_gpu/perf +mkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/acq +mkdir /dbfs/FileStore/tables/mortgage_parquet_gpu/output + +tar xfvz /Users//mortgage_2000.tgz --directory /dbfs/FileStore/tables/mortgage +``` + +In Cell 3, update the data paths if necessary. The example notebook merges the columns and prepares the data for XGoost training. The temp and final output results are written back to the dbfs +```bash +orig_perf_path='dbfs:///FileStore/tables/mortgage/perf/*' +orig_acq_path='dbfs:///FileStore/tables/mortgage/acq/*' +tmp_perf_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/perf/' +tmp_acq_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/acq/' +output_path='dbfs:///FileStore/tables/mortgage_parquet_gpu/output/' +``` +Run the notebook by clicking “Run All” diff --git a/docs/getting-started.md b/docs/get-started/getting-started.md similarity index 83% rename from docs/getting-started.md rename to docs/get-started/getting-started.md index 0da71c9c7a9..43bf17ef79d 100644 --- a/docs/getting-started.md +++ b/docs/get-started/getting-started.md @@ -1,42 +1,21 @@ --- -layout: default -title: Getting Started -nav_order: 2 +layout: page +title: On-Prem +nav_order: 1 +parent: Getting-Started --- +# Getting Started with RAPIDS Accelerator with on premise cluster or local mode +## Spark Deployment Methods +The way you decide to deploy Spark affects the steps you must take to install and setup Spark and +the RAPIDS Accelerator for Apache Spark. The primary methods of deploy Spark are: +- Local mode - this is for dev/testing only, not for production +- Standalone Mode +- On a YARN cluster +- On a Kubernetes cluster -# Getting Started with the RAPIDS Accelerator for Apache Spark - -## Overview -The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the -[RAPIDS libraries](http://rapids.ai). - -Apache Spark 3.0+ lets users provide a plugin that can replace the backend for SQL and DataFrame -operations. This requires no API changes from the user. 
The plugin will replace SQL operations it -supports with GPU accelerated versions. If an operation is not supported it will fall back to using -the Spark CPU version. Note that the plugin cannot accelerate operations that manipulate RDDs -directly. - -The accelerator library also provides an implementation of Spark's shuffle that can leverage -[UCX](https://www.openucx.org/) to optimize GPU data transfers keeping as much data on the GPU as -possible and bypassing the CPU to do GPU to GPU transfers. - -The GPU accelerated processing plugin does not require the accelerated shuffle implementation. -However, if accelerated SQL processing is not enabled, the shuffle implementation falls back to the -default `SortShuffleManager`. - -To enable GPU processing acceleration you will need: -- Apache Spark 3.0+ -- A spark cluster configured with GPUs that comply with the requirements for the version of - [cudf](https://github.com/rapidsai/cudf). - - One GPU per executor. -- Add the following jars: - - A cudf jar that corresponds to the version of CUDA available on your cluster. - - RAPIDS Spark accelerator plugin jar. -- Set the config `spark.plugins` to `com.nvidia.spark.SQLPlugin` - -## Prerequisites -Each node where you are running Spark needs to have the following installed. If you are running +## Apache Spark Setup for GPU +Each GPU node where you are running Spark needs to have the following installed. If you are running with Docker on Kubernetes then skip these as you will do this as part of the docker build. - Install Java 8 - note jdk11 is supported by Spark, but we have been building and testing with jdk8, so we suggest using that for now. @@ -53,39 +32,6 @@ with Docker on Kubernetes then skip these as you will do this as part of the doc - `sudo apt-get update` - `sudo apt-get -y install cuda` -## Spark GPU Scheduling Overview -Apache Spark 3.0 now supports GPU scheduling as long as you are using a cluster manager that -supports it. You can have Spark request GPUs and assign them to tasks. The exact configs you use -will vary depending on your cluster manager. Here are a few of the configs: -- Request your executor to have GPUs: - - `--conf spark.executor.resource.gpu.amount=1` -- Specify the number of GPUs per task: - - `--conf spark.task.resource.gpu.amount=1` -- Specify a GPU discovery script (required on YARN and K8S): - - `--conf spark.executor.resource.gpu.discoveryScript=./getGpusResources.sh` - -See the deployment specific sections for more details and restrictions. Note that -`spark.task.resource.gpu.amount` can be a decimal amount, so if you want multiple tasks to be run -on an executor at the same time and assigned to the same GPU you can set this to a decimal value -less than 1. You would want this setting to correspond to the `spark.executor.cores` setting. For -instance, if you have `spark.executor.cores=2` which would allow 2 tasks to run on each executor -and you want those 2 tasks to run on the same GPU then you would set -`spark.task.resource.gpu.amount=0.5`. - -You can also refer to the official Apache Spark documentation. 
-- [Overview](https://github.com/apache/spark/blob/master/docs/configuration.md#custom-resource-scheduling-and-configuration-overview) -- [Kubernetes specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-kubernetes.md#resource-allocation-and-configuration-overview) -- [Yarn specific documentation](https://github.com/apache/spark/blob/master/docs/running-on-yarn.md#resource-allocation-and-configuration-overview) -- [Standalone specific documentation](https://github.com/apache/spark/blob/master/docs/spark-standalone.md#resource-allocation-and-configuration-overview) - -## Spark Deployment Methods -The way you decide to deploy Spark affects the steps you must take to install and setup Spark and -the RAPIDS Accelerator for Apache Spark. The primary methods of deploy Spark are: -- Local mode - this is for dev/testing only, not for production -- Standalone Mode -- On a YARN cluster -- On a Kubernetes cluster - Below are sections on installing Spark and the RAPIDS Accelerator on a single node, you may want to read the deployment method sections before doing any installations. @@ -96,8 +42,8 @@ scala version 2.12 is currently supported by the accelerator. ## Download the RAPIDS jars The [accelerator](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark_2.12) and -[cudf](https://mvnrepository.com/artifact/ai.rapids/cudf) jars are available in -[maven central](https://mvnrepository.com/search?q=ai.rapids) +[cudf](https://mvnrepository.com/artifact/ai.rapids/cudf) jars are available in the +[download](/docs/version/stable-release#download) section. Download the RAPIDS Accelerator for Apache Spark plugin jar. Then download the version of the cudf jar that your version of the accelerator depends on. Each cudf jar is for a specific version of @@ -132,7 +78,7 @@ directory as the plugin jars (`/opt/sparkRapidsPlugin` in the example). This is for testing/dev setup only. It is not to be used in production. In this mode Spark runs everything in a single process on a single node. - [Install Spark](#install-spark) -- [Install the RAPIDS jars](#install-the-rapids-jars) +- [Install the RAPIDS jars](#download-the-rapids-jars) - Launch your Spark shell session Default configs usually work fine in local mode. The required changes are setting the config @@ -164,7 +110,7 @@ Spark Standalone mode requires starting the Spark master and worker(s). You can machine or multiple machines for distributed setup. The first step is to [Install Spark](#install-spark), the -[RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the +[RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on all the nodes you want to use. After that choose one of the nodes to be your master node and start the master. Note that the master process does **not** need a GPU to function properly. @@ -227,7 +173,7 @@ $SPARK_HOME/bin/spark-shell \ ## Running on YARN YARN requires you to [Install Spark](#install-spark), the -[RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the +[RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on a launcher node. YARN handles shipping them to the cluster nodes as needed. If you want to use the GPU scheduling feature in Spark it requires YARN version >= 2.10 or >= 3.1.1 and ideally you would use >= 3.1.3 in order to @@ -249,7 +195,7 @@ use - either 3.x or 2.x. 
- Configure YARN to support [GPU scheduling and isolation](https://hadoop.apache.org/docs/r3.1.3/hadoop-yarn/hadoop-yarn-site/UsingGpus.html). - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configuration settings when running Spark on YARN, changing the amounts as @@ -278,7 +224,7 @@ $SPARK_HOME/bin/spark-shell \ - Configure YARN to support [GPU scheduling and isolation](https://hadoop.apache.org/docs/r2.10.0/hadoop-yarn/hadoop-yarn-site/ResourceProfiles.html) - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configs when running Spark on YARN, changing the amounts as necessary: @@ -311,7 +257,7 @@ accessing a GPU at once. Note it does not matter if GPU scheduling support is en - For each GPU index set it to `EXCLUSIVE_PROCESS` mode: - `nvidia-smi -c EXCLUSIVE_PROCESS -i $index` - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are launching your Spark application. - Use the following configs when running Spark on YARN. Note that we are configuring a resource @@ -349,7 +295,7 @@ This assumes you have Kubernetes already installed and setup. These instruction to setup a Kubernetes cluster. - Install [Spark](#install-spark), the - [RAPIDS Accelerator for Spark jars](#install-the-rapids-jars), and the + [RAPIDS Accelerator for Spark jars](#download-the-rapids-jars), and the [GPU discovery script](#install-the-gpu-discovery-script) on the node from which you are going to build your Docker image. Note that you can download these into a local directory and untar the Spark `.tar.gz` rather than installing into a location on the machine. @@ -382,7 +328,7 @@ $SPARK_HOME/bin/spark-shell \ ``` ## RAPIDS Accelerator Configuration and Tuning -Most of what you need you can get from [tuning guide](./tuning-guide.md). +Most of what you need you can get from [tuning guide](../tuning-guide). The following configs will help you to get started but must be configured based on your cluster and application. @@ -427,7 +373,7 @@ operation “count at ...”, you should see the graph of Spark Execs and some o the label Gpu... For instance, in the screenshot below you will see `GpuRowToColumn`, `GpuFilter`, and `GpuColumnarExchange`. Those correspond to operations that run on the GPU. -![Join Example on Spark SQL UI](img/join-sql-ui-example.png) +![Join Example on Spark SQL UI](../img/join-sql-ui-example.png) ## Advanced Configuration diff --git a/docs/get-started/yarn-gpu.md b/docs/get-started/yarn-gpu.md new file mode 100644 index 00000000000..462b9dc593b --- /dev/null +++ b/docs/get-started/yarn-gpu.md @@ -0,0 +1,120 @@ +--- +layout: page +title: yarn-gpu +nav_exclude: true +--- + +## Spark3 GPU Configuration Guide on Yarn 3.2.1 + +The following files are recommended to be configured to enable GPU scheduling on YARN 3.2.1 and later.
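+ +Spark discovers GPUs on a node by running the discovery script configured below, which is expected to print the available GPU addresses as JSON. Before editing the files, it can help to confirm that the node actually exposes its GPUs; the snippet below is a minimal sanity check, assuming the NVIDIA driver is installed and `nvidia-smi` is on the PATH, and is not part of the official configuration steps. + +```bash +# Hypothetical sanity check (not part of the configuration below): list the +# GPU indexes this node exposes. The discovery script configured below should +# report the same addresses in its JSON output. +nvidia-smi --query-gpu=index --format=csv,noheader +```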
+ +GPU resource discovery script - `/usr/lib/spark/scripts/gpu/getGpusResources.sh`: +```bash +mkdir -p /usr/lib/spark/scripts/gpu/ +cd /usr/lib/spark/scripts/gpu/ +wget https://raw.githubusercontent.com/apache/spark/master/examples/src/main/scripts/getGpusResources.sh +chmod a+rwx -R /usr/lib/spark/scripts/gpu/ +``` + +Spark config - `/etc/spark/conf/spark-defaults.conf`: +```bash +spark.rapids.sql.concurrentGpuTasks=2 +spark.executor.resource.gpu.amount=1 +spark.executor.cores=8 +spark.task.cpus=1 +spark.task.resource.gpu.amount=0.125 +spark.rapids.memory.pinnedPool.size=2G +spark.executor.memoryOverhead=2G +spark.plugins=com.nvidia.spark.SQLPlugin +spark.executor.extraJavaOptions='-Dai.rapids.cudf.prefer-pinned=true' +spark.locality.wait=0s +spark.executor.resource.gpu.discoveryScript=/usr/lib/spark/scripts/gpu/getGpusResources.sh # this must match the location of the discovery script +spark.sql.shuffle.partitions=40 +spark.sql.files.maxPartitionBytes=512m +``` + +Yarn Scheduler config - `/etc/hadoop/conf/capacity-scheduler.xml`: +```xml +<configuration> + <property> + <name>yarn.scheduler.capacity.resource-calculator</name> + <value>org.apache.hadoop.yarn.util.resource.DominantResourceCalculator</value> + </property> +</configuration> +``` + +Yarn config - `/etc/hadoop/conf/yarn-site.xml`: +```xml +<configuration> + <property> + <name>yarn.nodemanager.resource-plugins</name> + <value>yarn.io/gpu</value> + </property> + <property> + <name>yarn.resource-types</name> + <value>yarn.io/gpu</value> + </property> + <property> + <name>yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices</name> + <value>auto</value> + </property> + <property> + <name>yarn.nodemanager.resource-plugins.gpu.path-to-discovery-executables</name> + <value>/usr/bin</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.mount</name> + <value>true</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.mount-path</name> + <value>/sys/fs/cgroup</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.cgroups.hierarchy</name> + <value>yarn</value> + </property> + <property> + <name>yarn.nodemanager.container-executor.class</name> + <value>org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor</value> + </property> + <property> + <name>yarn.nodemanager.linux-container-executor.group</name> + <value>yarn</value> + </property> +</configuration> +``` + +`/etc/hadoop/conf/container-executor.cfg` - use `yarn` as the service account: +```bash +yarn.nodemanager.linux-container-executor.group=yarn + +#--Original container-executor.cfg Content-- + +[gpu] +module.enabled=true +[cgroups] +root=/sys/fs/cgroup +yarn-hierarchy=yarn +``` + +The node manager local directories need to be accessible to all users; run the following in bash: +```bash +chmod a+rwx -R /sys/fs/cgroup/cpu,cpuacct +chmod a+rwx -R /sys/fs/cgroup/devices +local_dirs=$(bdconfig get_property_value \ + --configuration_file /etc/hadoop/conf/yarn-site.xml \ + --name yarn.nodemanager.local-dirs 2>/dev/null) +mod_local_dirs=${local_dirs//\,/ } +chmod a+rwx -R ${mod_local_dirs} +``` + +Finally, restart the node manager and resource manager services: +On all workers: +```bash +sudo systemctl restart hadoop-yarn-nodemanager.service +``` +On all masters: +```bash +sudo systemctl restart hadoop-yarn-resourcemanager.service +``` diff --git a/docs/img/dataproc-cluster.png b/docs/img/dataproc-cluster.png new file mode 100644 index 00000000000..87f3ea40913 Binary files /dev/null and b/docs/img/dataproc-cluster.png differ diff --git a/docs/img/dataproc-service.png b/docs/img/dataproc-service.png new file mode 100644 index 00000000000..79899dc848f Binary files /dev/null and b/docs/img/dataproc-service.png differ diff --git a/docs/img/ease-of-use.png b/docs/img/ease-of-use.png new file mode 100644 index 00000000000..e39c4e4728c Binary files /dev/null and b/docs/img/ease-of-use.png differ diff --git a/docs/img/initscript.png b/docs/img/initscript.png new file mode 100644 index 00000000000..01111a5235a Binary files /dev/null and b/docs/img/initscript.png differ diff --git
a/docs/img/perf-cost.png b/docs/img/perf-cost.png new file mode 100644 index 00000000000..1b004d10fb9 Binary files /dev/null and b/docs/img/perf-cost.png differ diff --git a/docs/img/spark3cluster.png b/docs/img/spark3cluster.png new file mode 100644 index 00000000000..73050c63451 Binary files /dev/null and b/docs/img/spark3cluster.png differ diff --git a/docs/img/sparkconfig.png b/docs/img/sparkconfig.png new file mode 100644 index 00000000000..fe1344eb1e4 Binary files /dev/null and b/docs/img/sparkconfig.png differ diff --git a/docs/index.md b/docs/index.md index aed4afce6d9..e4c30e20749 100644 --- a/docs/index.md +++ b/docs/index.md @@ -5,7 +5,29 @@ nav_order: 1 permalink: / description: This site serves as a collection of documentation about the RAPIDS accelerator for Apache Spark --- +# Overview +The RAPIDS Accelerator for Apache Spark leverages GPUs to accelerate processing via the +[RAPIDS libraries](http://rapids.ai). + As data scientists shift from using traditional analytics to leveraging AI applications that better model complex market demands, traditional CPU-based processing can no longer keep up without compromising either speed or cost. The growing adoption of AI in analytics has created the need for a new framework to process data quickly and cost efficiently with GPUs. The RAPIDS Accelerator for Apache Spark combines the power of the RAPIDS cuDF library and the scale of the Spark distributed computing framework. The RAPIDS Accelerator library also has a built-in accelerated shuffle based on UCX that can be configured to leverage GPU-to-GPU communication and RDMA capabilities. +## Performance & Cost Benefits +The RAPIDS Accelerator for Apache Spark reaps the benefits of GPU performance while saving infrastructure costs. +![Perf-cost](/img/perf-cost.png) +*ETL for the Fannie Mae Mortgage Dataset (~200GB) as shown in our [demo](https://databricks.com/session_na20/deep-dive-into-gpu-support-in-apache-spark-3-x). Costs based on Cloud T4 GPU instance market price & V100 GPU price on Databricks Standard edition.* + + +## Ease of Use +Run your existing Apache Spark applications with no code changes. Learn more on how to [get started](/Getting-Started/). + +`spark.conf.set('spark.rapids.sql.enabled','true')` + +![ease-of-use](/img/ease-of-use.png) + +## A unified AI framework for ETL + ML/DL +A single pipeline, from ingest to data preparation to model training. +![spark3cluster](/img/spark3cluster.png) + + diff --git a/docs/version/stable-release.md b/docs/version/stable-release.md new file mode 100644 index 00000000000..19726294cd5 --- /dev/null +++ b/docs/version/stable-release.md @@ -0,0 +1,36 @@ +--- +layout: page +title: Stable Version +nav_order: 1 +parent: Version +--- + +## Stable Release - v0.1.0 +This is the first public release of the RAPIDS Accelerator for Apache Spark.
+The list of supported operations is provided [here](../configs.html#supported-gpu-operators-and-fine-tuning). + +Hardware Requirements: + + GPU Architecture: NVIDIA Pascal™ or better (tested on V100 and T4 GPUs) + +Software Requirements: + + OS: Ubuntu 16.04 & gcc 5.4 OR Ubuntu 18.04/CentOS 7 & gcc 7.3 + (RHEL 7 support is provided through CentOS 7 builds/installs) + + CUDA & NVIDIA Drivers: 10.1.2 & v418.87+ or 10.2 & v440.33+ + + Apache Spark 3.0 + + Apache Hadoop 2.10+ or 3.1.1+ (3.1.1 for nvidia-docker version 2) + + Python 3.x, Scala 2.12, Java 8 + + +## Download +* [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/0.1.0/rapids-4-spark_2.12-0.1.0.jar) +* [cuDF 0.14 Package for CUDA 10.2](https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-2.jar) +* [cuDF 0.14 Package for CUDA 10.1](https://repo1.maven.org/maven2/ai/rapids/cudf/0.14/cudf-0.14-cuda10-1.jar) + + + diff --git a/docs/version/version.md b/docs/version/version.md new file mode 100644 index 00000000000..fb5c5b62d7c --- /dev/null +++ b/docs/version/version.md @@ -0,0 +1,7 @@ +--- +layout: page +title: Version +nav_order: 10 +has_children: true +permalink: /Version/ +---