From d8596833d57d3b9ca0618e4a75973e11c983ac37 Mon Sep 17 00:00:00 2001
From: Sameer Raheja <sameerz@users.noreply.github.com>
Date: Tue, 2 Mar 2021 13:56:10 -0800
Subject: [PATCH] Update changelog for 0.4 (#1849)

* Update changelog for 0.4

Signed-off-by: Sameer Raheja <sraheja@nvidia.com>

* Update generate-changelog script

Signed-off-by: Sameer Raheja <sraheja@nvidia.com>
---
 CHANGELOG.md               | 283 ++++++++++++++++++++++++++++++++++++-
 scripts/generate-changelog |  10 +-
 2 files changed, 287 insertions(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a94e699af1a..7cf8994f744 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,286 @@
 # Change log
-Generated on 2021-02-08
+Generated on 2021-03-02
+
+## Release 0.4
+
+### Features
+|||
+|:---|:---|
+|[#1773](https://github.com/NVIDIA/spark-rapids/issues/1773)|[FEA] Spark 3.0.2 release support|
+|[#80](https://github.com/NVIDIA/spark-rapids/issues/80)|[FEA] Support the struct SQL function|
+|[#76](https://github.com/NVIDIA/spark-rapids/issues/76)|[FEA] Support CreateArray|
+|[#1635](https://github.com/NVIDIA/spark-rapids/issues/1635)|[FEA] RAPIDS accelerated Java UDF|
+|[#1333](https://github.com/NVIDIA/spark-rapids/issues/1333)|[FEA] Support window operations on Decimal|
+|[#1419](https://github.com/NVIDIA/spark-rapids/issues/1419)|[FEA] Support GPU accelerated UDF alternative for higher order function "aggregate" over window|
+|[#1580](https://github.com/NVIDIA/spark-rapids/issues/1580)|[FEA] Support Decimal for ParquetCachedBatchSerializer|
+|[#1600](https://github.com/NVIDIA/spark-rapids/issues/1600)|[FEA] Support ScalarSubquery|
+|[#1072](https://github.com/NVIDIA/spark-rapids/issues/1072)|[FEA] Support for a custom DataSource V2 which supplies Arrow data|
+|[#906](https://github.com/NVIDIA/spark-rapids/issues/906)|[FEA] Clarify query explanation to directly state what will run on GPU|
+|[#1335](https://github.com/NVIDIA/spark-rapids/issues/1335)|[FEA] Support CollectLimitExec for decimal|
+|[#1485](https://github.com/NVIDIA/spark-rapids/issues/1485)|[FEA] Decimal Support for Parquet Write|
+|[#1329](https://github.com/NVIDIA/spark-rapids/issues/1329)|[FEA] Decimal support for multiply int div, add, subtract and null safe equals|
+|[#1351](https://github.com/NVIDIA/spark-rapids/issues/1351)|[FEA] Execute UDFs that provide a RAPIDS execution path|
+|[#1330](https://github.com/NVIDIA/spark-rapids/issues/1330)|[FEA] Support Decimal Casts|
+|[#1353](https://github.com/NVIDIA/spark-rapids/issues/1353)|[FEA] Example of RAPIDS UDF using custom GPU code|
+|[#1487](https://github.com/NVIDIA/spark-rapids/issues/1487)|[FEA] Change spark 3.1.0 to 3.1.1|
+|[#1334](https://github.com/NVIDIA/spark-rapids/issues/1334)|[FEA] Add support for count aggregate on decimal|
+|[#1325](https://github.com/NVIDIA/spark-rapids/issues/1325)|[FEA] Add in join support for decimal|
+|[#1326](https://github.com/NVIDIA/spark-rapids/issues/1326)|[FEA] Add in Broadcast support for decimal values|
+|[#37](https://github.com/NVIDIA/spark-rapids/issues/37)|[FEA] round and bround SQL functions|
+|[#78](https://github.com/NVIDIA/spark-rapids/issues/78)|[FEA] Support CreateNamedStruct function|
+|[#1331](https://github.com/NVIDIA/spark-rapids/issues/1331)|[FEA] UnionExec and ExpandExec support for decimal|
+|[#1332](https://github.com/NVIDIA/spark-rapids/issues/1332)|[FEA] Support CaseWhen, Coalesce and IfElse for decimal|
+|[#937](https://github.com/NVIDIA/spark-rapids/issues/937)|[FEA] have murmur3 hash function that matches exactly with spark|
+|[#1324](https://github.com/NVIDIA/spark-rapids/issues/1324)|[FEA] Support Parquet Read of Decimal FIXED_LENGTH_BYTE_ARRAY|
+|[#1428](https://github.com/NVIDIA/spark-rapids/issues/1428)|[FEA] Add support for unary decimal operations abs, floor, ceil, unary - and unary +|
+|[#1375](https://github.com/NVIDIA/spark-rapids/issues/1375)|[FEA] Add log statement for what the concurrentGpuTasks tasks is set to on executor startup|
+|[#1352](https://github.com/NVIDIA/spark-rapids/issues/1352)|[FEA] Example of RAPIDS UDF using cudf Java APIs|
+|[#1328](https://github.com/NVIDIA/spark-rapids/issues/1328)|[FEA] Support sorting and shuffle of decimal|
+|[#1316](https://github.com/NVIDIA/spark-rapids/issues/1316)|[FEA] Support simple DECIMAL aggregates|
+
+### Performance
+|||
+|:---|:---|
+|[#1435](https://github.com/NVIDIA/spark-rapids/issues/1435)|[FEA]Improve the file reading by using local file caching|
+|[#1738](https://github.com/NVIDIA/spark-rapids/issues/1738)|[FEA] Reduce regex usage in CAST string to date/timestamp|
+|[#987](https://github.com/NVIDIA/spark-rapids/issues/987)|[FEA] Optimize CAST from string to temporal types by using cuDF is_timestamp function|
+|[#1594](https://github.com/NVIDIA/spark-rapids/issues/1594)|[FEA] RAPIDS accelerated ScalaUDF|
+|[#103](https://github.com/NVIDIA/spark-rapids/issues/103)|[FEA] GPU version of TakeOrderedAndProject|
+|[#1024](https://github.com/NVIDIA/spark-rapids/issues/1024)|Cleanup RAPIDS transport calls to `receive`|
+|[#1366](https://github.com/NVIDIA/spark-rapids/issues/1366)|Seeing performance differences of multi-threaded/coalesce/perfile Parquet reader type for a single file|
+|[#1200](https://github.com/NVIDIA/spark-rapids/issues/1200)|[FEA] Accelerate the scan speed for coalescing parquet reader when reading files from multiple partitioned folders|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1785](https://github.com/NVIDIA/spark-rapids/issues/1785)|[BUG] Rapids pytest integration tests FAILED on Yarn cluster with unrecognized arguments: `--std_input_path=src/test/resources/`|
+|[#999](https://github.com/NVIDIA/spark-rapids/issues/999)|[BUG] test_multi_types_window_aggs_for_rows_lead_lag fails against Spark 3.1.0|
+|[#1818](https://github.com/NVIDIA/spark-rapids/issues/1818)|[BUG] unmoored doc comment warnings in GpuCast|
+|[#1817](https://github.com/NVIDIA/spark-rapids/issues/1817)|[BUG] Developer build with local modifications fails during verify phase|
+|[#1644](https://github.com/NVIDIA/spark-rapids/issues/1644)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks|
+|[#1771](https://github.com/NVIDIA/spark-rapids/issues/1771)|[BUG] Databricks AWS CI/CD failing to create cluster|
+|[#1157](https://github.com/NVIDIA/spark-rapids/issues/1157)|[BUG] Fix regression supporting to_date on GPU with Spark 3.1.0|
+|[#716](https://github.com/NVIDIA/spark-rapids/issues/716)|[BUG] Cast String to TimeStamp issues|
+|[#1117](https://github.com/NVIDIA/spark-rapids/issues/1117)|[BUG] CAST string to date returns wrong values for dates with out-of-range values|
+|[#1670](https://github.com/NVIDIA/spark-rapids/issues/1670)|[BUG] Some TPC-DS queries fail with AQE when decimal types enabled|
+|[#1730](https://github.com/NVIDIA/spark-rapids/issues/1730)|[BUG] Range Partitioning can crash when processing is in the order-by|
+|[#1726](https://github.com/NVIDIA/spark-rapids/issues/1726)|[BUG] java url decode test failing on databricks, emr, and dataproc|
+|[#1651](https://github.com/NVIDIA/spark-rapids/issues/1651)|[BUG] GDS exception when writing shuffle file|
+|[#1702](https://github.com/NVIDIA/spark-rapids/issues/1702)|[BUG] check all tests marked xfail for Spark 3.1.1|
+|[#575](https://github.com/NVIDIA/spark-rapids/issues/575)|[BUG] Spark 3.1 FAILED join_test.py::test_broadcast_join_mixed[FullOuter][IGNORE_ORDER] failed|
+|[#577](https://github.com/NVIDIA/spark-rapids/issues/577)|[BUG] Spark 3.1 log arithmetic functions fail|
+|[#1541](https://github.com/NVIDIA/spark-rapids/issues/1541)|[BUG] Tests fail in integration in distributed mode after allowing nested types through in sort and shuffle|
+|[#1626](https://github.com/NVIDIA/spark-rapids/issues/1626)|[BUG] TPC-DS-like query 77 at scale=3TB fails with maxResultSize exceeded error|
+|[#1576](https://github.com/NVIDIA/spark-rapids/issues/1576)|[BUG] loading SPARK-32639 example parquet file triggers a JVM crash |
+|[#1643](https://github.com/NVIDIA/spark-rapids/issues/1643)|[BUG] TPC-DS-Like q10, q35, and q69 - slow or hanging at leftSemiJoin|
+|[#1650](https://github.com/NVIDIA/spark-rapids/issues/1650)|[BUG] BenchmarkRunner does not include query name in JSON summary filename when running multiple queries|
+|[#1654](https://github.com/NVIDIA/spark-rapids/issues/1654)|[BUG] TPC-DS-like query 59 at scale=3TB with AQE fails with join mismatch|
+|[#1274](https://github.com/NVIDIA/spark-rapids/issues/1274)|[BUG] OutOfMemoryError - Maximum pool size exceeded while running 24 day criteo ETL Transform stage|
+|[#1497](https://github.com/NVIDIA/spark-rapids/issues/1497)|[BUG] Spark-rapids v0.3.0 pytest integration tests with UCX on FAILED on Yarn cluster|
+|[#1534](https://github.com/NVIDIA/spark-rapids/issues/1534)|[BUG] Spark 3.1.1 test failure in writing due to removal of InMemoryFileIndex.shouldFilterOut|
+|[#1155](https://github.com/NVIDIA/spark-rapids/issues/1155)|[BUG] on shutdown don't print `Socket closed` exception when shutting down UCX.scala|
+|[#1510](https://github.com/NVIDIA/spark-rapids/issues/1510)|[BUG] IllegalArgumentException during shuffle|
+|[#1513](https://github.com/NVIDIA/spark-rapids/issues/1513)|[BUG] executor not fully initialized may get calls from Spark, in the process setting the `catalog` incorrectly|
+|[#1466](https://github.com/NVIDIA/spark-rapids/issues/1466)|[BUG] Databricks build must run before the rapids nightly|
+|[#1456](https://github.com/NVIDIA/spark-rapids/issues/1456)|[BUG] Databricks 0.4 parquet integration tests fail|
+|[#1400](https://github.com/NVIDIA/spark-rapids/issues/1400)|[BUG] Regressions in spark-shell usage of benchmark utilities|
+|[#1119](https://github.com/NVIDIA/spark-rapids/issues/1119)|[BUG] inner join fails with Column size cannot be negative|
+|[#1079](https://github.com/NVIDIA/spark-rapids/issues/1079)|[BUG]The Scala UDF function cannot invoke the UDF compiler when it's passed to "explode"|
+|[#1298](https://github.com/NVIDIA/spark-rapids/issues/1298)|TPCxBB query16 failed at UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainIntegerDictionary|
+|[#1271](https://github.com/NVIDIA/spark-rapids/issues/1271)|[BUG] CastOpSuite and AnsiCastOpSuite failing with ArithmeticException on Spark 3.1|
+|[#84](https://github.com/NVIDIA/spark-rapids/issues/84)|[BUG] sort does not match spark for -0.0 and 0.0|
+|[#578](https://github.com/NVIDIA/spark-rapids/issues/578)|[BUG] Spark 3.1 qa_nightly_select_test.py Full join test failures|
+|[#586](https://github.com/NVIDIA/spark-rapids/issues/586)|[BUG] Spark3.1 tpch failures|
+|[#837](https://github.com/NVIDIA/spark-rapids/issues/837)|[BUG] Distinct count of floating point values differs with regular spark|
+|[#953](https://github.com/NVIDIA/spark-rapids/issues/953)|[BUG] 3.1.0 pos_explode tests are failing|
+|[#127](https://github.com/NVIDIA/spark-rapids/issues/127)|[BUG] String CSV parsing does not respect nullValues|
+|[#1203](https://github.com/NVIDIA/spark-rapids/issues/1203)|[BUG] tpcds query 51 fails with join error on Spark 3.1.0|
+|[#750](https://github.com/NVIDIA/spark-rapids/issues/750)|[BUG] udf_cudf_test::test_with_column fails with IPC error |
+|[#1348](https://github.com/NVIDIA/spark-rapids/issues/1348)|[BUG] Host columnar decimal conversions are failing|
+|[#1270](https://github.com/NVIDIA/spark-rapids/issues/1270)|[BUG] Benchmark runner fails to produce report if benchmark fails due to an invalid query plan|
+|[#1179](https://github.com/NVIDIA/spark-rapids/issues/1179)|[BUG] SerializeConcatHostBuffersDeserializeBatch may have thread issues|
+|[#1115](https://github.com/NVIDIA/spark-rapids/issues/1115)|[BUG] Unchecked type warning in SparkQueryCompareTestSuite|
+
+### PRs
+|||
+|:---|:---|
+|[#1842](https://github.com/NVIDIA/spark-rapids/pull/1842)|Update to note support for 3.0.2|
+|[#1832](https://github.com/NVIDIA/spark-rapids/pull/1832)|Spark 3.1.1 shim no longer a snapshot shim|
+|[#1831](https://github.com/NVIDIA/spark-rapids/pull/1831)|Spark 3.0.2 shim no longer a snapshot shim|
+|[#1826](https://github.com/NVIDIA/spark-rapids/pull/1826)|Remove benchmarks|
+|[#1828](https://github.com/NVIDIA/spark-rapids/pull/1828)|Update cudf dependency to 0.18|
+|[#1813](https://github.com/NVIDIA/spark-rapids/pull/1813)|Fix LEAD/LAG failures in Spark 3.1.1|
+|[#1819](https://github.com/NVIDIA/spark-rapids/pull/1819)|Fix scaladoc warning in GpuCast|
+|[#1820](https://github.com/NVIDIA/spark-rapids/pull/1820)|[BUG] make modified check pre-merge only|
+|[#1780](https://github.com/NVIDIA/spark-rapids/pull/1780)|Remove SNAPSHOT from test and integration_test READMEs|
+|[#1809](https://github.com/NVIDIA/spark-rapids/pull/1809)|check if modified files after update_config/supported|
+|[#1804](https://github.com/NVIDIA/spark-rapids/pull/1804)|Update UCX documentation for RX_QUEUE_LEN and Docker|
+|[#1810](https://github.com/NVIDIA/spark-rapids/pull/1810)|Pandas UDF: Sort the data before computing the sum.|
+|[#1751](https://github.com/NVIDIA/spark-rapids/pull/1751)|Exclude foldable expressions from GPU if constant folding is disabled|
+|[#1798](https://github.com/NVIDIA/spark-rapids/pull/1798)|Add documentation about explain not on GPU when AQE is on|
+|[#1766](https://github.com/NVIDIA/spark-rapids/pull/1766)|Branch 0.4 release docs|
+|[#1794](https://github.com/NVIDIA/spark-rapids/pull/1794)|Build python output schema from udf expressions|
+|[#1783](https://github.com/NVIDIA/spark-rapids/pull/1783)|Fix the collect_list over window tests failures on db|
+|[#1781](https://github.com/NVIDIA/spark-rapids/pull/1781)|Better float/double cases for casting tests|
+|[#1790](https://github.com/NVIDIA/spark-rapids/pull/1790)|Record row counts in benchmark runs that call collect|
+|[#1779](https://github.com/NVIDIA/spark-rapids/pull/1779)|Add support of DateType and TimestampType for GetTimestamp expression|
+|[#1768](https://github.com/NVIDIA/spark-rapids/pull/1768)|Updating getting started Databricks docs|
+|[#1742](https://github.com/NVIDIA/spark-rapids/pull/1742)|Fix regression supporting to_date with Spark-3.1|
+|[#1775](https://github.com/NVIDIA/spark-rapids/pull/1775)|Fix ambiguous ordering for some tests|
+|[#1760](https://github.com/NVIDIA/spark-rapids/pull/1760)|Update GpuDataSourceScanExec and GpuBroadcastExchangeExec to fix audit issues|
+|[#1750](https://github.com/NVIDIA/spark-rapids/pull/1750)|Detect task failures in benchmarks|
+|[#1767](https://github.com/NVIDIA/spark-rapids/pull/1767)|Consistent Spark version for test and production|
+|[#1741](https://github.com/NVIDIA/spark-rapids/pull/1741)|Reduce regex use in CAST|
+|[#1756](https://github.com/NVIDIA/spark-rapids/pull/1756)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
+|[#1716](https://github.com/NVIDIA/spark-rapids/pull/1716)|Update RapidsShuffleManager documentation for branch 0.4|
+|[#1740](https://github.com/NVIDIA/spark-rapids/pull/1740)|Disable ORC writes until bug can be fixed|
+|[#1747](https://github.com/NVIDIA/spark-rapids/pull/1747)|Fix resource leaks in unit tests|
+|[#1725](https://github.com/NVIDIA/spark-rapids/pull/1725)|Branch 0.4 FAQ reorg|
+|[#1718](https://github.com/NVIDIA/spark-rapids/pull/1718)|CAST string to temporal type now calls isTimestamp|
+|[#1734](https://github.com/NVIDIA/spark-rapids/pull/1734)|Disable range partitioning if computation is needed|
+|[#1723](https://github.com/NVIDIA/spark-rapids/pull/1723)|Removed StructTypes support for ParquetCachedBatchSerializer as cudf doesn't support it yet|
+|[#1714](https://github.com/NVIDIA/spark-rapids/pull/1714)|Add support for RAPIDS accelerated Java UDFs|
+|[#1713](https://github.com/NVIDIA/spark-rapids/pull/1713)|Call GpuDeviceManager.shutdown when the executor plugin is shutting down|
+|[#1596](https://github.com/NVIDIA/spark-rapids/pull/1596)|Added in Decimal support to ParquetCachedBatchSerializer|
+|[#1706](https://github.com/NVIDIA/spark-rapids/pull/1706)|cleanup unused is_before_spark_310|
+|[#1685](https://github.com/NVIDIA/spark-rapids/pull/1685)|Fix CustomShuffleReader replacement when decimal types enabled|
+|[#1699](https://github.com/NVIDIA/spark-rapids/pull/1699)|Add docs about Spark 3.1 in standalone modes not needing extra class path|
+|[#1701](https://github.com/NVIDIA/spark-rapids/pull/1701)|remove xfail for orc test_input_meta for spark 3.1.0|
+|[#1703](https://github.com/NVIDIA/spark-rapids/pull/1703)|Remove xfail for spark 3.1.0 test_broadcast_join_mixed FullOuter|
+|[#1676](https://github.com/NVIDIA/spark-rapids/pull/1676)|BenchmarkRunner option to generate query plan diagrams in DOT format|
+|[#1695](https://github.com/NVIDIA/spark-rapids/pull/1695)|support alternate jar paths|
+|[#1694](https://github.com/NVIDIA/spark-rapids/pull/1694)|increase mem and limit parallelism for pre-merge|
+|[#1691](https://github.com/NVIDIA/spark-rapids/pull/1691)|add validate_execs_in_gpu_plan to pytest.ini|
+|[#1692](https://github.com/NVIDIA/spark-rapids/pull/1692)|Add the integration test resources to the test tarball|
+|[#1677](https://github.com/NVIDIA/spark-rapids/pull/1677)|When PTDS is enabled, print warning if the allocator is not ARENA|
+|[#1683](https://github.com/NVIDIA/spark-rapids/pull/1683)|update changelog to verify autotmerge 0.5 setup [skip ci]|
+|[#1673](https://github.com/NVIDIA/spark-rapids/pull/1673)|support auto-merge for branch 0.5 [skip ci]|
+|[#1681](https://github.com/NVIDIA/spark-rapids/pull/1681)|Xfail the collect_list tests for databricks|
+|[#1678](https://github.com/NVIDIA/spark-rapids/pull/1678)|Fix array/struct checks in Sort and HashAggregate and sorting tests in distributed mode|
+|[#1671](https://github.com/NVIDIA/spark-rapids/pull/1671)|Allow metrics to be configurable by level|
+|[#1675](https://github.com/NVIDIA/spark-rapids/pull/1675)|add run_pyspark_from_build.sh to the pytest distribution tarball|
+|[#1548](https://github.com/NVIDIA/spark-rapids/pull/1548)|Support executing collect_list on GPU with windowing.|
+|[#1593](https://github.com/NVIDIA/spark-rapids/pull/1593)|Avoid unnecessary Table instances after contiguous split|
+|[#1592](https://github.com/NVIDIA/spark-rapids/pull/1592)|Add in support for Decimal divide|
+|[#1668](https://github.com/NVIDIA/spark-rapids/pull/1668)|Implement way for python integration tests to validate Exec is in GPU plan|
+|[#1669](https://github.com/NVIDIA/spark-rapids/pull/1669)|Add FAQ entries for executor-per-GPU questions|
+|[#1661](https://github.com/NVIDIA/spark-rapids/pull/1661)|Enable Parquet test for file containing map struct key|
+|[#1664](https://github.com/NVIDIA/spark-rapids/pull/1664)|Filter nulls for left semi and left anti join to work around cudf|
+|[#1665](https://github.com/NVIDIA/spark-rapids/pull/1665)|Add better automated tests for Arrow columnar copy in HostColumnarToGpu|
+|[#1614](https://github.com/NVIDIA/spark-rapids/pull/1614)|add alluxio getting start document|
+|[#1639](https://github.com/NVIDIA/spark-rapids/pull/1639)|support GpuScalarSubquery|
+|[#1656](https://github.com/NVIDIA/spark-rapids/pull/1656)|Move UDF to Catalyst Expressions to its own document|
+|[#1663](https://github.com/NVIDIA/spark-rapids/pull/1663)|BenchmarkRunner - Include query name in JSON summary filename|
+|[#1655](https://github.com/NVIDIA/spark-rapids/pull/1655)|Fix extraneous shuffles added by AQE|
+|[#1652](https://github.com/NVIDIA/spark-rapids/pull/1652)|Fix typo in arrow optimized config name - spark.rapids.arrowCopyOptimizationEnabled|
+|[#1645](https://github.com/NVIDIA/spark-rapids/pull/1645)|Run Databricks IT with python-xdist parallel, includes test fixes and xfail|
+|[#1649](https://github.com/NVIDIA/spark-rapids/pull/1649)|Move building from source docs to contributing guide|
+|[#1637](https://github.com/NVIDIA/spark-rapids/pull/1637)|Fail DivModLike on zero divisor in ANSI mode|
+|[#1646](https://github.com/NVIDIA/spark-rapids/pull/1646)|Update links in rapids-udfs.md after moving to subfolder|
+|[#1641](https://github.com/NVIDIA/spark-rapids/pull/1641)|Xfail struct and array order by tests on Dataproc|
+|[#1565](https://github.com/NVIDIA/spark-rapids/pull/1565)|Add GPU accelerated array_contains operator|
+|[#1617](https://github.com/NVIDIA/spark-rapids/pull/1617)|Enable nightly test checks for Apache Spark|
+|[#1636](https://github.com/NVIDIA/spark-rapids/pull/1636)|RAPIDS accelerated Spark Scala UDF support|
+|[#1634](https://github.com/NVIDIA/spark-rapids/pull/1634)|Fix databricks build since Arrow code added|
+|[#1599](https://github.com/NVIDIA/spark-rapids/pull/1599)|Add division by zero tests for Spark 3.1 behavior|
+|[#1619](https://github.com/NVIDIA/spark-rapids/pull/1619)|Update GpuFileSourceScanExec to be in sync with DataSourceScanExec|
+|[#1631](https://github.com/NVIDIA/spark-rapids/pull/1631)|Explicitly add maven-jar-plugin version to improve incremental build time.|
+|[#1624](https://github.com/NVIDIA/spark-rapids/pull/1624)|Update explain format to show what will and will not run on the GPU|
+|[#1622](https://github.com/NVIDIA/spark-rapids/pull/1622)|Support faster copy for a custom DataSource V2 which supplies Arrow data|
+|[#1621](https://github.com/NVIDIA/spark-rapids/pull/1621)|Additional functionality docs|
+|[#1618](https://github.com/NVIDIA/spark-rapids/pull/1618)|update blossom-ci for security updates [skip ci]|
+|[#1562](https://github.com/NVIDIA/spark-rapids/pull/1562)|add alluxio support|
+|[#1597](https://github.com/NVIDIA/spark-rapids/pull/1597)|Documentation for Parquet serializer|
+|[#1611](https://github.com/NVIDIA/spark-rapids/pull/1611)|Add in flag for integration tests to not skip required tests|
+|[#1609](https://github.com/NVIDIA/spark-rapids/pull/1609)|Disable float round/bround by default|
+|[#1615](https://github.com/NVIDIA/spark-rapids/pull/1615)|Add in window support for average|
+|[#1610](https://github.com/NVIDIA/spark-rapids/pull/1610)|Limit length of spark app name in BenchmarkRunner|
+|[#1579](https://github.com/NVIDIA/spark-rapids/pull/1579)|Support TakeOrderedAndProject|
+|[#1581](https://github.com/NVIDIA/spark-rapids/pull/1581)|Support Decimal type for CollectLimitExec|
+|[#1591](https://github.com/NVIDIA/spark-rapids/pull/1591)|Add support for running multiple queries in BenchmarkRunner|
+|[#1595](https://github.com/NVIDIA/spark-rapids/pull/1595)|Fix Github documentation issue template|
+|[#1577](https://github.com/NVIDIA/spark-rapids/pull/1577)|rename directory from spark310 to spark311|
+|[#1578](https://github.com/NVIDIA/spark-rapids/pull/1578)|Test to track RAPIDS-side issues re SPARK-32639|
+|[#1583](https://github.com/NVIDIA/spark-rapids/pull/1583)|fix request-action issue [skip ci]|
+|[#1555](https://github.com/NVIDIA/spark-rapids/pull/1555)|Enable ANSI mode for CAST string to timestamp|
+|[#1531](https://github.com/NVIDIA/spark-rapids/pull/1531)|Decimal Support for writing Parquet|
+|[#1545](https://github.com/NVIDIA/spark-rapids/pull/1545)|Support comparing ORC data|
+|[#1570](https://github.com/NVIDIA/spark-rapids/pull/1570)|Branch 0.4 doc cleanup|
+|[#1569](https://github.com/NVIDIA/spark-rapids/pull/1569)|Add shim method shouldIgnorePath|
+|[#1564](https://github.com/NVIDIA/spark-rapids/pull/1564)|Add in support for Decimal Multiply and DIV|
+|[#1561](https://github.com/NVIDIA/spark-rapids/pull/1561)|Decimal support for add and subtract|
+|[#1560](https://github.com/NVIDIA/spark-rapids/pull/1560)|support sum in window aggregation for decimal|
+|[#1546](https://github.com/NVIDIA/spark-rapids/pull/1546)|Cleanup shutdown logging for UCX shuffle|
+|[#1551](https://github.com/NVIDIA/spark-rapids/pull/1551)|RAPIDS-accelerated Hive UDFs support all types|
+|[#1543](https://github.com/NVIDIA/spark-rapids/pull/1543)|Shuffle/transport enabled by default|
+|[#1552](https://github.com/NVIDIA/spark-rapids/pull/1552)|Disable blackduck signature check|
+|[#1540](https://github.com/NVIDIA/spark-rapids/pull/1540)|Handle ShuffleManager api calls when plugin is not fully initialized|
+|[#1547](https://github.com/NVIDIA/spark-rapids/pull/1547)|Cleanup shuffle transport receive calls|
+|[#1512](https://github.com/NVIDIA/spark-rapids/pull/1512)|Support window operations on Decimal|
+|[#1532](https://github.com/NVIDIA/spark-rapids/pull/1532)|Support casting from decimal to decimal|
+|[#1542](https://github.com/NVIDIA/spark-rapids/pull/1542)|Change the number of partitions to zero when a range is empty|
+|[#1506](https://github.com/NVIDIA/spark-rapids/pull/1506)|Add --use-decimals flag to TPC-DS ConvertFiles|
+|[#1511](https://github.com/NVIDIA/spark-rapids/pull/1511)|Remove unused Jenkinsfiles [skip ci]|
+|[#1505](https://github.com/NVIDIA/spark-rapids/pull/1505)|Add least, greatest and eqNullSafe support for DecimalType|
+|[#1484](https://github.com/NVIDIA/spark-rapids/pull/1484)|add doc for nsight systems bundled with cuda toolkit|
+|[#1478](https://github.com/NVIDIA/spark-rapids/pull/1478)|Documentation for RAPIDS-accelerated Hive UDFs|
+|[#1477](https://github.com/NVIDIA/spark-rapids/pull/1477)|Allow structs and arrays to pass through for Shuffle and Sort |
+|[#1489](https://github.com/NVIDIA/spark-rapids/pull/1489)|Adds in some support for the array sql function|
+|[#1438](https://github.com/NVIDIA/spark-rapids/pull/1438)|Cast from numeric types to decimal type|
+|[#1493](https://github.com/NVIDIA/spark-rapids/pull/1493)|Moved ParquetRecordMaterializer to the shim package to follow convention|
+|[#1495](https://github.com/NVIDIA/spark-rapids/pull/1495)|Fix merge conflict, merge branch 0.3 to branch 0.4 [skip ci]|
+|[#1472](https://github.com/NVIDIA/spark-rapids/pull/1472)|Add an example RAPIDS-accelerated Hive UDF using native code|
+|[#1488](https://github.com/NVIDIA/spark-rapids/pull/1488)|Rename Spark 3.1.0 shim to Spark 3.1.1 to match community|
+|[#1474](https://github.com/NVIDIA/spark-rapids/pull/1474)|Fix link|
+|[#1476](https://github.com/NVIDIA/spark-rapids/pull/1476)|DecimalType support for Aggregate Count|
+|[#1475](https://github.com/NVIDIA/spark-rapids/pull/1475)| Join support for DecimalType|
+|[#1244](https://github.com/NVIDIA/spark-rapids/pull/1244)|Support round and bround SQL functions |
+|[#1458](https://github.com/NVIDIA/spark-rapids/pull/1458)|Add in support for struct and named_struct|
+|[#1465](https://github.com/NVIDIA/spark-rapids/pull/1465)|DecimalType support for UnionExec and ExpandExec|
+|[#1450](https://github.com/NVIDIA/spark-rapids/pull/1450)|Add dynamic configs for the spark-rapids IT pipelines|
+|[#1207](https://github.com/NVIDIA/spark-rapids/pull/1207)|Spark SQL hash function using murmur3|
+|[#1457](https://github.com/NVIDIA/spark-rapids/pull/1457)|Support reading decimal columns from parquet files on Databricks|
+|[#1455](https://github.com/NVIDIA/spark-rapids/pull/1455)|Upgrade Scala Maven Plugin to 4.3.0|
+|[#1453](https://github.com/NVIDIA/spark-rapids/pull/1453)|DecimalType support for IfElse and Coalesce|
+|[#1452](https://github.com/NVIDIA/spark-rapids/pull/1452)|Support DecimalType for CaseWhen|
+|[#1444](https://github.com/NVIDIA/spark-rapids/pull/1444)|Improve UX when running benchmarks from Spark shell|
+|[#1294](https://github.com/NVIDIA/spark-rapids/pull/1294)|Support reading decimal columns from parquet files|
+|[#1153](https://github.com/NVIDIA/spark-rapids/pull/1153)|Scala UDF will compile children expressions in Project|
+|[#1416](https://github.com/NVIDIA/spark-rapids/pull/1416)|Optimize mvn dependency download scripts|
+|[#1430](https://github.com/NVIDIA/spark-rapids/pull/1430)|Add project for testing code that requires Spark 3.1.0 or later|
+|[#1425](https://github.com/NVIDIA/spark-rapids/pull/1425)|Add in Decimal support for abs, floor, ceil, unary - and unary +|
+|[#1427](https://github.com/NVIDIA/spark-rapids/pull/1427)|Revert "Make the multi-threaded parquet reader the default"|
+|[#1420](https://github.com/NVIDIA/spark-rapids/pull/1420)|Add udf jar to nightly integration tests|
+|[#1422](https://github.com/NVIDIA/spark-rapids/pull/1422)|Log the number of concurrent gpu tasks allowed on Executor startup|
+|[#1401](https://github.com/NVIDIA/spark-rapids/pull/1401)|Accelerate the coalescing parquet reader when reading files from multiple partitioned folders|
+|[#1413](https://github.com/NVIDIA/spark-rapids/pull/1413)|Add config for cast float to integral types|
+|[#1313](https://github.com/NVIDIA/spark-rapids/pull/1313)|Support spilling to disk directly via cuFile/GDS|
+|[#1411](https://github.com/NVIDIA/spark-rapids/pull/1411)|Add udf-examples jar to databricks build|
+|[#1412](https://github.com/NVIDIA/spark-rapids/pull/1412)|Fix a lot of tests marked with xfail for Spark 3.1.0 that no longer fail|
+|[#1414](https://github.com/NVIDIA/spark-rapids/pull/1414)|Build merged code of HEAD and BASE branch for pre-merge [skip ci]|
+|[#1409](https://github.com/NVIDIA/spark-rapids/pull/1409)|Add option to use decimals in tpc-ds csv to parquet conversion|
+|[#1410](https://github.com/NVIDIA/spark-rapids/pull/1410)|Add Decimal support for In, InSet, AtLeastNNonNulls, GetArrayItem, GetStructField, and GenerateExec|
+|[#1408](https://github.com/NVIDIA/spark-rapids/pull/1408)|Support RAPIDS-accelerated HiveGenericUDF|
+|[#1407](https://github.com/NVIDIA/spark-rapids/pull/1407)|Update docs and tests for null CSV support|
+|[#1393](https://github.com/NVIDIA/spark-rapids/pull/1393)|Support RAPIDS-accelerated HiveSimpleUDF|
+|[#1392](https://github.com/NVIDIA/spark-rapids/pull/1392)|Turn on hash partitioning for decimal support|
+|[#1402](https://github.com/NVIDIA/spark-rapids/pull/1402)|Better GPU Cast type checks|
+|[#1404](https://github.com/NVIDIA/spark-rapids/pull/1404)|Fix branch 0.4 merge conflict|
+|[#1323](https://github.com/NVIDIA/spark-rapids/pull/1323)|More advanced type checking and documentation|
+|[#1391](https://github.com/NVIDIA/spark-rapids/pull/1391)|Remove extra null join filtering because cudf is fast for this now.|
+|[#1395](https://github.com/NVIDIA/spark-rapids/pull/1395)|Fix branch-0.3 -> branch-0.4 automerge|
+|[#1382](https://github.com/NVIDIA/spark-rapids/pull/1382)|Handle "MM[/-]dd" and "dd[/-]MM" datetime formats in UnixTimeExprMeta|
+|[#1390](https://github.com/NVIDIA/spark-rapids/pull/1390)|Accelerated columnar to row/row to columnar for decimal|
+|[#1380](https://github.com/NVIDIA/spark-rapids/pull/1380)|Adds in basic support for decimal sort, sum, and some shuffle|
+|[#1367](https://github.com/NVIDIA/spark-rapids/pull/1367)|Reuse gpu expression conversion rules when checking sort order|
+|[#1349](https://github.com/NVIDIA/spark-rapids/pull/1349)|Add canonicalization tests|
+|[#1368](https://github.com/NVIDIA/spark-rapids/pull/1368)|Move to cudf 0.18-SNAPSHOT|
+|[#1361](https://github.com/NVIDIA/spark-rapids/pull/1361)|Use the correct precision when reading spark columnar data.|
+|[#1273](https://github.com/NVIDIA/spark-rapids/pull/1273)|Update docs and scripts to 0.4.0-SNAPSHOT|
+|[#1321](https://github.com/NVIDIA/spark-rapids/pull/1321)|Refactor to stop inheriting from HashJoin|
+|[#1311](https://github.com/NVIDIA/spark-rapids/pull/1311)|ParquetCachedBatchSerializer code cleanup|
+|[#1303](https://github.com/NVIDIA/spark-rapids/pull/1303)|Add explicit outputOrdering for BHJ and SHJ in spark310 shim|
+|[#1299](https://github.com/NVIDIA/spark-rapids/pull/1299)|Benchmark runner improved error handling|
 
 ## Release 0.3
 
diff --git a/scripts/generate-changelog b/scripts/generate-changelog
index 31fbba8f0b7..5b888ab6f35 100755
--- a/scripts/generate-changelog
+++ b/scripts/generate-changelog
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -44,11 +44,11 @@ Github personal access token: https://github.com/settings/tokens, and make you h
 Usage:
     cd spark-rapids/
 
-    # generate changelog for release 0.1,0.2,0.3
-    scripts/generate-changelog --token=<GITHUB_PERSONAL_ACCESS_TOKEN> --releases=0.1,0.2,0.3
+    # generate changelog for release 0.1,0.2,0.3,0.4
+    scripts/generate-changelog --token=<GITHUB_PERSONAL_ACCESS_TOKEN> --releases=0.1,0.2,0.3,0.4
 
-    # generate changelog for release 0.1,0.2,0.3 to /tmp/CHANGELOG.md
-    GITHUB_TOKEN=<GITHUB_PERSONAL_ACCESS_TOKEN> scripts/generate-changelog --releases=0.1,0.2,0.3 --path=/tmp/CHANGELOG.md
+    # generate changelog for release 0.1,0.2,0.3,0.4 to /tmp/CHANGELOG.md
+    GITHUB_TOKEN=<GITHUB_PERSONAL_ACCESS_TOKEN> scripts/generate-changelog --releases=0.1,0.2,0.3,0.4 --path=/tmp/CHANGELOG.md
 """
 import os
 import sys