From 441e15ef5f113c7f8e89ef89074493c35006fd42 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Fri, 24 Nov 2023 01:30:59 -0800 Subject: [PATCH 01/40] [SPARK-46087][PYTHON] Sync PySpark dependencies in docs and dev requirements ### What changes were proposed in this pull request? This PR proposes to synchronize the versions of dependencies listed in the [PySpark documentation](https://spark.apache.org/docs/latest/api/python/getting_started/install.html#dependencies) with those specified in the [dev/requirements.txt](https://github.com/apache/spark/blob/master/dev/requirements.txt) file. ### Why are the changes needed? Aligning the versions of dependencies ensures that the development environment reflects the actual user environment more accurately. ### Does this PR introduce _any_ user-facing change? No API changes. ### How was this patch tested? Build the documents from latest master branch manually and sync the version of dependencies: Screenshot 2023-11-24 at 2 49 09 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44000 from itholic/req-sync. Authored-by: Haejoon Lee Signed-off-by: Dongjoon Hyun --- dev/requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index 66a74471377dd..7de55ec24968a 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,11 +1,11 @@ # PySpark dependencies (required) -py4j +py4j>=0.10.9.7 # PySpark dependencies (optional) -numpy -pyarrow +numpy>=1.21 +pyarrow>=4.0.0 six==1.16.0 -pandas +pandas>=1.4.4 scipy plotly mlflow>=2.3.1 @@ -52,8 +52,8 @@ black==23.9.1 py # Spark Connect (required) -grpcio==1.59.3 -grpcio-status==1.59.3 +grpcio>=1.59.3 +grpcio-status>=1.59.3 protobuf==4.25.1 googleapis-common-protos>=1.56.4 From 132bb63a897f4f4049f34deefc065ed3eac6a90f Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Fri, 24 Nov 2023 19:38:31 +0900 Subject: [PATCH 02/40] [SPARK-46016][DOCS][PS] Fix pandas API support list properly ### What changes were proposed in this pull request? This PR proposes to fix a critical issue in the [Supported pandas API documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/supported_pandas_api.html) where many essential APIs such as `DataFrame.max`, `DataFrame.min`, `DataFrame.mean`, `and DataFrame.median`, etc. were incorrectly marked as not implemented - marked as "N" - as below: Screenshot 2023-11-24 at 12 37 49 PM The root cause of this issue was that the script used to generate the support list excluded functions inherited from parent classes. For instance, `CategoricalIndex.max` is actually supported by inheriting the `Index` class but was not directly implemented in `CategoricalIndex`, leading to it being marked as unsupported: Screenshot 2023-11-24 at 12 30 08 PM ### Why are the changes needed? The current documentation inaccurately represents the state of supported pandas API, which could significantly hinder user experience and adoption. By correcting these inaccuracies, we ensure that the documentation reflects the true capabilities of Pandas API on Spark, providing users with reliable and accurate information. ### Does this PR introduce _any_ user-facing change? No. This PR only updates the documentation to accurately reflect the current state of supported pandas API. ### How was this patch tested? 
Manually build documentation, and check if the supported pandas API list is correctly generated as below: Screenshot 2023-11-24 at 12 36 31 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43996 from itholic/fix_supported_api_gen. Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/supported_api_gen.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index a83731db8fc16..27d5cd4b37f9d 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -138,23 +138,11 @@ def _create_supported_by_module( # module not implemented return {} - pd_funcs = dict( - [ - m - for m in getmembers(pd_module, isfunction) - if not m[0].startswith("_") and m[0] in pd_module.__dict__ - ] - ) + pd_funcs = dict([m for m in getmembers(pd_module, isfunction) if not m[0].startswith("_")]) if not pd_funcs: return {} - ps_funcs = dict( - [ - m - for m in getmembers(ps_module, isfunction) - if not m[0].startswith("_") and m[0] in ps_module.__dict__ - ] - ) + ps_funcs = dict([m for m in getmembers(ps_module, isfunction) if not m[0].startswith("_")]) return _organize_by_implementation_status( module_name, pd_funcs, ps_funcs, pd_module_group, ps_module_group From 2f6a38cfcb384b4f504e1c08264887ae90d441bc Mon Sep 17 00:00:00 2001 From: Alice Sayutina Date: Sat, 25 Nov 2023 09:52:27 +0900 Subject: [PATCH 03/40] [SPARK-45922][CONNECT][CLIENT] Minor retries refactoring (follow-up to multiple policies) ### What changes were proposed in this pull request? Follow up to https://github.com/apache/spark/pull/43591. Refactor default policy arguments into being an arguments on the class, not within core.py ### Why are the changes needed? General refactoring, also makes it easier for other policies to derive. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing coverage ### Was this patch authored or co-authored using generative AI tooling? No Closes #43800 from cdkrot/SPARK-45922. 
Authored-by: Alice Sayutina Signed-off-by: Hyukjin Kwon --- .../sql/connect/client/RetryPolicy.scala | 2 +- python/pyspark/sql/connect/client/core.py | 19 +--------- python/pyspark/sql/connect/client/retries.py | 37 ++++++++++++++++--- .../sql/tests/connect/client/test_client.py | 3 +- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala index cb5b97f2e4aff..8c8472d780dbc 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala @@ -55,7 +55,7 @@ object RetryPolicy { def defaultPolicy(): RetryPolicy = RetryPolicy( name = "DefaultPolicy", // Please synchronize changes here with Python side: - // pyspark/sql/connect/client/core.py + // pyspark/sql/connect/client/retries.py // // Note: these constants are selected so that the maximum tolerated wait is guaranteed // to be at least 10 minutes diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 58b48bd69ba43..5d8db69c641ff 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -595,23 +595,8 @@ def __init__( self._user_id = None self._retry_policies: List[RetryPolicy] = [] - default_policy_args = { - # Please synchronize changes here with Scala side - # GrpcRetryHandler.scala - # - # Note: the number of retries is selected so that the maximum tolerated wait - # is guaranteed to be at least 10 minutes - "max_retries": 15, - "backoff_multiplier": 4.0, - "initial_backoff": 50, - "max_backoff": 60000, - "jitter": 500, - "min_jitter_threshold": 2000, - } - if retry_policy: - default_policy_args.update(retry_policy) - - default_policy = DefaultPolicy(**default_policy_args) + retry_policy_args = retry_policy or dict() + default_policy = DefaultPolicy(**retry_policy_args) self.set_retry_policies([default_policy]) if self._builder.session_id is None: diff --git a/python/pyspark/sql/connect/client/retries.py b/python/pyspark/sql/connect/client/retries.py index 6aa959e09b5b0..26aa6893dfae5 100644 --- a/python/pyspark/sql/connect/client/retries.py +++ b/python/pyspark/sql/connect/client/retries.py @@ -185,6 +185,9 @@ def __init__( self._done = False def can_retry(self, exception: BaseException) -> bool: + if isinstance(exception, RetryException): + return True + return any(policy.can_retry(exception) for policy in self._policies) def accept_exception(self, exception: BaseException) -> bool: @@ -204,8 +207,12 @@ def _last_exception(self) -> BaseException: def _wait(self) -> None: exception = self._last_exception() - # Attempt to find a policy to wait with + if isinstance(exception, RetryException): + # Considered immediately retriable + logger.debug(f"Got error: {repr(exception)}. 
Retrying.") + return + # Attempt to find a policy to wait with for policy in self._policies: if not policy.can_retry(exception): continue @@ -244,12 +251,34 @@ def __iter__(self) -> Generator[AttemptManager, None, None]: class RetryException(Exception): """ An exception that can be thrown upstream when inside retry and which is always retryable + even without policies """ class DefaultPolicy(RetryPolicy): - def __init__(self, **kwargs): # type: ignore[no-untyped-def] - super().__init__(**kwargs) + # Please synchronize changes here with Scala side in + # org.apache.spark.sql.connect.client.RetryPolicy + # + # Note: the number of retries is selected so that the maximum tolerated wait + # is guaranteed to be at least 10 minutes + + def __init__( + self, + max_retries: Optional[int] = 15, + backoff_multiplier: float = 4.0, + initial_backoff: int = 50, + max_backoff: Optional[int] = 60000, + jitter: int = 500, + min_jitter_threshold: int = 2000, + ): + super().__init__( + max_retries=max_retries, + backoff_multiplier=backoff_multiplier, + initial_backoff=initial_backoff, + max_backoff=max_backoff, + jitter=jitter, + min_jitter_threshold=min_jitter_threshold, + ) def can_retry(self, e: BaseException) -> bool: """ @@ -267,8 +296,6 @@ def can_retry(self, e: BaseException) -> bool: True if the exception can be retried, False otherwise. """ - if isinstance(e, RetryException): - return True if not isinstance(e, grpc.RpcError): return False diff --git a/python/pyspark/sql/tests/connect/client/test_client.py b/python/pyspark/sql/tests/connect/client/test_client.py index 580ebc3965bb5..12e690c3a3099 100644 --- a/python/pyspark/sql/tests/connect/client/test_client.py +++ b/python/pyspark/sql/tests/connect/client/test_client.py @@ -31,7 +31,6 @@ from pyspark.sql.connect.client.retries import ( Retrying, DefaultPolicy, - RetryException, RetriesExceeded, ) from pyspark.sql.connect.client.reattach import ExecutePlanResponseReattachableIterator @@ -111,7 +110,7 @@ def sleep(t): try: for attempt in Retrying(client._retry_policies, sleep=sleep): with attempt: - raise RetryException() + raise TestException("Retryable error", grpc.StatusCode.UNAVAILABLE) except RetriesExceeded: pass From 50f189b3f48aad21307d52ad0c90ff4d9ac5e06d Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Fri, 24 Nov 2023 17:26:43 -0800 Subject: [PATCH 04/40] [SPARK-46066][SQL] Use the Separators API instead of the String API to construct the `DefaultPrettyPrinter` ### What changes were proposed in this pull request? This pr use the `Separators` API instead of the `String` API to construct Jackson `DefaultPrettyPrinter` due to the `String` API has been marked as deprecated in Jackson 2.16.0: ```java /** * Constructor that specifies separator String to use between root values; * if null, no separator is printed. *

* Note: simply constructs a {link SerializedString} out of parameter, * calls {link #DefaultPrettyPrinter(SerializableString)} * * param rootSeparator String to use as root value separator * deprecated in 2.16. Use the Separators API instead. */ Deprecated public DefaultPrettyPrinter(String rootSeparator) { this((rootSeparator == null) ? null : new SerializedString(rootSeparator)); } ``` ### Why are the changes needed? Clean up deprecated Jackson API usage. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #43973 from LuciferYang/jackson-216-Deprecated. Lead-authored-by: yangjie01 Co-authored-by: YangJie Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/sql/catalyst/json/JacksonGenerator.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index e02b286061861..e01457ff10255 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -75,7 +75,8 @@ class JacksonGenerator( private val gen = { val generator = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) if (options.pretty) { - generator.setPrettyPrinter(new DefaultPrettyPrinter("")) + generator.setPrettyPrinter( + new DefaultPrettyPrinter(PrettyPrinter.DEFAULT_SEPARATORS.withRootSeparator(""))) } if (options.writeNonAsciiCharacterAsCodePoint) { generator.setHighestNonEscapedChar(0x7F) From a694a8a0be540e5d60d7f462e0761c4ba3b8b3e6 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 24 Nov 2023 17:31:22 -0800 Subject: [PATCH 05/40] [SPARK-46095][DOCS] Document `REST API` for Spark Standalone Cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR aims to document `REST API` for Spark Standalone Cluster. ### Why are the changes needed? To help the users to understand Apache Spark features. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual review. `REST API` Section is added newly. **AFTER** Screenshot 2023-11-24 at 4 13 53 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44007 from dongjoon-hyun/SPARK-46095. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- docs/spark-standalone.md | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index ce739cb90b531..2ab68d2a8049f 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -518,6 +518,8 @@ Spark applications supports the following configuration properties specific to s # Launching Spark Applications +## Spark Protocol + The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to submit a compiled Spark application to the cluster. For standalone clusters, Spark currently supports two deploy modes. In `client` mode, the driver is launched in the same process as the @@ -540,6 +542,84 @@ failing repeatedly, you may do so through: You can find the driver ID through the standalone Master web UI at `http://:8080`. 
+## REST API + +If `spark.master.rest.enabled` is enabled, Spark master provides additional REST API +via http://[host:port]/[version]/submissions/[action] where +host is the master host, and +port is the port number specified by `spark.master.rest.port` (default: 6066), and +version is a protocol version, v1 as of today, and +action is one of the following supported actions. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table">
+  <thead><tr><th>Command</th><th>Description</th><th>HTTP METHOD</th><th>Since Version</th></tr></thead>
+  <tr><td>create</td><td>Create a Spark driver via cluster mode.</td><td>POST</td><td>1.3.0</td></tr>
+  <tr><td>kill</td><td>Kill a single Spark driver.</td><td>POST</td><td>1.3.0</td></tr>
+  <tr><td>killall</td><td>Kill all running Spark drivers.</td><td>POST</td><td>4.0.0</td></tr>
+  <tr><td>status</td><td>Check the status of a Spark job.</td><td>GET</td><td>1.3.0</td></tr>
+  <tr><td>clear</td><td>Clear the completed drivers and applications.</td><td>POST</td><td>4.0.0</td></tr>
+</table>
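Not part of the committed doc, but for orientation: the `status` and `kill` actions in the table above take the submission ID as the final path segment of the same `/[version]/submissions/[action]` URL pattern. A minimal sketch, reusing the doc's `IP:PORT` placeholder and the submission ID that appears in the response example further below:

```bash
# Query the status of an existing submission (GET).
$ curl http://IP:PORT/v1/submissions/status/driver-20231124153531-0000

# Kill a single submission (POST).
$ curl -XPOST http://IP:PORT/v1/submissions/kill/driver-20231124153531-0000
```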
+ +The following is a curl CLI command example with the `pi.py` and REST API. + +```bash +$ curl -XPOST http://IP:PORT/v1/submissions/create \ +--header "Content-Type:application/json;charset=UTF-8" \ +--data '{ + "appResource": "", + "sparkProperties": { + "spark.master": "spark://master:7077", + "spark.app.name": "Spark Pi", + "spark.driver.memory": "1g", + "spark.driver.cores": "1", + "spark.jars": "" + }, + "clientSparkVersion": "", + "mainClass": "org.apache.spark.deploy.SparkSubmit", + "environmentVariables": { }, + "action": "CreateSubmissionRequest", + "appArgs": [ "/opt/spark/examples/src/main/python/pi.py", "10" ] +}' +``` + +The following is the response from the REST API for the above create request. + +```bash +{ + "action" : "CreateSubmissionResponse", + "message" : "Driver successfully submitted as driver-20231124153531-0000", + "serverSparkVersion" : "4.0.0", + "submissionId" : "driver-20231124153531-0000", + "success" : true +} +``` + + # Resource Scheduling The standalone cluster mode currently only supports a simple FIFO scheduler across applications. From 5211f6b140a74bd28f7e05934508bdafdbe7f237 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 24 Nov 2023 17:52:23 -0800 Subject: [PATCH 06/40] [SPARK-46085][CONNECT] Dataset.groupingSets in Scala Spark Connect client ### What changes were proposed in this pull request? This PR proposes to add `Dataset.groupingsets` API added from https://github.com/apache/spark/pull/43813 to Scala Spark Connect cleint. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? Yes, it adds a new API to Scala Spark Connect client. ### How was this patch tested? Unittest was added. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43995 from HyukjinKwon/SPARK-46085. Authored-by: Hyukjin Kwon Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/Dataset.scala | 35 ++++++++++++ .../spark/sql/RelationalGroupedDataset.scala | 8 ++- .../spark/sql/PlanGenerationTestSuite.scala | 6 +++ .../explain-results/groupingSets.explain | 4 ++ .../query-tests/queries/groupingSets.json | 50 ++++++++++++++++++ .../queries/groupingSets.proto.bin | Bin 0 -> 106 bytes 6 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala index a1e57226e530f..d760c9d97693b 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1532,6 +1532,41 @@ class Dataset[T] private[sql] ( proto.Aggregate.GroupType.GROUP_TYPE_CUBE) } + /** + * Create multi-dimensional aggregation for the current Dataset using the specified grouping + * sets, so we can run aggregation on them. See [[RelationalGroupedDataset]] for all the + * available aggregate functions. + * + * {{{ + * // Compute the average for all numeric columns group by specific grouping sets. 
+ * ds.groupingSets(Seq(Seq($"department", $"group"), Seq()), $"department", $"group").avg() + * + * // Compute the max age and average salary, group by specific grouping sets. + * ds.groupingSets(Seq($"department", $"gender"), Seq()), $"department", $"group").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 4.0.0 + */ + @scala.annotation.varargs + def groupingSets(groupingSets: Seq[Seq[Column]], cols: Column*): RelationalGroupedDataset = { + val groupingSetMsgs = groupingSets.map { groupingSet => + val groupingSetMsg = proto.Aggregate.GroupingSets.newBuilder() + for (groupCol <- groupingSet) { + groupingSetMsg.addGroupingSet(groupCol.expr) + } + groupingSetMsg.build() + } + new RelationalGroupedDataset( + toDF(), + cols, + proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS, + groupingSets = Some(groupingSetMsgs)) + } + /** * (Scala-specific) Aggregates on the entire Dataset without groups. * {{{ diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 5ed97e45c7701..776a6231eaecd 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -39,7 +39,8 @@ class RelationalGroupedDataset private[sql] ( private[sql] val df: DataFrame, private[sql] val groupingExprs: Seq[Column], groupType: proto.Aggregate.GroupType, - pivot: Option[proto.Aggregate.Pivot] = None) { + pivot: Option[proto.Aggregate.Pivot] = None, + groupingSets: Option[Seq[proto.Aggregate.GroupingSets]] = None) { private[this] def toDF(aggExprs: Seq[Column]): DataFrame = { df.sparkSession.newDataFrame { builder => @@ -60,6 +61,11 @@ class RelationalGroupedDataset private[sql] ( builder.getAggregateBuilder .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_PIVOT) .setPivot(pivot.get) + case proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS => + assert(groupingSets.isDefined) + val aggBuilder = builder.getAggregateBuilder + .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS) + groupingSets.get.foreach(aggBuilder.addGroupingSets) case g => throw new UnsupportedOperationException(g.toString) } } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 5cc63bc45a04a..c5c917ebfa955 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -3017,6 +3017,12 @@ class PlanGenerationTestSuite simple.groupBy(Column("id")).pivot("a").agg(functions.count(Column("b"))) } + test("groupingSets") { + simple + .groupingSets(Seq(Seq(fn.col("a")), Seq.empty[Column]), fn.col("a")) + .agg("a" -> "max", "a" -> "count") + } + test("width_bucket") { simple.select(fn.width_bucket(fn.col("b"), fn.col("b"), fn.col("b"), fn.col("a"))) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain new file mode 100644 index 0000000000000..1e3fe1a987ef5 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, spark_grouping_id#0L], [a#0, max(a#0) AS max(a)#0, count(a#0) AS count(a)#0L] ++- Expand [[id#0L, a#0, b#0, a#0, 0], [id#0L, a#0, b#0, null, 1]], [id#0L, a#0, b#0, a#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json new file mode 100644 index 0000000000000..6e84824ec7a3a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json @@ -0,0 +1,50 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPING_SETS", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }], + "groupingSets": [{ + "groupingSet": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + }, { + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..ce0294096706ecc7e0528e34bde6f5e438ef0d37 GIT binary patch literal 106 zcmd;L5@7U7;nLt@5@3i@5>hBGDJo4avB^xaO3F;n%q!7Jv;vVyRw? Date: Sat, 25 Nov 2023 14:38:34 -0600 Subject: [PATCH 07/40] [SPARK-46100][CORE][PYTHON] Reduce stack depth by replace (string|array).size with (string|array).length ### What changes were proposed in this pull request? There are a lot of `[string|array].size` called. In fact, the size calls the underlying length, this behavior increase the stack length. We should call `[string|array].length` directly. We also get the compile waring `Replace .size with .length on arrays and strings` This PR just improve the core module. ### Why are the changes needed? Reduce stack depth by replace (string|array).size with (string|array).length ### Does this PR introduce _any_ user-facing change? 'No'. ### How was this patch tested? Exists test cases. ### Was this patch authored or co-authored using generative AI tooling? 'No'. Closes #44011 from beliefer/SPARK-46100. 
Authored-by: Jiaan Geng Signed-off-by: Sean Owen --- .../spark/api/python/PythonRunner.scala | 2 +- .../spark/deploy/master/ui/MasterPage.scala | 4 +- .../spark/executor/ExecutorMetrics.scala | 2 +- .../apache/spark/resource/ResourceUtils.scala | 2 +- .../spark/scheduler/TaskDescription.scala | 2 +- .../spark/scheduler/TaskSchedulerImpl.scala | 4 +- .../apache/spark/ui/ConsoleProgressBar.scala | 2 +- .../org/apache/spark/util/HadoopFSUtils.scala | 2 +- .../util/io/ChunkedByteBufferFileRegion.scala | 2 +- .../org/apache/spark/CheckpointSuite.scala | 16 ++-- .../org/apache/spark/DistributedSuite.scala | 16 ++-- .../scala/org/apache/spark/FileSuite.scala | 2 +- .../apache/spark/MapOutputTrackerSuite.scala | 4 +- .../org/apache/spark/PartitioningSuite.scala | 4 +- .../scala/org/apache/spark/ShuffleSuite.scala | 2 +- .../deploy/DecommissionWorkerSuite.scala | 2 +- .../spark/deploy/SparkSubmitSuite.scala | 4 +- .../StandaloneDynamicAllocationSuite.scala | 22 ++--- .../spark/deploy/client/AppClientSuite.scala | 6 +- .../history/FsHistoryProviderSuite.scala | 20 ++--- .../rest/StandaloneRestSubmitSuite.scala | 2 +- .../WholeTextFileRecordReaderSuite.scala | 4 +- .../plugin/PluginContainerSuite.scala | 4 +- .../spark/rdd/AsyncRDDActionsSuite.scala | 2 +- .../spark/rdd/LocalCheckpointSuite.scala | 2 +- .../spark/rdd/PairRDDFunctionsSuite.scala | 44 +++++----- .../org/apache/spark/rdd/PipedRDDSuite.scala | 10 +-- .../scala/org/apache/spark/rdd/RDDSuite.scala | 80 +++++++++---------- .../org/apache/spark/rdd/SortingSuite.scala | 6 +- .../spark/rdd/ZippedPartitionsSuite.scala | 4 +- .../spark/resource/ResourceProfileSuite.scala | 2 +- .../spark/resource/ResourceUtilsSuite.scala | 6 +- .../spark/scheduler/AQEShuffledRDD.scala | 2 +- .../CoarseGrainedSchedulerBackendSuite.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 32 ++++---- .../spark/scheduler/MapStatusSuite.scala | 2 +- .../OutputCommitCoordinatorSuite.scala | 8 +- .../scheduler/TaskSchedulerImplSuite.scala | 12 +-- .../spark/scheduler/TaskSetManagerSuite.scala | 4 +- .../KryoSerializerDistributedSuite.scala | 2 +- .../sort/IndexShuffleBlockResolverSuite.scala | 2 +- .../apache/spark/storage/DiskStoreSuite.scala | 2 +- .../apache/spark/util/FileAppenderSuite.scala | 4 +- .../util/collection/SizeTrackerSuite.scala | 2 +- 44 files changed, 180 insertions(+), 180 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index d6363182606d9..e6d5a750ea325 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -378,7 +378,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( resources.foreach { case (k, v) => PythonRDD.writeUTF(k, dataOut) PythonRDD.writeUTF(v.name, dataOut) - dataOut.writeInt(v.addresses.size) + dataOut.writeInt(v.addresses.length) v.addresses.foreach { case addr => PythonRDD.writeUTF(addr, dataOut) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index cb325b37958ec..b2f35984d37f8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -83,13 +83,13 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { .flatMap(_.iterator) .groupBy(_._1) // group by resource name .map { case 
(rName, rInfoArr) => - rName -> rInfoArr.map(_._2.addresses.size).sum + rName -> rInfoArr.map(_._2.addresses.length).sum } val usedInfo = aliveWorkers.map(_.resourcesInfoUsed) .flatMap(_.iterator) .groupBy(_._1) // group by resource name .map { case (rName, rInfoArr) => - rName -> rInfoArr.map(_._2.addresses.size).sum + rName -> rInfoArr.map(_._2.addresses.length).sum } formatResourcesUsed(totalInfo, usedInfo) } diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala index 486e59652218b..8c474e9b76c6a 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala @@ -46,7 +46,7 @@ class ExecutorMetrics private[spark] extends Serializable { private[spark] def this(metrics: Array[Long]) = { this() - Array.copy(metrics, 0, this.metrics, 0, Math.min(metrics.size, this.metrics.size)) + Array.copy(metrics, 0, this.metrics, 0, Math.min(metrics.length, this.metrics.length)) } private[spark] def this(metrics: AtomicLongArray) = { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 9080be01a9e66..00c655f4a4f4d 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -303,7 +303,7 @@ private[spark] object ResourceUtils extends Logging { allocations: Map[String, ResourceInformation], execReqs: Map[String, ExecutorResourceRequest]): Unit = { execReqs.foreach { case (rName, req) => - require(allocations.contains(rName) && allocations(rName).addresses.size >= req.amount, + require(allocations.contains(rName) && allocations(rName).addresses.length >= req.amount, s"Resource: ${rName}, with addresses: " + s"${allocations(rName).addresses.mkString(",")} " + s"is less than what the user requested: ${req.amount})") diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 6e6507782a49e..75032086ead72 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -80,7 +80,7 @@ private[spark] object TaskDescription { map.foreach { case (key, value) => dataOut.writeUTF(key) dataOut.writeUTF(value.name) - dataOut.writeInt(value.addresses.size) + dataOut.writeInt(value.addresses.length) value.addresses.foreach(dataOut.writeUTF(_)) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 41f6b3ad64bf5..15ae2fef221d1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -434,7 +434,7 @@ private[spark] class TaskSchedulerImpl( // addresses are the same as that we allocated in taskResourceAssignments since it's // synchronized. We don't remove the exact addresses allocated because the current // approach produces the identical result with less time complexity. 
- availableResources(i)(rName).remove(0, rInfo.addresses.size) + availableResources(i)(rName).remove(0, rInfo.addresses.length) } } } catch { @@ -752,7 +752,7 @@ private[spark] class TaskSchedulerImpl( .mkString(",") addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr)) - logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for " + + logInfo(s"Successfully scheduled all the ${addressesWithDescs.length} tasks for " + s"barrier stage ${taskSet.stageId}.") } taskSet.barrierPendingLaunchTasks.clear() diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index dff94b4e875de..b5473e076946b 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -74,7 +74,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { * the progress bar, then progress bar will be showed in next line without overwrite logs. */ private def show(now: Long, stages: Seq[StageData]): Unit = { - val width = TerminalWidth / stages.size + val width = TerminalWidth / stages.length val bar = stages.map { s => val total = s.numTasks val header = s"[Stage ${s.stageId}:" diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala index 3245a528b74cf..4c7b12f60cc8d 100644 --- a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala @@ -245,7 +245,7 @@ private[spark] object HadoopFSUtils extends Logging { val allLeafStatuses = { val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory) val filteredNestedFiles: Seq[FileStatus] = contextOpt match { - case Some(context) if dirs.size > parallelismThreshold => + case Some(context) if dirs.length > parallelismThreshold => parallelListLeafFilesInternal( context, dirs.map(_.getPath).toImmutableArraySeq, diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala index 23fc0f88f0b93..ec74ce0473efd 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala @@ -69,7 +69,7 @@ private[io] class ChunkedByteBufferFileRegion( if (keepGoing) { // advance to the next chunk (if there are any more) currentChunkIdx += 1 - if (currentChunkIdx == chunks.size) { + if (currentChunkIdx == chunks.length) { keepGoing = false } else { currentChunk = chunks(currentChunkIdx) diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index c425596eb0433..874f4896bb01e 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -170,10 +170,10 @@ trait RDDCheckpointTester { self: SparkFunSuite => * upon checkpointing. Ignores the checkpointData field, which may grow when we checkpoint. 
*/ private def getSerializedSizes(rdd: RDD[_]): (Int, Int) = { - val rddSize = Utils.serialize(rdd).size - val rddCpDataSize = Utils.serialize(rdd.checkpointData).size - val rddPartitionSize = Utils.serialize(rdd.partitions).size - val rddDependenciesSize = Utils.serialize(rdd.dependencies).size + val rddSize = Utils.serialize(rdd).length + val rddCpDataSize = Utils.serialize(rdd.checkpointData).length + val rddPartitionSize = Utils.serialize(rdd.partitions).length + val rddDependenciesSize = Utils.serialize(rdd.dependencies).length // Print detailed size, helps in debugging logInfo("Serialized sizes of " + rdd + @@ -339,7 +339,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS runTest("ParallelCollectionRDD") { reliableCheckpoint: Boolean => val parCollection = sc.makeRDD(1 to 4, 2) - val numPartitions = parCollection.partitions.size + val numPartitions = parCollection.partitions.length checkpoint(parCollection, reliableCheckpoint) assert(parCollection.dependencies === Nil) val result = parCollection.collect() @@ -358,7 +358,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS val blockManager = SparkEnv.get.blockManager blockManager.putSingle(blockId, "test", StorageLevel.MEMORY_ONLY) val blockRDD = new BlockRDD[String](sc, Array(blockId)) - val numPartitions = blockRDD.partitions.size + val numPartitions = blockRDD.partitions.length checkpoint(blockRDD, reliableCheckpoint) val result = blockRDD.collect() if (reliableCheckpoint) { @@ -507,7 +507,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS runTest("CheckpointRDD with zero partitions") { reliableCheckpoint: Boolean => val rdd = new BlockRDD[Int](sc, Array.empty[BlockId]) - assert(rdd.partitions.size === 0) + assert(rdd.partitions.length === 0) assert(rdd.isCheckpointed === false) assert(rdd.isCheckpointedAndMaterialized === false) checkpoint(rdd, reliableCheckpoint) @@ -516,7 +516,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS assert(rdd.count() === 0) assert(rdd.isCheckpointed) assert(rdd.isCheckpointedAndMaterialized) - assert(rdd.partitions.size === 0) + assert(rdd.partitions.length === 0) } runTest("checkpointAllMarkedAncestors") { reliableCheckpoint: Boolean => diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index e156533be15ca..a2b09f0ef3c3a 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -80,7 +80,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex sc = new SparkContext(clusterUrl, "test") val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 5) val groups = pairs.groupByKey(5).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -264,8 +264,8 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex sc = new SparkContext(clusterUrl, "test") val data = sc.parallelize(Seq(true, true), 2) assert(data.count() === 2) // force executors to start - assert(data.map(markNodeIfIdentity).collect().size === 2) - assert(data.map(failOnMarkedIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) + assert(data.map(failOnMarkedIdentity).collect().length === 
2) } test("recover from repeated node failures during shuffle-map") { @@ -275,7 +275,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex for (i <- 1 to 3) { val data = sc.parallelize(Seq(true, false), 2) assert(data.count() === 2) - assert(data.map(markNodeIfIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) assert(data.map(failOnMarkedIdentity).map(x => x -> x).groupByKey().count() === 2) } } @@ -287,7 +287,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex for (i <- 1 to 3) { val data = sc.parallelize(Seq(true, true), 2) assert(data.count() === 2) - assert(data.map(markNodeIfIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) // This relies on mergeCombiners being used to perform the actual reduce for this // test to actually be testing what it claims. val grouped = data.map(x => x -> x).combineByKey( @@ -295,7 +295,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex (x: Boolean, y: Boolean) => x, (x: Boolean, y: Boolean) => failOnMarkedIdentity(x) ) - assert(grouped.collect().size === 1) + assert(grouped.collect().length === 1) } } @@ -310,8 +310,8 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex data.persist(StorageLevel.MEMORY_ONLY_2) assert(data.count() === 4) - assert(data.map(markNodeIfIdentity).collect().size === 4) - assert(data.map(failOnMarkedIdentity).collect().size === 4) + assert(data.map(markNodeIfIdentity).collect().length === 4) + assert(data.map(failOnMarkedIdentity).collect().length === 4) // Create a new replicated RDD to make sure that cached peer information doesn't cause // problems. diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 4a2b2339159cb..7750db6020887 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -236,7 +236,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { // Try reading the output back as an object file val ct = reflect.ClassTag[Any](Utils.classForName(className, noSparkClassLoader = true)) val output = sc.objectFile[Any](outputDir) - assert(output.collect().size === 3) + assert(output.collect().length === 3) assert(output.collect().head.getClass.getName === className) } } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index dde30aee82878..5d635011d2ec6 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -237,13 +237,13 @@ class MapOutputTrackerSuite extends SparkFunSuite with LocalSparkContext { // as it has 4 out of 7 bytes of output. val topLocs50 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.5) assert(topLocs50.nonEmpty) - assert(topLocs50.get.size === 1) + assert(topLocs50.get.length === 1) assert(topLocs50.get.head === BlockManagerId("a", "hostA", 1000)) // When the threshold is 20%, both hosts should be returned as preferred locations. 
val topLocs20 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.2) assert(topLocs20.nonEmpty) - assert(topLocs20.get.size === 2) + assert(topLocs20.get.length === 2) assert(topLocs20.get.toSet === Seq(BlockManagerId("a", "hostA", 1000), BlockManagerId("b", "hostB", 1000)).toSet) diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index 28fa9f5e23e79..3447ba8c1765e 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -77,7 +77,7 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva for (element <- 1 to 1000) { val partition = partitioner.getPartition(element) if (numPartitions > 1) { - if (partition < rangeBounds.size) { + if (partition < rangeBounds.length) { assert(element <= rangeBounds(partition)) } if (partition > 0) { @@ -111,7 +111,7 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva assert(count === rdd.count()) sketched.foreach { case (idx, n, sample) => assert(n === idx) - assert(sample.size === math.min(n, sampleSizePerPartition)) + assert(sample.length === math.min(n, sampleSizePerPartition)) } } diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index a92d532907adf..ac10a00d98e04 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -51,7 +51,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalRootDi sc = new SparkContext("local", "test", myConf) val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 4) val groups = pairs.groupByKey(4).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 diff --git a/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala index 3b3bcff0c5a3f..20993df718a3b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala @@ -439,7 +439,7 @@ class DecommissionWorkerSuite val appId = sc.applicationId eventually(timeout(1.minute), interval(1.seconds)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.getExecutorLimit === Int.MaxValue) } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index a032e9aa16be9..553d001285b2d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -1736,7 +1736,7 @@ object SimpleApplicationTest { .map(x => SparkEnv.get.conf.get(config)) .collect() .distinct - if (executorValues.size != 1) { + if (executorValues.length != 1) { throw new SparkException(s"Inconsistent values for $config: " + s"${executorValues.mkString("values(", ", ", ")")}") } @@ -1795,7 +1795,7 @@ class TestFileSystem extends org.apache.hadoop.fs.LocalFileSystem { class TestSparkApplication extends SparkApplication with Matchers { override def start(args: Array[String], conf: SparkConf): Unit = { - 
assert(args.size === 1) + assert(args.length === 1) assert(args(0) === "hello") assert(conf.get("spark.test.hello") === "world") assert(sys.props.get("spark.test.hello") === None) diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 01995ca3632d2..5ecc551c16b8c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -69,7 +69,7 @@ class StandaloneDynamicAllocationSuite workers = makeWorkers(10, 2048) // Wait until all workers register with master successfully eventually(timeout(1.minute), interval(10.milliseconds)) { - assert(getMasterState.workers.size === numWorkers) + assert(getMasterState.workers.length === numWorkers) } } @@ -93,7 +93,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -140,7 +140,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) @@ -195,7 +195,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) @@ -248,7 +248,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 10) // 20 cores total assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -302,7 +302,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 4) // 8 cores total assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -360,7 +360,7 @@ class StandaloneDynamicAllocationSuite sc.requestExecutors(2) eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === 2) @@ -385,7 +385,7 @@ class StandaloneDynamicAllocationSuite sc.requestExecutors(2) eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === 2) @@ -425,7 +425,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - 
assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -465,7 +465,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === initialExecutorLimit) assert(apps.head.getExecutorLimit === initialExecutorLimit) @@ -477,7 +477,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) diff --git a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala index d109ed8442d44..3555faf5c2cb9 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala @@ -71,7 +71,7 @@ class AppClientSuite workers = makeWorkers(10, 2048) // Wait until all workers register with master successfully eventually(timeout(1.minute), interval(10.milliseconds)) { - assert(getMasterState.workers.size === numWorkers) + assert(getMasterState.workers.length === numWorkers) } } @@ -99,7 +99,7 @@ class AppClientSuite eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() assert(ci.listener.connectedIdList.size === 1, "client listener should have one connection") - assert(apps.size === 1, "master should have 1 registered app") + assert(apps.length === 1, "master should have 1 registered app") } // Send message to Master to request Executors, verify request by change in executor limit @@ -176,7 +176,7 @@ class AppClientSuite eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() assert(ci.listener.connectedIdList.size === 1, "client listener should have one connection") - assert(apps.size === 1, "master should have 1 registered app") + assert(apps.length === 1, "master should have 1 registered app") } // Send message to Master to request Executors with multiple resource profiles. diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index d16e904bdcf13..3013a5bf4a294 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -1113,13 +1113,13 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === logCount) + assert(new File(testDir.toURI).listFiles().length === logCount) // Move the clock forward 1 day and scan the files again. They should still be there. clock.advance(TimeUnit.DAYS.toMillis(1)) provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === logCount) + assert(new File(testDir.toURI).listFiles().length === logCount) // Update the slow app to contain valid info. Code should detect the change and not clean // it up. 
@@ -1133,7 +1133,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P clock.advance(TimeUnit.DAYS.toMillis(2)) provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === validLogCount) + assert(new File(testDir.toURI).listFiles().length === validLogCount) } test("always find end event for finished apps") { @@ -1414,12 +1414,12 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() // The invalid application log file would be cleaned by checkAndCleanLog(). - assert(new File(testDir.toURI).listFiles().size === 1) + assert(new File(testDir.toURI).listFiles().length === 1) clock.advance(1) // cleanLogs() would clean the valid application log file. provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === 0) + assert(new File(testDir.toURI).listFiles().length === 0) } private def assertOptionAfterSerde(opt: Option[Long], expected: Option[Long]): Unit = { @@ -1556,7 +1556,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) provider.checkForLogs() provider.cleanLogs() - assert(dir.listFiles().size === 1) + assert(dir.listFiles().length === 1) assert(provider.getListing().length === 1) // Manually delete the appstatus file to make an invalid rolling event log @@ -1578,7 +1578,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() provider.cleanLogs() assert(provider.getListing().length === 1) - assert(dir.listFiles().size === 2) + assert(dir.listFiles().length === 2) // Make sure a new provider sees the valid application provider.stop() @@ -1615,7 +1615,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P // The 1st checkForLogs should scan/update app2 only since it is newer than app1 provider.checkForLogs() assert(provider.getListing().length === 1) - assert(dir.listFiles().size === 2) + assert(dir.listFiles().length === 2) assert(provider.getListing().map(e => e.id).contains("app2")) assert(!provider.getListing().map(e => e.id).contains("app1")) @@ -1630,7 +1630,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P // The 2nd checkForLogs should scan/update app3 only since it is newer than app1 provider.checkForLogs() assert(provider.getListing().length === 2) - assert(dir.listFiles().size === 3) + assert(dir.listFiles().length === 3) assert(provider.getListing().map(e => e.id).contains("app3")) assert(!provider.getListing().map(e => e.id).contains("app1")) @@ -1655,7 +1655,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) provider.checkForLogs() provider.cleanLogs() - assert(dir.listFiles().size === 1) + assert(dir.listFiles().length === 1) assert(provider.getListing().length === 1) // Manually delete event log files and create event log file reader diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 2f645e69079a2..abe05a8055843 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -289,7 +289,7 @@ class StandaloneRestSubmitSuite extends SparkFunSuite { val statusRequestPath = 
s"$httpUrl/$v/submissions/status" val goodJson = constructSubmitRequest(masterUrl).toJson val badJson1 = goodJson.replaceAll("action", "fraction") // invalid JSON - val badJson2 = goodJson.substring(goodJson.size / 2) // malformed JSON + val badJson2 = goodJson.substring(goodJson.length / 2) // malformed JSON val notJson = "\"hello, world\"" val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST") // missing JSON val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson1) diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index e64ebe2a55142..0fc0b7536067e 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -97,7 +97,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite { val res = sc.wholeTextFiles(dir.toString, 3).collect() - assert(res.size === WholeTextFileRecordReaderSuite.fileNames.size, + assert(res.length === WholeTextFileRecordReaderSuite.fileNames.length, "Number of files read out does not fit with the actual value.") for ((filename, contents) <- res) { @@ -120,7 +120,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite { val res = sc.wholeTextFiles(dir.toString, 3).collect() - assert(res.size === WholeTextFileRecordReaderSuite.fileNames.size, + assert(res.length === WholeTextFileRecordReaderSuite.fileNames.length, "Number of files read out does not fit with the actual value.") for ((filename, contents) <- res) { diff --git a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala index ef214bd50d928..95b484d7176a5 100644 --- a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala @@ -214,11 +214,11 @@ class PluginContainerSuite extends SparkFunSuite with LocalSparkContext { } val execFiles = children.filter(_.getName.startsWith(NonLocalModeSparkPlugin.executorFileStr)) - assert(execFiles.size === 1) + assert(execFiles.length === 1) val allLines = Files.readLines(execFiles(0), StandardCharsets.UTF_8) assert(allLines.size === 1) val addrs = NonLocalModeSparkPlugin.extractGpuAddrs(allLines.get(0)) - assert(addrs.size === 2) + assert(addrs.length === 2) assert(addrs.sorted === Array("3", "4")) assert(NonLocalModeSparkPlugin.driverContext != null) diff --git a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala index 56783de1c13b4..4239180ba6c37 100644 --- a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala @@ -91,7 +91,7 @@ class AsyncRDDActionsSuite extends SparkFunSuite with TimeLimits { val expected = input.take(num) val saw = rdd.takeAsync(num).get() assert(saw == expected, "incorrect result for rdd with %d partitions (expected %s, saw %s)" - .format(rdd.partitions.size, expected, saw)) + .format(rdd.partitions.length, expected, saw)) } val input = Range(1, 1000) diff --git a/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala b/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala index f644fee74a18b..591b8b4c0df7e 100644 --- 
a/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala @@ -159,7 +159,7 @@ class LocalCheckpointSuite extends SparkFunSuite with LocalSparkContext { test("missing checkpoint block fails with informative message") { val rdd = newRdd.localCheckpoint() - val numPartitions = rdd.partitions.size + val numPartitions = rdd.partitions.length val partitionIndices = rdd.partitions.map(_.index) val bmm = sc.env.blockManager.master diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 9b60d2eeeed1b..e436d98843411 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -41,7 +41,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val pairs = sc.parallelize(Seq((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) val sets = pairs.aggregateByKey(new HashSet[Int]())(_ += _, _ ++= _).collect() - assert(sets.size === 3) + assert(sets.length === 3) val valuesFor1 = sets.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1)) val valuesFor3 = sets.find(_._1 == 3).get._2 @@ -53,7 +53,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -63,7 +63,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with duplicates") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -73,7 +73,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with negative key hash codes") { val pairs = sc.parallelize(Seq((-1, 1), (-1, 2), (-1, 3), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesForMinus1 = groups.find(_._1 == -1).get._2 assert(valuesForMinus1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -83,7 +83,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with many output partitions") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey(10).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -249,7 +249,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 4) + assert(joined.length === 4) assert(joined.toSet === Set( (1, (1, 'x')), (1, (2, 'x')), @@ -262,7 +262,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with 
SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (1, 3))) val rdd2 = sc.parallelize(Seq((1, 'x'), (1, 'y'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 6) + assert(joined.length === 6) assert(joined.toSet === Set( (1, (1, 'x')), (1, (1, 'y')), @@ -277,7 +277,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.leftOuterJoin(rdd2).collect() - assert(joined.size === 5) + assert(joined.length === 5) assert(joined.toSet === Set( (1, (1, Some('x'))), (1, (2, Some('x'))), @@ -296,7 +296,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd2 = sc.emptyRDD[(Int, Int)](intPairCT) val joined = rdd1.cogroup(rdd2).collect() - assert(joined.size > 0) + assert(joined.length > 0) } // See SPARK-9326 @@ -307,7 +307,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.emptyRDD[Int](intCT).groupBy((x) => 5) val joined = rdd1.cogroup(rdd2).collect() - assert(joined.size > 0) + assert(joined.length > 0) } // See SPARK-22465 @@ -377,7 +377,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.rightOuterJoin(rdd2).collect() - assert(joined.size === 5) + assert(joined.length === 5) assert(joined.toSet === Set( (1, (Some(1), 'x')), (1, (Some(2), 'x')), @@ -391,7 +391,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.fullOuterJoin(rdd2).collect() - assert(joined.size === 6) + assert(joined.length === 6) assert(joined.toSet === Set( (1, (Some(1), Some('x'))), (1, (Some(2), Some('x'))), @@ -406,14 +406,14 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 0) + assert(joined.length === 0) } test("join with many output partitions") { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2, 10).collect() - assert(joined.size === 4) + assert(joined.length === 4) assert(joined.toSet === Set( (1, (1, 'x')), (1, (2, 'x')), @@ -426,7 +426,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.groupWith(rdd2).collect() - assert(joined.size === 4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList))).toSet assert(joinedSet === Set( (1, (List(1, 2), List('x'))), @@ -441,7 +441,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) val joined = rdd1.groupWith(rdd2, rdd3).collect() - assert(joined.size === 
4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList, x._2._3.toList))).toSet assert(joinedSet === Set( @@ -458,7 +458,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) val rdd4 = sc.parallelize(Seq((2, '@'))) val joined = rdd1.groupWith(rdd2, rdd3, rdd4).collect() - assert(joined.size === 4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList, x._2._3.toList, x._2._4.toList))).toSet assert(joinedSet === Set( @@ -492,14 +492,14 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = a.map(a => (a, (a * 2).toString)) // then a group by, and see we didn't revert to 2 partitions val c = b.groupByKey() - assert(c.partitions.size === 2000) + assert(c.partitions.length === 2000) } test("default partitioner uses largest partitioner") { val a = sc.makeRDD(Seq((1, "a"), (2, "b")), 2) val b = sc.makeRDD(Seq((1, "a"), (2, "b")), 2000) val c = a.join(b) - assert(c.partitions.size === 2000) + assert(c.partitions.length === 2000) } test("subtract") { @@ -507,7 +507,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = sc.parallelize(Array(2, 3, 4).toImmutableArraySeq, 4) val c = a.subtract(b) assert(c.collect().toSet === Set(1)) - assert(c.partitions.size === a.partitions.size) + assert(c.partitions.length === a.partitions.length) } test("subtract with narrow dependency") { @@ -531,7 +531,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = sc.parallelize(Seq((2, 20), (3, 30), (4, 40)), 4) val c = a.subtractByKey(b) assert(c.collect().toSet === Set((1, "a"), (1, "a"))) - assert(c.partitions.size === a.partitions.size) + assert(c.partitions.length === a.partitions.length) } test("subtractByKey with narrow dependency") { @@ -795,7 +795,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assertBinomialSample(exact = exact, actual = v.toInt, trials = trials(k).toInt, p = samplingRate) } - assert(takeSample.size === takeSample.toSet.size) + assert(takeSample.length === takeSample.toSet.size) takeSample.foreach { x => assert(1 <= x._2 && x._2 <= n, s"elements not in [1, $n]") } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index 3a097e5335a2a..7f12d8b624c84 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -47,7 +47,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val piped = nums.pipe(Seq("cat")) val c = piped.collect() - assert(c.size === 4) + assert(c.length === 4) assert(c(0) === "1") assert(c(1) === "2") assert(c(2) === "3") @@ -61,7 +61,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall // verify that both RDD.pipe(command: String) and RDD.pipe(command: String, env) work good for (piped <- Seq(nums.pipe("wc -l"), nums.pipe("wc -l", Map[String, String]()))) { val c = piped.collect() - assert(c.size === 2) + assert(c.length === 2) assert(c(0).trim === "2") assert(c(1).trim === "2") } @@ -129,7 +129,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val c = piped.collect() - assert(c.size === 8) + assert(c.length === 8) assert(c(0) === "0") assert(c(1) === "\u0001") assert(c(2) === "1_") @@ 
-151,7 +151,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall f(e + "_") } }).collect() - assert(d.size === 8) + assert(d.length === 8) assert(d(0) === "0") assert(d(1) === "\u0001") assert(d(2) === "b\t2_") @@ -216,7 +216,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val nums = sc.makeRDD(Array(1, 2, 3, 4).toImmutableArraySeq, 2) val piped = nums.pipe(Seq("cat"), separateWorkingDir = true) val c = piped.collect() - assert(c.size === 4) + assert(c.length === 4) assert(c(0) === "1") assert(c(1) === "2") assert(c(2) === "3") diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 32ba2053258eb..706ebfa936470 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -322,7 +322,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { test("empty RDD") { val empty = new EmptyRDD[Int](sc) assert(empty.count() === 0) - assert(empty.collect().size === 0) + assert(empty.collect().length === 0) val thrown = intercept[UnsupportedOperationException]{ empty.reduce(_ + _) @@ -331,12 +331,12 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val emptyKv = new EmptyRDD[(Int, Int)](sc) val rdd = sc.parallelize(1 to 2, 2).map(x => (x, x)) - assert(rdd.join(emptyKv).collect().size === 0) - assert(rdd.rightOuterJoin(emptyKv).collect().size === 0) - assert(rdd.leftOuterJoin(emptyKv).collect().size === 2) - assert(rdd.fullOuterJoin(emptyKv).collect().size === 2) - assert(rdd.cogroup(emptyKv).collect().size === 2) - assert(rdd.union(emptyKv).collect().size === 2) + assert(rdd.join(emptyKv).collect().length === 0) + assert(rdd.rightOuterJoin(emptyKv).collect().length === 0) + assert(rdd.leftOuterJoin(emptyKv).collect().length === 2) + assert(rdd.fullOuterJoin(emptyKv).collect().length === 2) + assert(rdd.cogroup(emptyKv).collect().length === 2) + assert(rdd.union(emptyKv).collect().length === 2) } test("repartitioned RDDs") { @@ -348,7 +348,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // Coalesce partitions val repartitioned1 = data.repartition(2) - assert(repartitioned1.partitions.size == 2) + assert(repartitioned1.partitions.length == 2) val partitions1 = repartitioned1.glom().collect() assert(partitions1(0).length > 0) assert(partitions1(1).length > 0) @@ -356,7 +356,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // Split partitions val repartitioned2 = data.repartition(20) - assert(repartitioned2.partitions.size == 20) + assert(repartitioned2.partitions.length == 20) val partitions2 = repartitioned2.glom().collect() assert(partitions2(0).length > 0) assert(partitions2(19).length > 0) @@ -370,7 +370,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(input.toImmutableArraySeq, initialPartitions) val repartitioned1 = data.repartition(2) - assert(repartitioned1.partitions.size == 2) + assert(repartitioned1.partitions.length == 2) val partitions1 = repartitioned1.glom().collect() // some noise in balancing is allowed due to randomization assert(math.abs(partitions1(0).length - 500) < initialPartitions) @@ -380,7 +380,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { def testSplitPartitions(input: Seq[Int], initialPartitions: Int, finalPartitions: Int): Unit = { val 
data = sc.parallelize(input, initialPartitions) val repartitioned = data.repartition(finalPartitions) - assert(repartitioned.partitions.size === finalPartitions) + assert(repartitioned.partitions.length === finalPartitions) val partitions = repartitioned.glom().collect() // assert all elements are present assert(repartitioned.collect().sortWith(_ > _).toSeq === input.toSeq.sortWith(_ > _).toSeq) @@ -441,7 +441,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // when shuffling, we can increase the number of partitions val coalesced6 = data.coalesce(20, shuffle = true) - assert(coalesced6.partitions.size === 20) + assert(coalesced6.partitions.length === 20) assert(coalesced6.collect().toSet === (1 to 10).toSet) } @@ -564,13 +564,13 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val coalesced2 = data2.coalesce(partitions) // test that we have 10000 partitions - assert(coalesced2.partitions.size == 10000, "Expected 10000 partitions, but got " + - coalesced2.partitions.size) + assert(coalesced2.partitions.length == 10000, "Expected 10000 partitions, but got " + + coalesced2.partitions.length) // test that we have 100 partitions val coalesced3 = data2.coalesce(numMachines * 2) - assert(coalesced3.partitions.size == 100, "Expected 100 partitions, but got " + - coalesced3.partitions.size) + assert(coalesced3.partitions.length == 100, "Expected 100 partitions, but got " + + coalesced3.partitions.length) // test that the groups are load balanced with 100 +/- 20 elements in each val maxImbalance3 = coalesced3.partitions @@ -613,9 +613,9 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(1 to 10, 10) // Note that split number starts from 0, so > 8 means only 10th partition left. 
val prunedRdd = new PartitionPruningRDD(data, splitNum => splitNum > 8) - assert(prunedRdd.partitions.size === 1) + assert(prunedRdd.partitions.length === 1) val prunedData = prunedRdd.collect() - assert(prunedData.size === 1) + assert(prunedData.length === 1) assert(prunedData(0) === 10) } @@ -626,7 +626,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { test("take") { var nums = sc.makeRDD(Range(1, 1000), 1) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -635,7 +635,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 2) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -644,7 +644,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 100) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -653,7 +653,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 1000) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -662,7 +662,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.parallelize(1 to 2, 2) - assert(nums.take(2147483638).size === 2) + assert(nums.take(2147483638).length === 2) assert(nums.takeAsync(2147483638).get().size === 2) } @@ -670,7 +670,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Seq.range(1, 100000) val ints = sc.makeRDD(scala.util.Random.shuffle(nums), 2) val topK = ints.top(5) - assert(topK.size === 5) + assert(topK.length === 5) assert(topK === nums.reverse.take(5)) } @@ -679,7 +679,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { implicit val ord = implicitly[Ordering[String]].reverse val rdd = sc.makeRDD(words, 2) val topK = rdd.top(2) - assert(topK.size === 2) + assert(topK.length === 2) assert(topK.sorted === Array("b", "a")) } @@ -687,7 +687,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedLowerK = rdd.takeOrdered(5) - assert(sortedLowerK.size === 5) + assert(sortedLowerK.length === 5) assert(sortedLowerK === Array(1, 2, 3, 4, 5)) } @@ -695,7 +695,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedLowerK = rdd.takeOrdered(0) - assert(sortedLowerK.size === 0) + assert(sortedLowerK.length === 0) } test("SPARK-40276: takeOrdered with empty RDDs") { @@ -708,7 +708,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { implicit val ord = implicitly[Ordering[Int]].reverse 
val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedTopK = rdd.takeOrdered(5) - assert(sortedTopK.size === 5) + assert(sortedTopK.length === 5) assert(sortedTopK === Array(10, 9, 8, 7, 6)) assert(sortedTopK === nums.sorted(ord).take(5)) } @@ -736,48 +736,48 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { for (num <- List(5, 20, 100)) { val sample = data.takeSample(withReplacement = false, num = num) - assert(sample.size === num) // Got exactly num elements + assert(sample.length === num) // Got exactly num elements assert(sample.toSet.size === num) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = false, 20, seed) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.toSet.size === 20) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = false, 100, seed) - assert(sample.size === 100) // Got only 100 elements + assert(sample.length === 100) // Got only 100 elements assert(sample.toSet.size === 100) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, 20, seed) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } { val sample = data.takeSample(withReplacement = true, num = 20) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } { val sample = data.takeSample(withReplacement = true, num = n) - assert(sample.size === n) // Got exactly n elements + assert(sample.length === n) // Got exactly n elements // Chance of getting all distinct elements is astronomically low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, n, seed) - assert(sample.size === n) // Got exactly n elements + assert(sample.length === n) // Got exactly n elements // Chance of getting all distinct elements is astronomically low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, 2 * n, seed) - assert(sample.size === 2 * n) // Got exactly 2 * n elements + assert(sample.length === 2 * n) // Got exactly 2 * n elements // Chance of getting all distinct elements is still quite low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") } @@ -794,7 +794,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(1 to n, 2) for(seed <- 1 to 5) { val splits = data.randomSplit(Array(1.0, 2.0, 3.0), seed) - assert(splits.size == 3, "wrong number of splits") + assert(splits.length == 3, "wrong number of splits") assert(splits.flatMap(_.collect()).sorted.toList == data.collect().toList, "incomplete or wrong split") val s = splits.map(_.count()) @@ -1179,7 
+1179,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { sc.hadoopFile(outDir, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]) val coalescedHadoopRDD = hadoopRDD.coalesce(2, partitionCoalescer = Option(new SizeBasedCoalescer(maxSplitSize))) - assert(coalescedHadoopRDD.partitions.size <= 10) + assert(coalescedHadoopRDD.partitions.length <= 10) var totalPartitionCount = 0L coalescedHadoopRDD.partitions.foreach(partition => { var splitSizeSum = 0L @@ -1256,7 +1256,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { .map(coalescedRDD.getPreferredLocations(_).head) .groupBy(identity) .view - .mapValues(_.size) + .mapValues(_.length) // Make sure the coalesced partitions are distributed fairly evenly between the two locations. // This should not become flaky since the DefaultPartitionsCoalescer uses a fixed seed. @@ -1357,7 +1357,7 @@ class SizeBasedCoalescer(val maxSize: Int) extends PartitionCoalescer with Seria totalSum += splitSize } - while (index < partitions.size) { + while (index < partitions.length) { val partition = partitions(index) val fileSplit = partition.asInstanceOf[HadoopPartition].inputSplit.value.asInstanceOf[FileSplit] diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala index 802889b047796..5771e99b64c69 100644 --- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala @@ -35,7 +35,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey() - assert(sorted.partitions.size === 2) + assert(sorted.partitions.length === 2) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -44,7 +44,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey(true, 1) - assert(sorted.partitions.size === 1) + assert(sorted.partitions.length === 1) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -53,7 +53,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey(true, 20) - assert(sorted.partitions.size === 20) + assert(sorted.partitions.length === 20) assert(sorted.collect() === pairArr.sortBy(_._1)) } diff --git a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala index 7079b9ea8eadc..c04719eb9ea6f 100644 --- a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.{SharedSparkContext, SparkFunSuite} object ZippedPartitionsSuite { def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = { - Iterator(i.toArray.size, s.toArray.size, d.toArray.size) + Iterator(i.toArray.length, s.toArray.length, d.toArray.length) } } @@ -35,7 +35,7 @@ class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext { val obtainedSizes = zippedRDD.collect() val 
expectedSizes = Array(2, 3, 1, 2, 3, 1) - assert(obtainedSizes.size == 6) + assert(obtainedSizes.length == 6) assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2)) } } diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala index fd7018f189e26..be38315cd75fe 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala @@ -374,7 +374,7 @@ class ResourceProfileSuite extends SparkFunSuite with MockitoSugar { rprof.require(eReq) // Update this if new resource type added - assert(ResourceProfile.allSupportedExecutorResources.size === 5, + assert(ResourceProfile.allSupportedExecutorResources.length === 5, "Executor resources should have 5 supported resources") assert(rprof.build().getCustomExecutorResources().size === 1, "Executor resources should have 1 custom resource") diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index 1ab9f7c5d2b0c..20d6cc7671582 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -101,13 +101,13 @@ class ResourceUtilsSuite extends SparkFunSuite val gpuValue = resources.get(GPU) assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") - assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") + assert(gpuValue.get.addresses.length == 2, "Should have 2 indexes") assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") val fpgaValue = resources.get(FPGA) assert(fpgaValue.nonEmpty, "Should have a gpu entry") assert(fpgaValue.get.name == "fpga", "name should be fpga") - assert(fpgaValue.get.addresses.size == 3, "Should have 3 indexes") + assert(fpgaValue.get.addresses.length == 3, "Should have 3 indexes") assert(fpgaValue.get.addresses.sameElements(Array("f1", "f2", "f3")), "should have f1,f2,f3 entries") } @@ -231,7 +231,7 @@ class ResourceUtilsSuite extends SparkFunSuite val gpuValue = resources.get(GPU) assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") - assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") + assert(gpuValue.get.addresses.length == 2, "Should have 2 indexes") assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala b/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala index 3f8eaede6e799..84f9ef0d557e6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala @@ -48,7 +48,7 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A result } - override def numPartitions: Int = partitionStartIndices.size + override def numPartitions: Int = partitionStartIndices.length override def getPartition(key: Any): Int = { parentPartitionMapping(parent.getPartition(key)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index bf5e9d96cd80e..e9b8ae4bffe6d 100644 --- 
a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -62,7 +62,7 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() - assert(smaller.size === 4) + assert(smaller.length === 4) } test("compute max number of concurrent tasks can be launched") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 0f7146bc7c150..c55f627075e8f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -462,9 +462,9 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti /** Send the given CompletionEvent messages for the tasks in the TaskSet. */ private def complete(taskSet: TaskSet, taskEndInfos: Seq[(TaskEndReason, Any)]): Unit = { - assert(taskSet.tasks.size >= taskEndInfos.size) + assert(taskSet.tasks.length >= taskEndInfos.size) for ((result, i) <- taskEndInfos.zipWithIndex) { - if (i < taskSet.tasks.size) { + if (i < taskSet.tasks.length) { runEvent(makeCompletionEvent(taskSet.tasks(i), result._1, result._2)) } } @@ -474,9 +474,9 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti accumId: Long, taskSet: TaskSet, results: Seq[(TaskEndReason, Any)]): Unit = { - assert(taskSet.tasks.size >= results.size) + assert(taskSet.tasks.length >= results.size) for ((result, i) <- results.zipWithIndex) { - if (i < taskSet.tasks.size) { + if (i < taskSet.tasks.length) { runEvent(makeCompletionEvent( taskSet.tasks(i), result._1, @@ -1671,21 +1671,21 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 0) // should work because it's a non-failed host (so the available map outputs will increase) runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostB", reduceRdd.partitions.size))) + makeMapStatus("hostB", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 1) // should be ignored for being too old runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 1) // should work because it's a new epoch, which will increase the number of available map @@ -1694,7 +1694,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti runEvent(makeCompletionEvent( taskSet.tasks(1), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 2) assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostA"))) @@ -2081,7 +2081,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // stage complete), but the tasks that ran on HostA need to be re-run, so the DAGScheduler // should re-submit the stage with 
one task (the task that originally ran on HostA). assert(taskSets.size === 2) - assert(taskSets(1).tasks.size === 1) + assert(taskSets(1).tasks.length === 1) // Make sure that the stage that was re-submitted was the ShuffleMapStage (not the reduce // stage, which shouldn't be run until all of the tasks in the ShuffleMapStage complete on @@ -2735,7 +2735,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // Now complete tasks in the second task set val newTaskSet = taskSets(1) // 2 tasks should have been re-submitted, for tasks 0 and 1 (which ran on hostA). - assert(newTaskSet.tasks.size === 2) + assert(newTaskSet.tasks.length === 2) // Complete task 0 from the original task set (i.e., not the one that's currently active). // This should still be counted towards the job being complete (but there's still one // outstanding task). @@ -2878,7 +2878,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // failed hostA, so both should be resubmitted. Complete them on hostB successfully. scheduler.resubmitFailedStages() assert(taskSets(2).stageId === 0 && taskSets(2).stageAttemptId === 1 - && taskSets(2).tasks.size === 2) + && taskSets(2).tasks.length === 2) complete(taskSets(2), Seq( (Success, makeMapStatus("hostB", 2)), (Success, makeMapStatus("hostB", 2)))) @@ -2898,7 +2898,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // Task(stageId=1, stageAttemptId=1, partitionId=1) of this new active stage attempt // is still running. assert(taskSets(3).stageId === 1 && taskSets(3).stageAttemptId === 1 - && taskSets(3).tasks.size === 2) + && taskSets(3).tasks.length === 2) runEvent(makeCompletionEvent( taskSets(3).tasks(0), Success, makeMapStatus("hostB", 2))) @@ -2907,7 +2907,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // was ignored due to executor failure assert(taskSets.size === 5) assert(taskSets(4).stageId === 1 && taskSets(4).stageAttemptId === 2 - && taskSets(4).tasks.size === 1) + && taskSets(4).tasks.length === 1) // Complete task(stageId=1, stageAttempt=2, partitionId=1) successfully. runEvent(makeCompletionEvent( @@ -4445,7 +4445,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // a scenario where stage 0 needs to be resubmitted upon finishing all tasks. // Merge finalization should be scheduled in this case. for ((result, i) <- taskResults.zipWithIndex) { - if (i == taskSets(0).tasks.size - 1) { + if (i == taskSets(0).tasks.length - 1) { mapOutputTracker.removeOutputsOnHost("host0") } runEvent(makeCompletionEvent(taskSets(0).tasks(i), result._1, result._2)) @@ -4522,7 +4522,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // a scenario where stage 0 needs to be resubmitted upon finishing all tasks. // Merge finalization should be scheduled in this case. for ((result, i) <- taskResults.zipWithIndex) { - if (i == taskSets(0).tasks.size - 1) { + if (i == taskSets(0).tasks.length - 1) { mapOutputTracker.removeOutputsOnHost("host0") } runEvent(makeCompletionEvent(taskSets(0).tasks(i), result._1, result._2)) @@ -4986,7 +4986,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti * Note that this checks only the host and not the executor ID. 
*/ private def assertLocations(taskSet: TaskSet, hosts: Seq[Seq[String]]): Unit = { - assert(hosts.size === taskSet.tasks.size) + assert(hosts.size === taskSet.tasks.length) for ((taskLocs, expectedLocs) <- taskSet.tasks.map(_.preferredLocations).zip(hosts)) { assert(taskLocs.map(_.host).toSet === expectedLocs.toSet) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index cf2240a0511d7..13e7ff758ebaf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -268,7 +268,7 @@ class MapStatusSuite extends SparkFunSuite { "number of skewed block sizes") val smallAndUntrackedBlocks = - nonEmptyBlocks.slice(0, nonEmptyBlocks.size - trackedSkewedBlocksLength) + nonEmptyBlocks.slice(0, nonEmptyBlocks.length - trackedSkewedBlocksLength) val avg = smallAndUntrackedBlocks.sum / smallAndUntrackedBlocks.length val loc = BlockManagerId("a", "b", 10) diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index 0533f9d7d8a49..f1a4b97c2981d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -143,14 +143,14 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { val rdd = sc.parallelize(Seq(1), 1) sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully _, rdd.partitions.indices) - assert(tempDir.list().size === 1) + assert(tempDir.list().length === 1) } ignore("If commit fails, if task is retried it should not be locked, and will succeed.") { val rdd = sc.parallelize(Seq(1), 1) sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).failFirstCommitAttempt _, rdd.partitions.indices) - assert(tempDir.list().size === 1) + assert(tempDir.list().length === 1) } test("Job should not complete if all commits are denied") { @@ -161,13 +161,13 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { def resultHandler(x: Int, y: Unit): Unit = {} val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully, - 0 until rdd.partitions.size, resultHandler, ()) + 0 until rdd.partitions.length, resultHandler, ()) // It's an error if the job completes successfully even though no committer was authorized, // so throw an exception if the job was allowed to complete. 
intercept[TimeoutException] { ThreadUtils.awaitResult(futureAction, 5.seconds) } - assert(tempDir.list().size === 0) + assert(tempDir.list().length === 0) } test("Only authorized committer failures can clear the authorized committer lock (SPARK-6614)") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index f0ae7fc74112b..2ab7df0d9cfd3 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -1815,10 +1815,10 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext var has1Gpu = 0 for (tDesc <- taskDescriptions) { assert(tDesc.resources.contains(GPU)) - if (tDesc.resources(GPU).addresses.size == 2) { + if (tDesc.resources(GPU).addresses.length == 2) { has2Gpus += 1 } - if (tDesc.resources(GPU).addresses.size == 1) { + if (tDesc.resources(GPU).addresses.length == 1) { has1Gpu += 1 } } @@ -1836,7 +1836,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext taskDescriptions = taskScheduler.resourceOffers(workerOffers3).flatten assert(2 === taskDescriptions.length) assert(taskDescriptions.head.resources.contains(GPU)) - assert(2 == taskDescriptions.head.resources(GPU).addresses.size) + assert(2 == taskDescriptions.head.resources(GPU).addresses.length) } test("Scheduler works with task resource profiles") { @@ -1875,10 +1875,10 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext var has1Gpu = 0 for (tDesc <- taskDescriptions) { assert(tDesc.resources.contains(GPU)) - if (tDesc.resources(GPU).addresses.size == 2) { + if (tDesc.resources(GPU).addresses.length == 2) { has2Gpus += 1 } - if (tDesc.resources(GPU).addresses.size == 1) { + if (tDesc.resources(GPU).addresses.length == 1) { has1Gpu += 1 } } @@ -1896,7 +1896,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext taskDescriptions = taskScheduler.resourceOffers(workerOffers3).flatten assert(2 === taskDescriptions.length) assert(taskDescriptions.head.resources.contains(GPU)) - assert(2 == taskDescriptions.head.resources(GPU).addresses.size) + assert(2 == taskDescriptions.head.resources(GPU).addresses.length) } test("Calculate available tasks slots for task resource profiles") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 2fe50a486dbd6..2f8b6df8beac5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -845,7 +845,7 @@ class TaskSetManagerSuite // multiple 1k result val r = sc.makeRDD(0 until 10, 10).map(genBytes(1024)).collect() - assert(10 === r.size) + assert(10 === r.length) // single 10M result val thrown = intercept[SparkException] {sc.makeRDD(genBytes(10 << 20)(0), 1).collect()} @@ -863,7 +863,7 @@ class TaskSetManagerSuite sc = new SparkContext("local", "test", conf) // final result is below limit. 
val r = sc.makeRDD(0 until 2000, 2000).distinct(10).filter(_ == 0).collect() - assert(1 === r.size) + assert(1 === r.length) } test("[SPARK-13931] taskSetManager should not send Resubmitted tasks after being a zombie") { diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala index 4acb4bbc779c3..25db9a5c68612 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala @@ -48,7 +48,7 @@ class KryoSerializerDistributedSuite extends SparkFunSuite with LocalSparkContex val shuffledRDD = cachedRDD.map { case (i, o) => (i * i * i - 10 * i * i, o)} // Join the two RDDs, and force evaluation - assert(shuffledRDD.join(cachedRDD).collect().size == 1) + assert(shuffledRDD.join(cachedRDD).collect().length == 1) } } diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala index 8a9537b4f18d7..a9ca9135f38a9 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala @@ -236,7 +236,7 @@ class IndexShuffleBlockResolverSuite extends SparkFunSuite { ShuffleMergedBlockId(shuffleId, shuffleMergeId, reduceId), dirs) assert(mergedBlockMeta.getNumChunks === 3) - assert(mergedBlockMeta.readChunkBitmaps().size === 3) + assert(mergedBlockMeta.readChunkBitmaps().length === 3) assert(mergedBlockMeta.readChunkBitmaps()(0).contains(1)) assert(mergedBlockMeta.readChunkBitmaps()(0).contains(2)) assert(!mergedBlockMeta.readChunkBitmaps()(0).contains(3)) diff --git a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala index be1b9be2d85d9..b644224652266 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala @@ -117,7 +117,7 @@ class DiskStoreSuite extends SparkFunSuite { val chunkedByteBuffer = blockData.toChunkedByteBuffer(ByteBuffer.allocate) val chunks = chunkedByteBuffer.chunks - assert(chunks.size === 2) + assert(chunks.length === 2) for (chunk <- chunks) { assert(chunk.limit() === 10 * 1024) } diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index c377f2495d05d..35ef0587b9b4c 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -192,9 +192,9 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter { // verify whether the earliest file has been deleted val rolledOverFiles = allGeneratedFiles.filter { _ != testFile.toString }.toArray.sorted - logInfo(s"All rolled over files generated:${rolledOverFiles.size}\n" + + logInfo(s"All rolled over files generated:${rolledOverFiles.length}\n" + rolledOverFiles.mkString("\n")) - assert(rolledOverFiles.size > 2) + assert(rolledOverFiles.length > 2) val earliestRolledOverFile = rolledOverFiles.head val existingRolledOverFiles = RollingFileAppender.getSortedRolledOverFiles( testFile.getParentFile.toString, testFile.getName).map(_.toString) diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala index 8aa4be6c2ff8d..82a4c85b02fa0 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala @@ -104,7 +104,7 @@ private object SizeTrackerSuite { * Run speed tests for size tracking collections. */ def main(args: Array[String]): Unit = { - if (args.size < 1) { + if (args.length < 1) { // scalastyle:off println println("Usage: SizeTrackerSuite [num elements]") // scalastyle:on println From d30c9a90c6cf9033c45f6f418864c8d7013911e5 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 26 Nov 2023 14:10:27 +0100 Subject: [PATCH 08/40] [SPARK-45826][SQL] Add a SQL config for stack traces in DataFrame query context ### What changes were proposed in this pull request? In the PR, I propose to add new SQL config `spark.sql.stackTracesInDataFrameContext` which defines how many non-Spark stack traces should be captured into DataFrame query context. By default, the config is set to 1. ### Why are the changes needed? To improve user experience with Spark SQL. When users troubleshoot an issue, they might need more stack traces in the DataFrame context. For example: ```scala scala> spark.conf.set("spark.sql.ansi.enabled", true) scala> spark.conf.set("spark.sql.stackTracesInDataFrameContext", 3) scala> spark.range(1).select(lit(1) / lit(0)).collect() org.apache.spark.SparkArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012 == DataFrame == "div" was called from (:1) (:16) .(:1) ``` ### Does this PR introduce _any_ user-facing change? No, it doesn't change the default behaviour. ### How was this patch tested? By running the modified test suite: ``` $ build/sbt "test:testOnly *QueryContextSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43695 from MaxGekk/df-context-slice-conf-2. Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 9 +++++++++ .../src/main/scala/org/apache/spark/sql/package.scala | 5 ++++- .../org/apache/spark/sql/errors/QueryContextSuite.scala | 7 +++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 6a8e1f92fc510..5133c40bc6faa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4577,6 +4577,13 @@ object SQLConf { .booleanConf .createWithDefault(false) + val STACK_TRACES_IN_DATAFRAME_CONTEXT = buildConf("spark.sql.stackTracesInDataFrameContext") + .doc("The number of non-Spark stack traces in the captured DataFrame query context.") + .version("4.0.0") + .intConf + .checkValue(_ > 0, "The number of stack traces in the DataFrame context must be positive.") + .createWithDefault(1) + /** * Holds information about keys that have been deprecated. 
* @@ -5465,6 +5472,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyRaiseErrorWithoutErrorClass: Boolean = getConf(SQLConf.LEGACY_RAISE_ERROR_WITHOUT_ERROR_CLASS) + def stackTracesInDataFrameContext: Int = getConf(SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 96bef83af0a86..877d9906a1cff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -22,6 +22,7 @@ import java.util.regex.Pattern import org.apache.spark.annotation.{DeveloperApi, Unstable} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.execution.SparkStrategy +import org.apache.spark.sql.internal.SQLConf /** * Allows the execution of relational queries, including those expressed in SQL using Spark. @@ -103,7 +104,9 @@ package object sql { while (i < st.length && !sparkCode(st(i))) i += 1 // Stop at the end of the first Spark code traces while (i < st.length && sparkCode(st(i))) i += 1 - val origin = Origin(stackTrace = Some(st.slice(i - 1, i + 1))) + val origin = Origin(stackTrace = Some(st.slice( + from = i - 1, + until = i + SQLConf.get.stackTracesInDataFrameContext))) CurrentOrigin.withOrigin(origin)(f) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala index 7d57eeb01bfa1..426822da3c912 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala @@ -25,14 +25,17 @@ import org.apache.spark.sql.test.SharedSparkSession class QueryContextSuite extends QueryTest with SharedSparkSession { test("summary of DataFrame context") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + withSQLConf( + SQLConf.ANSI_ENABLED.key -> "true", + SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT.key -> "2") { val e = intercept[SparkArithmeticException] { spark.range(1).select(lit(1) / lit(0)).collect() } assert(e.getQueryContext.head.summary() == """== DataFrame == |"div" was called from - |org.apache.spark.sql.errors.QueryContextSuite.$anonfun$new$3(QueryContextSuite.scala:30) + |org.apache.spark.sql.errors.QueryContextSuite.$anonfun$new$3(QueryContextSuite.scala:32) + |org.scalatest.Assertions.intercept(Assertions.scala:749) |""".stripMargin) } } From ade861d19910df724d9233df98c059ff9d57f795 Mon Sep 17 00:00:00 2001 From: wforget <643348094@qq.com> Date: Sun, 26 Nov 2023 23:28:52 +0800 Subject: [PATCH 09/40] [SPARK-45974][SQL] Add scan.filterAttributes non-empty judgment for RowLevelOperationRuntimeGroupFiltering ### What changes were proposed in this pull request? Add scan.filterAttributes non-empty judgment for RowLevelOperationRuntimeGroupFiltering. ### Why are the changes needed? When scan.filterAttributes is empty, an invalid dynamic pruning condition will be generated in RowLevelOperationRuntimeGroupFiltering. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? added test case ### Was this patch authored or co-authored using generative AI tooling? No Closes #43869 from wForget/SPARK-45974. 
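For illustration, a minimal sketch of the trigger case described above, assuming the public DataSource V2 interfaces referenced by this patch; the class and column names below are hypothetical and not taken from the change itself. A `SupportsRuntimeV2Filtering` scan over a non-partitioned table can legitimately report no filterable attributes, which is exactly the case the new `scan.filterAttributes().nonEmpty` guard skips before building a dynamic pruning condition.

```scala
import org.apache.spark.sql.connector.expressions.NamedReference
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeV2Filtering}
import org.apache.spark.sql.types.{IntegerType, StructType}

// Hypothetical scan of a non-partitioned table (names are illustrative only).
class NonPartitionedScan extends Scan with SupportsRuntimeV2Filtering {
  override def readSchema(): StructType =
    new StructType().add("pk", IntegerType).add("salary", IntegerType)

  // An empty array here is the case the new nonEmpty guard checks for:
  // there are no attributes to derive group-filtering keys from.
  override def filterAttributes(): Array[NamedReference] = Array.empty

  // Runtime predicates are never pushed for this scan, so this is a no-op.
  override def filter(predicates: Array[Predicate]): Unit = {}
}
```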
Authored-by: wforget <643348094@qq.com> Signed-off-by: Wenchen Fan --- ...wLevelOperationRuntimeGroupFiltering.scala | 4 ++- .../connector/MergeIntoTableSuiteBase.scala | 32 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala index b8288c636c386..7c28f91ee1cc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala @@ -51,7 +51,8 @@ class RowLevelOperationRuntimeGroupFiltering(optimizeSubqueries: Rule[LogicalPla // apply special dynamic filtering only for group-based row-level operations case GroupBasedRowLevelOperation(replaceData, _, Some(cond), DataSourceV2ScanRelation(_, scan: SupportsRuntimeV2Filtering, _, _, _)) - if conf.runtimeRowLevelOperationGroupFilterEnabled && cond != TrueLiteral => + if conf.runtimeRowLevelOperationGroupFilterEnabled && cond != TrueLiteral + && scan.filterAttributes().nonEmpty => // use reference equality on scan to find required scan relations val newQuery = replaceData.query transformUp { @@ -116,6 +117,7 @@ class RowLevelOperationRuntimeGroupFiltering(optimizeSubqueries: Rule[LogicalPla matchingRowsPlan: LogicalPlan, buildKeys: Seq[Attribute], pruningKeys: Seq[Attribute]): Expression = { + assert(buildKeys.nonEmpty && pruningKeys.nonEmpty) val buildQuery = Aggregate(buildKeys, buildKeys, matchingRowsPlan) DynamicPruningExpression( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index e7555c23fa4fc..5668e5981910c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -32,6 +32,38 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { import testImplicits._ + test("SPARK-45974: merge into non filter attributes table") { + val tableName: String = "cat.ns1.non_partitioned_table" + withTable(tableName) { + withTempView("source") { + val sourceRows = Seq( + (1, 100, "hr"), + (2, 200, "finance"), + (3, 300, "hr")) + sourceRows.toDF("pk", "salary", "dep").createOrReplaceTempView("source") + + sql(s"CREATE TABLE $tableName (pk INT NOT NULL, salary INT, dep STRING)".stripMargin) + + val df = sql( + s"""MERGE INTO $tableName t + |USING (select * from source) s + |ON t.pk = s.pk + |WHEN MATCHED THEN + | UPDATE SET t.salary = s.salary + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin) + + checkAnswer( + sql(s"SELECT * FROM $tableName"), + Seq( + Row(1, 100, "hr"), // insert + Row(2, 200, "finance"), // insert + Row(3, 300, "hr"))) // insert + } + } + } + test("merge into empty table with NOT MATCHED clause") { withTempView("source") { createTable("pk INT NOT NULL, salary INT, dep STRING") From d5fad63810149a69527706bb16333baee06a4270 Mon Sep 17 00:00:00 2001 From: Niranjan Jayakar Date: Mon, 27 Nov 2023 08:47:31 +0900 Subject: [PATCH 10/40] [SPARK-46074][CONNECT][SCALA] Insufficient details in error message on UDF failure ### What changes were proposed in this pull request? 
Update the error message for 'FAILED_EXECUTE_UDF' with the underlying error message. ### Why are the changes needed? The Spark Connect client does not receive the underlying cause for a UDF failure. This means that a user needs to go into the driver logs to identify the cause for failure. Update the error message so that the underlying exception's message is included. ### Does this PR introduce _any_ user-facing change? Yes. This changes the error message that the user sees when a UDF fails. A new error parameter is added but the SQL state and existing parameters are unchanged and should cause no regressions. The error message prior to this change: ``` org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 1 times, most recent failure: Lost task 3.0 in stage 0.0 (TID 3) (192.168.188.21 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (` (cmd2$Helper$$Lambda$2170/0x000000f001d23000)`: (int) => int). SQLSTATE: 39000 ``` Sample of the new error message: ``` org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] User defined function (` (cmd2$Helper$$Lambda$2422/0x0000007001ec1a10)`: (int) => int) failed due to: java.lang.NoClassDefFoundError: com/nija/test/MyClass. SQLSTATE: 39000 ``` ### How was this patch tested? Tested manually by running a [local connect server] and [connect client REPL] [local connect server]: https://github.com/apache/spark/blob/master/connector/connect/bin/spark-connect-shell [connect client REPL]: https://github.com/apache/spark/blob/master/connector/connect/bin/spark-connect-scala-client ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43983 from nija-at/udf-error-msg. Authored-by: Niranjan Jayakar Signed-off-by: Hyukjin Kwon --- .../src/main/resources/error/error-classes.json | 2 +- docs/sql-error-conditions.md | 2 +- .../spark/sql/errors/QueryExecutionErrors.scala | 3 ++- .../sql/catalyst/expressions/ScalaUDFSuite.scala | 6 ++++-- .../sql/errors/QueryExecutionErrorsSuite.scala | 6 ++++-- .../spark/sql/hive/execution/HiveUDFSuite.scala | 14 ++++++++++++-- 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 19b70307a1cdd..5b70edf249d14 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1067,7 +1067,7 @@ }, "FAILED_EXECUTE_UDF" : { "message" : [ - "Failed to execute user defined function (: () => )." + "User defined function (: () => ) failed due to: ." ], "sqlState" : "39000" }, diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index c0f88bffa6e5b..71abf10da328b 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -643,7 +643,7 @@ Column expression `` cannot be sorted because its type `` is not [SQLSTATE: 39000](sql-error-conditions-sqlstates.html#class-39-external-routine-invocation-exception) -Failed to execute user defined function (``: (``) => ``). +User defined function (``: (``) => ``) failed due to: ``. 
### FAILED_FUNCTION_CALL diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 1aa25a51fa9c6..24332479f1937 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -190,7 +190,8 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE messageParameters = Map( "functionName" -> toSQLId(functionName), "signature" -> inputTypes, - "result" -> outputType), + "result" -> outputType, + "reason" -> e.toString), cause = e) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index 1b40e02aa8662..00fc9d462eb65 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -50,13 +50,15 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create(null, StringType) :: Nil, Option(resolvedEncoder[String]()) :: Nil) + val pattern = "User defined function .+ failed due to: java.lang.NullPointerException".r + val e1 = intercept[SparkException](udf.eval()) - assert(e1.getMessage.contains("Failed to execute user defined function")) + assert(pattern.findFirstIn(e1.getMessage).isDefined) val e2 = intercept[SparkException] { checkEvaluationWithUnsafeProjection(udf, null) } - assert(e2.getMessage.contains("Failed to execute user defined function")) + assert(pattern.findFirstIn(e2.getMessage).isDefined) } test("SPARK-22695: ScalaUDF should not use global variables") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala index a49352cbe5080..1e869bfd25aa9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala @@ -431,7 +431,8 @@ class QueryExecutionErrorsSuite parameters = Map( "functionName" -> functionNameRegex, "signature" -> "string, int", - "result" -> "string"), + "result" -> "string", + "reason" -> "java.lang.StringIndexOutOfBoundsException: begin 5, end 6, length 5"), matchPVals = true) } @@ -455,7 +456,8 @@ class QueryExecutionErrorsSuite errorClass = "FAILED_EXECUTE_UDF", parameters = Map("functionName" -> functionNameRegex, "signature" -> "string, int", - "result" -> "string"), + "result" -> "string", + "reason" -> "java.lang.StringIndexOutOfBoundsException: begin 5, end 6, length 5"), matchPVals = true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 3813071b680c9..096b11feb9bcd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -754,7 +754,9 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { "functionName" -> "`org`.`apache`.`hadoop`.`hive`.`ql`.`udf`.`generic`.`GenericUDFAssertTrue`", "signature" -> "boolean", - "result" -> "void")) + "result" -> "void", + 
"reason" -> + "org.apache.hadoop.hive.ql.metadata.HiveException: ASSERT_TRUE(): assertion failed.")) } } } @@ -778,6 +780,13 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { withTable("HiveSimpleUDFTable") { sql(s"create table HiveSimpleUDFTable as select false as v") val df = sql("SELECT CodeGenHiveSimpleUDF(v) from HiveSimpleUDFTable") + + val reason = """ + |org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public + |boolean org.apache.spark.sql.hive.execution.SimpleUDFAssertTrue.evaluate(boolean) with + |arguments {false}:ASSERT_TRUE(): assertion failed.""" + .stripMargin.replaceAll("\n", " ").trim + checkError( exception = intercept[SparkException](df.collect()).getCause.asInstanceOf[SparkException], errorClass = "FAILED_EXECUTE_UDF", @@ -785,7 +794,8 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { "functionName" -> "`org`.`apache`.`spark`.`sql`.`hive`.`execution`.`SimpleUDFAssertTrue`", "signature" -> "boolean", - "result" -> "boolean" + "result" -> "boolean", + "reason" -> reason ) ) } From 400db88d00e50750513d733be697b6b2dd9043d3 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 08:49:18 +0900 Subject: [PATCH 11/40] [SPARK-46103][PYTHON][INFRA][BUILD][DOCS] Enhancing PySpark documentation ### What changes were proposed in this pull request? This PR proposes to enhance the PySpark documentation by leveraging modern Sphinx features and functionalities. The primary objective is to improve the overall user experience and readability of the documentation. To achieve this, the PR includes an upgrade of `Sphinx` and `Jinja2` to their newer/latest versions, enabling us to use the latest `pydata_sphinx_theme` features such as light/dark mode toggling. ### Why are the changes needed? Currently, the PySpark documentation is unable to utilize many of the advanced features available in recent `Sphinx` versions due to older package versions. This limitation hinders the documentation's visual appeal and usability, particularly when compared to other projects like Pandas which have already adopted these enhancements. For example: ## Pandas API reference (better layout / switching light & dark mode available) ### Dark mode Screenshot 2023-11-26 at 5 43 29 AM ### Light mode Screenshot 2023-11-26 at 5 45 01 AM ## PySpark API reference (less readable compare to pandas / no light & dark mode) Screenshot 2023-11-26 at 5 43 48 AM By updating the `Sphinx` and `Jinja2` versions, we can significantly improve the documentation's layout, design, and interactive features, thereby enhancing the end-user experience. ### Does this PR introduce _any_ user-facing change? No API changes, but users will notice a more modern and user-friendly interface in the PySpark documentation. New features like light/dark mode and improved page layouts will be available as below: ## Before Screenshot 2023-11-26 at 5 43 48 AM ## After ### Dark mode Screenshot 2023-11-26 at 6 17 13 AM ### Light mode Screenshot 2023-11-26 at 6 16 47 AM ### How was this patch tested? Manually built docs from local environment, and also tested combinations between various `Jinja2`, `Sphinx` and `pydata_sphinx_theme` versions for best document rendering. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44012 from itholic/upgrade_sphinx. 
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- .github/workflows/build_and_test.yml | 2 +- dev/requirements.txt | 6 +- .../docs/source/_static/spark-logo-dark.png | Bin 0 -> 23555 bytes .../docs/source/_static/spark-logo-light.png | Bin 0 -> 18773 bytes .../autosummary/accessor_attribute.rst | 6 ++ .../autosummary/accessor_method.rst | 6 ++ .../autosummary/class_with_docs.rst | 4 +- .../_templates/autosummary/plot_class.rst | 53 ++++++++++++++++++ python/docs/source/conf.py | 6 +- .../source/reference/pyspark.pandas/frame.rst | 8 ++- .../reference/pyspark.pandas/indexing.rst | 12 ++++ .../source/reference/pyspark.pandas/io.rst | 5 ++ .../reference/pyspark.pandas/series.rst | 22 +++++++- .../reference/pyspark.sql/spark_session.rst | 14 +++++ 14 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 python/docs/source/_static/spark-logo-dark.png create mode 100644 python/docs/source/_static/spark-logo-light.png create mode 100644 python/docs/source/_templates/autosummary/accessor_attribute.rst create mode 100644 python/docs/source/_templates/autosummary/accessor_method.rst create mode 100644 python/docs/source/_templates/autosummary/plot_class.rst diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5033ab00601ab..a4c9ec3042582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -751,7 +751,7 @@ jobs: # See also https://issues.apache.org/jira/browse/SPARK-35375. # Pin the MarkupSafe to 2.0.1 to resolve the CI error. # See also https://issues.apache.org/jira/browse/SPARK-38279. - python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme sphinx-copybutton nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' + python3.9 -m pip install 'sphinx==4.2.0' mkdocs 'pydata_sphinx_theme==0.13' sphinx-copybutton nbsphinx numpydoc jinja2 'markupsafe==2.0.1' 'pyzmq<24.0.0' python3.9 -m pip install ipython_genutils # See SPARK-38517 python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 diff --git a/dev/requirements.txt b/dev/requirements.txt index 7de55ec24968a..a7af0907c7264 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -31,12 +31,12 @@ pandas-stubs<1.2.0.54 mkdocs # Documentation (Python) -pydata_sphinx_theme +pydata_sphinx_theme==0.13 ipython nbsphinx numpydoc -jinja2<3.0.0 -sphinx<3.1.0 +jinja2 +sphinx==4.2.0 sphinx-plotly-directive sphinx-copybutton docutils<0.18.0 diff --git a/python/docs/source/_static/spark-logo-dark.png b/python/docs/source/_static/spark-logo-dark.png new file mode 100644 index 0000000000000000000000000000000000000000..7460faec37fc7923d11d968fe8bc9d90f72381fd GIT binary patch literal 23555 zcmZ^~1ymg0vM-DWhaka%LvRU$%itC~cyNc{u7g{EAi;vW1P|^`aJS&@KIp)}JpSk0 zchCLqdSCb2y{oJ0SJJ(Et!@2HRaq7bofI7o4h~CRPU`b({qkBuQBhv6H5@5-uN9J& zxRN*=-0wIHs0s4xJ(Zcmd$pKx$QPMNK0 z!mol*3mth&B_%lKR~i)#86FP~@s)yqJ>lR<;r_AqO2K`CC;M;uGd$ye=^((tMcBY0 z{g=*{*ZQAB?zQ}b`FD+&3;!RBxd{KQ{gR9L-}GyYaB#MT5Vh9|%}Gwz6%Gyq|DOU6 zm!3%s2Zu0XqoL!bqogQk=4j7uV(w^a!R}@6^bZP7*h}!0w6}0Gq4Kh~b8r>(5~2B* zhTtpx51WIA>R&2uwjwk-N~%^r7jsL&&r&k~1%JI0p|N&z za}wm>@bvU#_XMyzx>#|16c7;L;N;@q;$nN%U~~0$a5M2@b8w~ow~_yAN6Nz0%*DpZ z&BoDz>L0r%rjG7zA~ZDrIQsA7-*LLxSpLV8gX@2#^_n2ZKN60Q?3^6`FPMdw&HoSV zAIZOA|H|v%?u7p_CaC+_!qw5v{hzRad|bl+^6>v*{wKbF1OB70>0sjq{J)U@#s1$Y zo&U)H5%Bs1{^RF=Nd5~c%<<3L{vU7u z-%aaZ+}E}RL>K1x?{*4AX9^!qgM$-;la~_L@Pa?fLJQQFZn%H)nEVkIN*{<@t0?hK 
[... base85-encoded GIT binary patch data for python/docs/source/_static/spark-logo-dark.png and spark-logo-light.png elided ...]
z^X8IXx+9&+^lz_sNfXy#S^2lrdd2hdr?J14I$T>wy&|t7_%~x0I3f?;3|YhuAXBB! z0ms5H>uARL(lIJ6Y0iT?!z&}9^bji8ob~HBaEnaxr__U`k~w)C=B0yrz6-3xVSFd? zIMdvEgJ&#xKLY%JCSt}o&=ZjWnr3BC+D`a-R3KYLMQiy_eSO0*#6F#LUm$!_dOlM1 zAb+ZYOCA>GEojGxV;wea+Kh4Jp*DZ)*7KF?xoFT>L76y0veU|&!jH_>R=jdF9uB3?4a z7uoUULQ4GZWS)@~V%q5#6HbR1Iq_9?H6av`pGAWB1LHnxiC;&HC`SxG->cUiZ>O$x z&Q2-#TV?vl!LqVLhc)&xQu!9un4iDP!#K4c`S==Y)6q~~jh~FOxA4$U|*L)4dZ(p7sJQZo# zl6*-?u?Il=l01D(j#)5byozD!Z7P(MQ-*73NKEh2MH^6GpW&VJAr-V&uUR+MeAcWtq;V*TgYMTjHHj&? zD26w<=|KaHUo#Wny&j8MvCW$k(f~9&==)B$ZmXrvX+3-P{Cx4^#TUpsw-}F?Uz|n5 zd_j*Medy3;rF@F~+f0mVd;~`>Bdj8S2a%V3D7c=`!}3I3)I$y)&{m4E1egyJiNwd! zGpXYCgJU}JZ;W|PgMpr4(%aF%{FlNw))UAHkoD)AHf`D!Yv?Y->yAN)<-JBS5;1A$ ztTi0Gu?mK-Mj%QujtZiuRN+hkNzW&@|Hz(!&g6^p5cDSA*GD4puMDH(m*gp9daYDx zhj_LP4iqf(TO(Us|75)U^5mHa=MV*ds;Z9?eT-!8YLsFYJZz5Q|(zz`dlRB}v-a%yDK3e94gUJpaEc zE8AS`7ENFLVPMFC?S%u|gwV?zy*xtPj4U2(FZXq4Bz27(C&qv5)y&$E_3jHPJ&r-8us*lObWw74!{96M6_?m z`u*PAmKzHedbwb;_$`|4n+qXzAqPswfp#zS%+Jd``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.spark.frame DataFrame.spark.cache @@ -319,8 +320,8 @@ specific plotting methods of the form ``DataFrame.plot.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst - DataFrame.plot DataFrame.plot.area DataFrame.plot.barh DataFrame.plot.bar @@ -330,6 +331,10 @@ specific plotting methods of the form ``DataFrame.plot.``. DataFrame.plot.pie DataFrame.plot.scatter DataFrame.plot.density + +.. autosummary:: + :toctree: api/ + DataFrame.hist DataFrame.boxplot DataFrame.kde @@ -341,6 +346,7 @@ These can be accessed by ``DataFrame.pandas_on_spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.pandas_on_spark.apply_batch DataFrame.pandas_on_spark.transform_batch diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index 7ec4387bb679a..301e849ffe28a 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -129,8 +129,14 @@ in Spark. These can be accessed by ``Index.spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Index.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Index.spark.transform Sorting @@ -308,9 +314,15 @@ in Spark. These can be accessed by ``MultiIndex.spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst MultiIndex.spark.data_type MultiIndex.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + MultiIndex.spark.transform MultiIndex Sorting diff --git a/python/docs/source/reference/pyspark.pandas/io.rst b/python/docs/source/reference/pyspark.pandas/io.rst index 118dd49a4ada9..fd41a03699cac 100644 --- a/python/docs/source/reference/pyspark.pandas/io.rst +++ b/python/docs/source/reference/pyspark.pandas/io.rst @@ -69,6 +69,11 @@ Generic Spark I/O :toctree: api/ read_spark_io + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + DataFrame.spark.to_spark_io Flat File / CSV diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index 01fb5aa87fb15..88d1861c6ccf0 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -270,8 +270,14 @@ in Spark. These can be accessed by ``Series.spark.``. .. 
autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Series.spark.transform Series.spark.apply @@ -304,6 +310,7 @@ Datetime Properties .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.dt.date Series.dt.year @@ -333,6 +340,7 @@ Datetime Methods .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.dt.normalize Series.dt.strftime @@ -353,6 +361,7 @@ like ``Series.str.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.str.capitalize Series.str.cat @@ -416,10 +425,16 @@ the ``Series.cat`` accessor. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.cat.categories Series.cat.ordered Series.cat.codes + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Series.cat.rename_categories Series.cat.reorder_categories Series.cat.add_categories @@ -438,8 +453,8 @@ specific plotting methods of the form ``Series.plot.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst - Series.plot Series.plot.area Series.plot.bar Series.plot.barh @@ -449,6 +464,10 @@ specific plotting methods of the form ``Series.plot.``. Series.plot.line Series.plot.pie Series.plot.kde + +.. autosummary:: + :toctree: api/ + Series.hist Serialization / IO / Conversion @@ -476,6 +495,7 @@ These can be accessed by ``Series.pandas_on_spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.pandas_on_spark.transform_batch diff --git a/python/docs/source/reference/pyspark.sql/spark_session.rst b/python/docs/source/reference/pyspark.sql/spark_session.rst index f25dbab5f6b9b..f242e4439cf4c 100644 --- a/python/docs/source/reference/pyspark.sql/spark_session.rst +++ b/python/docs/source/reference/pyspark.sql/spark_session.rst @@ -29,12 +29,21 @@ See also :class:`SparkSession`. :toctree: api/ SparkSession.active + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + SparkSession.builder.appName SparkSession.builder.config SparkSession.builder.enableHiveSupport SparkSession.builder.getOrCreate SparkSession.builder.master SparkSession.builder.remote + +.. autosummary:: + :toctree: api/ + SparkSession.catalog SparkSession.conf SparkSession.createDataFrame @@ -58,8 +67,13 @@ Spark Connect Only .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst SparkSession.builder.create + +.. autosummary:: + :toctree: api/ + SparkSession.addArtifact SparkSession.addArtifacts SparkSession.copyFromLocalToFs From d971dc461f5c461fb4972e83fe70ae8b2ef27eeb Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 08:59:24 +0900 Subject: [PATCH 12/40] [SPARK-46084][PS][FOLLOWUP] More refactoring by using `create_map` ### What changes were proposed in this pull request? This PR follows-up for https://github.com/apache/spark/pull/43993 to make more refactoring for `CategoricalOps`. ### Why are the changes needed? To optimize performance/debuggability/readability by using official API ### Does this PR introduce _any_ user-facing change? No, it's internal refactoring ### How was this patch tested? The existing CI should pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44015 from itholic/refactor_remaining_create_map. 
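The diff below replaces a per-category chain of `F.when(...).otherwise(...)` with a single `F.create_map` lookup. A small standalone sketch of the same pattern (toy column and category names, not taken from the patch):

```python
# Toy sketch of the create_map pattern shown in the diff below:
# build one map<code, category> literal and index it with the code column,
# instead of chaining F.when(...).otherwise(...) once per category.
from itertools import chain
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
categories = ["hr", "finance", "sales"]  # hypothetical category labels
df = spark.createDataFrame([(0,), (2,), (1,)], "code int")

kvs = chain(*[(F.lit(code), F.lit(cat)) for code, cat in enumerate(categories)])
map_col = F.create_map(*kvs)             # single MapType literal column
df.select("code", map_col[F.col("code")].alias("category")).show()
```

The map literal stays one flat expression, while the chained `when`/`otherwise` nests one level per category, which is part of the readability/performance motivation cited above.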
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/data_type_ops/categorical_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py index bbaded42be905..824666b5819b3 100644 --- a/python/pyspark/pandas/data_type_ops/categorical_ops.py +++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py @@ -15,6 +15,7 @@ # limitations under the License. # +from itertools import chain from typing import cast, Any, Union import pandas as pd @@ -134,7 +135,7 @@ def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike: if len(categories) == 0: scol = F.lit(None) else: - scol = F.lit(None) - for code, category in reversed(list(enumerate(categories))): - scol = F.when(index_ops.spark.column == F.lit(code), F.lit(category)).otherwise(scol) + kvs = chain(*[(F.lit(code), F.lit(category)) for code, category in enumerate(categories)]) + map_scol = F.create_map(*kvs) + scol = map_scol[index_ops.spark.column] return index_ops._with_new_scol(scol) From ef27b9b15687dad416b6353409b1b44bc1451885 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 09:00:11 +0900 Subject: [PATCH 13/40] [SPARK-46099][PS][DOCS] Refactor "Supported pandas API" generation script ### What changes were proposed in this pull request? This PR proposes to refactor the script used to generate the [Supported pandas API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/supported_pandas_api.html) documentation. The script has been restructured for better readability and maintainability. The refactoring includes: - Simplifying complex functions and breaking them into smaller, more manageable pieces. - Improving variable and function naming for clarity. - Adding comprehensive docstrings in the NumPy docstyle. - Streamlining the flow of the script to enhance logical coherence. ### Why are the changes needed? The previous version of the script was hard to understand and maintain due to its complexity and lack of documentation. This refactoring makes the script more accessible to new contributors and easier to modify or extend in the future. It also ensures that the script adheres to best practices in Python coding, making it a more reliable tool for generating accurate and up-to-date documentation. ### Does this PR introduce _any_ user-facing change? No user-facing changes. This PR only affects the internal documentation generation process. ### How was this patch tested? Tested by generating the documentation manually and verifying that the output remains consistent with the previous version. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44010 from itholic/refactor_doc_gen_script. Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/supported_api_gen.py | 188 ++++++++++++++------- 1 file changed, 124 insertions(+), 64 deletions(-) diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index 27d5cd4b37f9d..1f893520d2cef 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -33,13 +33,11 @@ from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError +# Constants MAX_MISSING_PARAMS_SIZE = 5 -COMMON_PARAMETER_SET = { - "kwargs", - "args", - "cls", -} # These are not counted as missing parameters. 
+COMMON_PARAMETER_SET = {"kwargs", "args", "cls"} MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)] +PANDAS_LATEST_VERSION = "2.1.3" RST_HEADER = """ ===================== @@ -73,6 +71,10 @@ @unique class Implemented(Enum): + """ + Enumeration of implementation statuses. + """ + IMPLEMENTED = "Y" NOT_IMPLEMENTED = "N" PARTIALLY_IMPLEMENTED = "P" @@ -80,7 +82,7 @@ class Implemented(Enum): class SupportedStatus(NamedTuple): """ - Defines a supported status for specific pandas API + Defines a supported status for specific pandas API. """ implemented: str @@ -89,47 +91,108 @@ class SupportedStatus(NamedTuple): def generate_supported_api(output_rst_file_path: str) -> None: """ - Generate supported APIs status dictionary. + Generate the supported APIs status dictionary and write it to an RST file. Parameters ---------- output_rst_file_path : str The path to the document file in RST format. + """ + _check_pandas_version() + all_supported_status = _collect_supported_status() + _write_rst(output_rst_file_path, all_supported_status) + - Write supported APIs documentation. +def _check_pandas_version() -> None: """ - pandas_latest_version = "2.1.3" - if LooseVersion(pd.__version__) != LooseVersion(pandas_latest_version): + Check if the installed pandas version matches the expected version. + """ + if LooseVersion(pd.__version__) != LooseVersion(PANDAS_LATEST_VERSION): msg = ( - "Warning: Latest version of pandas (%s) is required to generate the documentation; " - "however, your version was %s" % (pandas_latest_version, pd.__version__) + f"Warning: pandas {PANDAS_LATEST_VERSION} is required; your version is {pd.__version__}" ) warnings.warn(msg, UserWarning) raise ImportError(msg) + +def _collect_supported_status() -> Dict[Tuple[str, str], Dict[str, SupportedStatus]]: + """ + Collect the supported status across multiple module paths. + """ all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]] = {} for pd_module_group, ps_module_group in MODULE_GROUP_MATCH: pd_modules = _get_pd_modules(pd_module_group) _update_all_supported_status( all_supported_status, pd_modules, pd_module_group, ps_module_group ) - _write_rst(output_rst_file_path, all_supported_status) + return all_supported_status + + +def _get_pd_modules(pd_module_group: Any) -> List[str]: + """ + Get sorted list of pandas member names from a pandas module. + + Parameters + ---------- + pd_module_group : Any + Importable pandas module. + + Returns + ------- + List[str] + Sorted list of member names. + """ + return sorted(m[0] for m in getmembers(pd_module_group, isclass) if not m[0].startswith("_")) + + +def _update_all_supported_status( + all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], + pd_modules: List[str], + pd_module_group: Any, + ps_module_group: Any, +) -> None: + """ + Update the supported status dictionary with status from multiple modules. + + Parameters + ---------- + all_supported_status : Dict[Tuple[str, str], Dict[str, SupportedStatus]] + The dictionary to update with supported statuses. + pd_modules : List[str] + List of module names in pandas. + pd_module_group : Any + Importable pandas module group. + ps_module_group : Any + Corresponding pyspark.pandas module group. 
+ """ + pd_modules.append("") # Include General Function APIs + for module_name in pd_modules: + supported_status = _create_supported_by_module( + module_name, pd_module_group, ps_module_group + ) + if supported_status: + all_supported_status[(module_name, ps_module_group.__name__)] = supported_status def _create_supported_by_module( module_name: str, pd_module_group: Any, ps_module_group: Any ) -> Dict[str, SupportedStatus]: """ - Retrieves supported status of pandas module + Create a dictionary of supported status for a specific pandas module. Parameters ---------- module_name : str - Class name that exists in the path of the module. + Name of the module in pandas. pd_module_group : Any - Specific path of importable pandas module. - ps_module_group: Any - Specific path of importable pyspark.pandas module. + Importable pandas module. + ps_module_group : Any + Corresponding pyspark.pandas module. + + Returns + ------- + Dict[str, SupportedStatus] + Dictionary of supported status for the module. """ pd_module = getattr(pd_module_group, module_name) if module_name else pd_module_group try: @@ -157,7 +220,7 @@ def _organize_by_implementation_status( ps_module_group: Any, ) -> Dict[str, SupportedStatus]: """ - Check the implementation status and parameters of both modules. + Organize functions by implementation status between pandas and pyspark.pandas. Parameters ---------- @@ -171,6 +234,11 @@ def _organize_by_implementation_status( Specific path of importable pandas module. ps_module_group: Any Specific path of importable pyspark.pandas module. + + Returns + ------- + Dict[str, SupportedStatus] + Dictionary of implementation status. """ pd_dict = {} for pd_func_name, pd_func in pd_funcs.items(): @@ -214,7 +282,7 @@ def _transform_missing( ps_module_path: str, ) -> str: """ - Transform missing parameters into table information string. + Transform missing parameters into a formatted string for table display. Parameters ---------- @@ -229,6 +297,11 @@ def _transform_missing( ps_module_path : str Path string of pyspark.pandas module. + Returns + ------- + str + Formatted string representing missing parameters. + Examples -------- >>> _transform_missing("DataFrame", "add", {"axis", "fill_value", "level"}, @@ -251,47 +324,6 @@ def _transform_missing( return missing_str -def _get_pd_modules(pd_module_group: Any) -> List[str]: - """ - Returns sorted pandas member list from pandas module path. - - Parameters - ---------- - pd_module_group : Any - Specific path of importable pandas module. - """ - return sorted([m[0] for m in getmembers(pd_module_group, isclass) if not m[0].startswith("_")]) - - -def _update_all_supported_status( - all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], - pd_modules: List[str], - pd_module_group: Any, - ps_module_group: Any, -) -> None: - """ - Updates supported status across multiple module paths. - - Parameters - ---------- - all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]] - Data that stores the supported status across multiple module paths. - pd_modules: List[str] - Name list of pandas modules. - pd_module_group : Any - Specific path of importable pandas module. - ps_module_group: Any - Specific path of importable pyspark.pandas module. 
- """ - pd_modules += [""] # for General Function APIs - for module_name in pd_modules: - supported_status = _create_supported_by_module( - module_name, pd_module_group, ps_module_group - ) - if supported_status: - all_supported_status[(module_name, ps_module_group.__name__)] = supported_status - - def _write_table( module_name: str, module_path: str, @@ -299,7 +331,18 @@ def _write_table( w_fd: TextIO, ) -> None: """ - Write table by using Sphinx list-table directive. + Write the support status in a table format using Sphinx list-table directive. + + Parameters + ---------- + module_name : str + The name of the module whose support status is being documented. + module_path : str + The import path of the module in the documentation. + supported_status : Dict[str, SupportedStatus] + A dictionary mapping each function name to its support status. + w_fd : TextIO + An open file descriptor where the table will be written. """ lines = [] if module_name: @@ -336,7 +379,17 @@ def _write_table( def _escape_func_str(func_str: str) -> str: """ - Transforms which affecting rst data format. + Escape function names to conform to RST format. + + Parameters + ---------- + func_str : str + Function name to escape. + + Returns + ------- + str + Escaped function name. """ # TODO: Take into account that this function can create links incorrectly # We can create alias links or links to parent methods @@ -351,7 +404,14 @@ def _write_rst( all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], ) -> None: """ - Writes the documentation to the target file path. + Write the final RST file with the collected support status. + + Parameters + ---------- + output_rst_file_path : str + Path to the output RST file. + all_supported_status : Dict + Collected support status data. """ with open(output_rst_file_path, "w") as w_fd: w_fd.write(RST_HEADER) From e1a2255f99be88e776295f30f995b339c3e4b5af Mon Sep 17 00:00:00 2001 From: hannahkamundson Date: Mon, 27 Nov 2023 10:38:22 +0800 Subject: [PATCH 14/40] [SPARK-45699][BUILD][CORE][SQL][SS][CONNECT][MLLIB][ML][DSTREAM][GRAPHX][K8S][UI] Fixing all compilation warnings related to widening conversions ### What changes were proposed in this pull request? 1. Change the silencing of the widening conversion compilation warnings in the parent `pom.xml` and `SparkBuild` to throw an error 2. All widening conversion compilation warnings were removed. This almost exclusively involved adding `.toDouble` to longs. However, it also involved some `.toFloat` on ints and longs. ### Why are the changes needed? It allows us to upgrade to Scala 2.13 without adding a bunch of compilation issues. This is removing the following compilation error ```shell [error] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala:1207:60: Widening conversion from Long to Double is deprecated because it loses precision. Write `.toDouble` instead. [quickfixable] [error] Applicable -Wconf / nowarn filters for this fatal warning: msg=, cat=deprecation, site=org.apache.spark.scheduler.TaskSetManager.checkSpeculatableTasks [error] foundTasks = checkAndSubmitSpeculatableTasks(timeMs, threshold, customizedThreshold = true) ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? No tests were added. For every profile (including the base profile), I ran `mvn clean compile test-compile`. I then `grep`ed any lines that had the word `Wide` in it. I determined I was done when no output remained. 
Here is a script of what was run: ```shell mvn clean compile test-compile |& tee output.txt cat output.txt | grep .*Wide.* |& tee output-widening.txt mvn clean compile test-compile -Pspark-ganglia-lgpl |& tee output-spark-ganglia-lgpl.txt cat output-spark-ganglia-lgpl.txt | grep .*Wide.* |& tee output-spark-ganglia-lgpl-widening.txt mvn clean compile test-compile -Pkinesis-asl |& tee output-kinesis-asl.txt cat output-kinesis-asl.txt | grep .*Wide.* |& tee output-kinesis-asl-widening.txt mvn clean compile test-compile -Pdocker-integration-tests |& tee output-docker-integration-tests.txt cat output-docker-integration-tests.txt | grep .*Wide.* |& tee output-docker-integration-tests-widening.txt mvn clean compile test-compile -Pyarn \& tee output-yarn.txt cat output-yarn.txt | grep .*Wide.* |& tee output-yarn-widening.txt mvn clean compile test-compile -Pkubernetes |& tee output-kubernetes.txt cat output-kubernetes.txt | grep .*Wide.* |& tee output-kubernetes-widening.txt mvn clean compile test-compile -Pkubernetes-integration-tests |& tee output-kubernetes-integration-tests.txt cat output-integration-tests.txt | grep .*Wide.* |& tee output-integration-tests-widening.txt mvn clean compile test-compile -Phive-thriftserver |& tee output-hive-thriftserver.txt cat output-thriftserver.txt | grep .*Wide.* |& tee output-thriftserver-widening.txt mvn clean compile test-compile -Phadoop-cloud |& tee output-hadoop-cloud.txt cat output-hadoop-cloud.txt | grep .*Wide.* |& tee output-hadoop-cloud-widening.txt ``` ### Was this patch authored or co-authored using generative AI tooling? No Closes #43890 from hannahkamundson/SPARK-45699. Authored-by: hannahkamundson Signed-off-by: yangjie01 --- .../types/UTF8StringPropertyCheckSuite.scala | 4 +- .../client/arrow/ArrowVectorReader.scala | 6 +-- .../kafka010/DirectKafkaInputDStream.scala | 2 +- .../spark/streaming/kafka010/KafkaRDD.scala | 2 +- .../kafka010/DirectKafkaStreamSuite.scala | 2 +- .../input/FixedLengthBinaryInputFormat.scala | 2 +- .../spark/metrics/sink/StatsdReporter.scala | 4 +- .../apache/spark/partial/CountEvaluator.scala | 5 +- .../spark/partial/GroupedCountEvaluator.scala | 4 +- .../apache/spark/resource/ResourceUtils.scala | 2 +- .../apache/spark/scheduler/MapStatus.scala | 8 +-- .../spark/scheduler/TaskSetManager.scala | 7 +-- .../scala/org/apache/spark/util/Clock.scala | 2 +- .../util/random/StratifiedSamplingUtils.scala | 4 +- .../apache/spark/benchmark/Benchmark.scala | 2 +- .../deploy/history/EventLogTestHelper.scala | 2 +- .../spark/status/AppStatusStoreSuite.scala | 54 ++++++++++--------- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 2 +- .../spark/graphx/lib/PageRankSuite.scala | 2 +- .../GeneralizedLinearRegression.scala | 2 +- .../org/apache/spark/ml/stat/ANOVATest.scala | 2 +- .../org/apache/spark/ml/stat/FValueTest.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../mllib/clustering/StreamingKMeans.scala | 2 +- .../spark/mllib/fpm/AssociationRules.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 5 +- .../correlation/SpearmanCorrelation.scala | 2 +- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../mllib/stat/test/StreamingTestMethod.scala | 2 +- pom.xml | 2 +- project/SparkBuild.scala | 2 +- .../cluster/k8s/ExecutorRollPlugin.scala | 17 +++--- .../plans/logical/basicLogicalOperators.scala | 3 +- .../statsEstimation/EstimationUtils.scala | 4 +- .../sql/catalyst/util/QuantileSummaries.scala | 2 +- .../ui/StreamingQueryStatisticsPage.scala | 16 +++--- .../sql/StatisticsCollectionTestBase.scala | 8 
+-- .../PassThroughEncodingSuite.scala | 2 +- .../streaming/EventTimeWatermarkSuite.scala | 5 +- .../streaming/receiver/RateLimiter.scala | 4 +- .../spark/streaming/ui/StreamingPage.scala | 14 ++--- .../apache/spark/streaming/ui/UIUtils.scala | 8 +-- .../ExecutorAllocationManagerSuite.scala | 4 +- .../scheduler/RateControllerSuite.scala | 2 +- 44 files changed, 124 insertions(+), 110 deletions(-) diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index ab488e18ba3f4..75c56451592e4 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -80,7 +80,9 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp test("compare") { forAll { (s1: String, s2: String) => - assert(Math.signum(toUTF8(s1).compareTo(toUTF8(s2))) === Math.signum(s1.compareTo(s2))) + assert(Math.signum { + toUTF8(s1).compareTo(toUTF8(s2)).toFloat + } === Math.signum(s1.compareTo(s2).toFloat)) } } diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala index 488208574809b..53d8d46e62689 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala @@ -134,7 +134,7 @@ private[arrow] class SmallIntVectorReader(v: SmallIntVector) private[arrow] class IntVectorReader(v: IntVector) extends TypedArrowVectorReader[IntVector](v) { override def getInt(i: Int): Int = vector.get(i) override def getLong(i: Int): Long = getInt(i) - override def getFloat(i: Int): Float = getInt(i) + override def getFloat(i: Int): Float = getInt(i).toFloat override def getDouble(i: Int): Double = getInt(i) override def getString(i: Int): String = String.valueOf(getInt(i)) override def getJavaDecimal(i: Int): JBigDecimal = JBigDecimal.valueOf(getInt(i)) @@ -143,8 +143,8 @@ private[arrow] class IntVectorReader(v: IntVector) extends TypedArrowVectorReade private[arrow] class BigIntVectorReader(v: BigIntVector) extends TypedArrowVectorReader[BigIntVector](v) { override def getLong(i: Int): Long = vector.get(i) - override def getFloat(i: Int): Float = getLong(i) - override def getDouble(i: Int): Double = getLong(i) + override def getFloat(i: Int): Float = getLong(i).toFloat + override def getDouble(i: Int): Double = getLong(i).toDouble override def getString(i: Int): String = String.valueOf(getLong(i)) override def getJavaDecimal(i: Int): JBigDecimal = JBigDecimal.valueOf(getLong(i)) override def getTimestamp(i: Int): Timestamp = toJavaTimestamp(getLong(i) * MICROS_PER_SECOND) diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index f5967a74ad339..c412486ce197e 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -146,7 +146,7 @@ private[spark] class 
DirectKafkaInputDStream[K, V]( val maxRateLimitPerPartition = ppc.maxRatePerPartition(tp) val backpressureRate = lag / totalLag.toDouble * rate tp -> (if (maxRateLimitPerPartition > 0) { - Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate) + Math.min(backpressureRate, maxRateLimitPerPartition.toDouble)} else backpressureRate) } case None => offsets.map { case (tp, offset) => tp -> ppc.maxRatePerPartition(tp).toDouble } } diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala index 286b073125ff0..6c57091bc3c46 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala @@ -98,7 +98,7 @@ private[spark] class KafkaRDD[K, V]( if (compacted) { super.countApprox(timeout, confidence) } else { - val c = count() + val c = count().toDouble new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } diff --git a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala index faf114108fac5..28f0906258303 100644 --- a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala +++ b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala @@ -805,7 +805,7 @@ private[streaming] class ConstantEstimator(@volatile private var rate: Long) time: Long, elements: Long, processingDelay: Long, - schedulingDelay: Long): Option[Double] = Some(rate) + schedulingDelay: Long): Option[Double] = Some(rate.toDouble) } private[streaming] class ConstantRateController(id: Int, estimator: RateEstimator, rate: Long) diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala index 978afaffab30b..4897cf694ae8e 100644 --- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala +++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala @@ -74,7 +74,7 @@ private[spark] class FixedLengthBinaryInputFormat if (defaultSize < recordLength) { recordLength.toLong } else { - (Math.floor(defaultSize / recordLength) * recordLength).toLong + defaultSize / recordLength * recordLength } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala index 877f04b1adc01..189d390d37999 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala @@ -124,9 +124,9 @@ private[spark] class StatsdReporter( private def reportTimer(name: String, timer: Timer)(implicit socket: DatagramSocket): Unit = { val snapshot = timer.getSnapshot - send(fullName(name, "max"), format(convertDuration(snapshot.getMax)), TIMER) + send(fullName(name, "max"), format(convertDuration(snapshot.getMax.toDouble)), TIMER) send(fullName(name, "mean"), format(convertDuration(snapshot.getMean)), TIMER) - send(fullName(name, "min"), format(convertDuration(snapshot.getMin)), TIMER) + send(fullName(name, "min"), format(convertDuration(snapshot.getMin.toDouble)), TIMER) send(fullName(name, "stddev"), 
format(convertDuration(snapshot.getStdDev)), TIMER) send(fullName(name, "p50"), format(convertDuration(snapshot.getMedian)), TIMER) send(fullName(name, "p75"), format(convertDuration(snapshot.get75thPercentile)), TIMER) diff --git a/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala index cbee136871012..a974ca2f1a05b 100644 --- a/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala @@ -35,7 +35,7 @@ private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double) override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { - new BoundedDouble(sum, 1.0, sum, sum) + new BoundedDouble(sum.toDouble, 1.0, sum.toDouble, sum.toDouble) } else if (outputsMerged == 0 || sum == 0) { new BoundedDouble(0, 0.0, 0.0, Double.PositiveInfinity) } else { @@ -57,7 +57,8 @@ private[partial] object CountEvaluator { val low = dist.inverseCumulativeProbability((1 - confidence) / 2) val high = dist.inverseCumulativeProbability((1 + confidence) / 2) // Add 'sum' to each because distribution is just of remaining count, not observed - new BoundedDouble(sum + dist.getNumericalMean, confidence, sum + low, sum + high) + new BoundedDouble( + sum + dist.getNumericalMean, confidence, (sum + low).toDouble, (sum + high).toDouble) } diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala index d2b4187df5d50..7cd60815fadbe 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala @@ -41,7 +41,9 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { - sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap + sums.map { case (key, sum) => + (key, new BoundedDouble(sum.toDouble, 1.0, sum.toDouble, sum.toDouble)) + }.toMap } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 00c655f4a4f4d..fe08e8337f76f 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -476,7 +476,7 @@ private[spark] object ResourceUtils extends Logging { if (maxTaskPerExec < (execAmount * numParts / taskAmount)) { val origTaskAmount = treq.amount val taskReqStr = s"${origTaskAmount}/${numParts}" - val resourceNumSlots = Math.floor(execAmount * numParts / taskAmount).toInt + val resourceNumSlots = (execAmount * numParts / taskAmount).toInt val message = s"The configuration of resource: ${treq.resourceName} " + s"(exec = ${execAmount}, task = ${taskReqStr}, " + s"runnable tasks = ${resourceNumSlots}) will " + diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index d10cf55ed0d10..113521453ad7b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -95,7 +95,7 @@ private[spark] object MapStatus { } else if (size <= 1L) { 1 } else { - math.min(255, 
math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte + math.min(255, math.ceil(math.log(size.toDouble) / math.log(LOG_BASE)).toInt).toByte } } @@ -276,12 +276,12 @@ private[spark] object HighlyCompressedMapStatus { val skewSizeThreshold = Math.max( medianSize * accurateBlockSkewedFactor, - sortedSizes(totalNumBlocks - maxAccurateSkewedBlockNumber) + sortedSizes(totalNumBlocks - maxAccurateSkewedBlockNumber).toDouble ) - Math.min(shuffleAccurateBlockThreshold, skewSizeThreshold) + Math.min(shuffleAccurateBlockThreshold.toDouble, skewSizeThreshold) } else { // Disable skew detection if accurateBlockSkewedFactor <= 0 - shuffleAccurateBlockThreshold + shuffleAccurateBlockThreshold.toDouble } val hugeBlockSizes = mutable.Map.empty[Int, Byte] diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 6157a3e46c875..d17e6735c4ecf 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -809,7 +809,7 @@ private[spark] class TaskSetManager( info.markFinished(TaskState.FINISHED, clock.getTimeMillis()) if (speculationEnabled) { - successfulTaskDurations.insert(info.duration) + successfulTaskDurations.insert(info.duration.toDouble) taskProcessRateCalculator.foreach(_.updateAvgTaskProcessRate(tid, result)) } removeRunningTask(tid) @@ -1196,7 +1196,7 @@ private[spark] class TaskSetManager( val timeMs = clock.getTimeMillis() if (numSuccessfulTasks >= minFinishedForSpeculation) { val medianDuration = successfulTaskDurations.percentile() - val threshold = max(speculationMultiplier * medianDuration, minTimeToSpeculation) + val threshold = max(speculationMultiplier * medianDuration, minTimeToSpeculation.toDouble) // TODO: Threshold should also look at standard deviation of task durations and have a lower // bound based on that. logDebug("Task length threshold for speculation: " + threshold) @@ -1204,7 +1204,8 @@ private[spark] class TaskSetManager( } else if (isSpeculationThresholdSpecified && speculationTasksLessEqToSlots) { val threshold = speculationTaskDurationThresOpt.get logDebug(s"Tasks taking longer time than provided speculation threshold: $threshold") - foundTasks = checkAndSubmitSpeculatableTasks(timeMs, threshold, customizedThreshold = true) + foundTasks = checkAndSubmitSpeculatableTasks( + timeMs, threshold.toDouble, customizedThreshold = true) } // avoid more warning logs. 
if (foundTasks) { diff --git a/core/src/main/scala/org/apache/spark/util/Clock.scala b/core/src/main/scala/org/apache/spark/util/Clock.scala index 226f15d3d38c2..e0cb3f4188e6d 100644 --- a/core/src/main/scala/org/apache/spark/util/Clock.scala +++ b/core/src/main/scala/org/apache/spark/util/Clock.scala @@ -85,7 +85,7 @@ private[spark] class SystemClock extends Clock { return currentTime } - val pollTime = math.max(waitTime / 10.0, minPollTime).toLong + val pollTime = math.max(waitTime / 10.0, minPollTime.toDouble).toLong while (true) { currentTime = System.currentTimeMillis() diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala index f08cf44e4e12b..08e2ea01f623e 100644 --- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala @@ -98,8 +98,8 @@ private[spark] object StratifiedSamplingUtils extends Logging { if (acceptResult.areBoundsEmpty) { val n = counts.get(key) val sampleSize = math.ceil(n * fraction).toLong - val lmbd1 = PoissonBounds.getLowerBound(sampleSize) - val lmbd2 = PoissonBounds.getUpperBound(sampleSize) + val lmbd1 = PoissonBounds.getLowerBound(sampleSize.toDouble) + val lmbd2 = PoissonBounds.getUpperBound(sampleSize.toDouble) acceptResult.acceptBound = lmbd1 / n acceptResult.waitListBound = (lmbd2 - lmbd1) / n } diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 0b33e2a9426ce..e7315d6119be0 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -163,7 +163,7 @@ private[spark] class Benchmark( // scalastyle:on assert(runTimes.nonEmpty) val best = runTimes.min - val avg = runTimes.sum / runTimes.size + val avg = runTimes.sum.toDouble / runTimes.size val stdev = if (runTimes.size > 1) { math.sqrt(runTimes.map(time => (time - avg) * (time - avg)).sum / (runTimes.size - 1)) } else 0 diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala index ac89f60955eed..0161917f8853d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala @@ -56,7 +56,7 @@ object EventLogTestHelper { eventStr: String, desiredSize: Long): Seq[String] = { val stringLen = eventStr.getBytes(StandardCharsets.UTF_8).length - val repeatCount = Math.floor(desiredSize / stringLen).toInt + val repeatCount = (desiredSize / stringLen).toInt (0 until repeatCount).map { _ => writer.writeEvent(eventStr, flushLogger = true) eventStr diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala index ccf6c9184cc96..f2b795764b7e8 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -170,40 +170,44 @@ class AppStatusStoreSuite extends SparkFunSuite { assert(actualQuantiles === expectedQuantiles) } - assertQuantiles(_.executorDeserializeTime, summary.executorDeserializeTime) - assertQuantiles(_.executorDeserializeCpuTime, summary.executorDeserializeCpuTime) - 
assertQuantiles(_.executorRunTime, summary.executorRunTime) - assertQuantiles(_.executorRunTime, summary.executorRunTime) - assertQuantiles(_.executorCpuTime, summary.executorCpuTime) - assertQuantiles(_.resultSize, summary.resultSize) - assertQuantiles(_.jvmGCTime, summary.jvmGcTime) - assertQuantiles(_.resultSerializationTime, summary.resultSerializationTime) - assertQuantiles(_.memoryBytesSpilled, summary.memoryBytesSpilled) - assertQuantiles(_.diskBytesSpilled, summary.diskBytesSpilled) - assertQuantiles(_.peakExecutionMemory, summary.peakExecutionMemory) - assertQuantiles(_.inputMetrics.bytesRead, summary.inputMetrics.bytesRead) - assertQuantiles(_.inputMetrics.recordsRead, summary.inputMetrics.recordsRead) - assertQuantiles(_.outputMetrics.bytesWritten, summary.outputMetrics.bytesWritten) - assertQuantiles(_.outputMetrics.recordsWritten, summary.outputMetrics.recordsWritten) - assertQuantiles(_.shuffleReadMetrics.remoteBlocksFetched, + assertQuantiles(_.executorDeserializeTime.toDouble, summary.executorDeserializeTime) + assertQuantiles(_.executorDeserializeCpuTime.toDouble, summary.executorDeserializeCpuTime) + assertQuantiles(_.executorRunTime.toDouble, summary.executorRunTime) + assertQuantiles(_.executorRunTime.toDouble, summary.executorRunTime) + assertQuantiles(_.executorCpuTime.toDouble, summary.executorCpuTime) + assertQuantiles(_.resultSize.toDouble, summary.resultSize) + assertQuantiles(_.jvmGCTime.toDouble, summary.jvmGcTime) + assertQuantiles(_.resultSerializationTime.toDouble, summary.resultSerializationTime) + assertQuantiles(_.memoryBytesSpilled.toDouble, summary.memoryBytesSpilled) + assertQuantiles(_.diskBytesSpilled.toDouble, summary.diskBytesSpilled) + assertQuantiles(_.peakExecutionMemory.toDouble, summary.peakExecutionMemory) + assertQuantiles(_.inputMetrics.bytesRead.toDouble, summary.inputMetrics.bytesRead) + assertQuantiles(_.inputMetrics.recordsRead.toDouble, summary.inputMetrics.recordsRead) + assertQuantiles(_.outputMetrics.bytesWritten.toDouble, summary.outputMetrics.bytesWritten) + assertQuantiles(_.outputMetrics.recordsWritten.toDouble, + summary.outputMetrics.recordsWritten) + assertQuantiles(_.shuffleReadMetrics.remoteBlocksFetched.toDouble, summary.shuffleReadMetrics.remoteBlocksFetched) - assertQuantiles(_.shuffleReadMetrics.localBlocksFetched, + assertQuantiles(_.shuffleReadMetrics.localBlocksFetched.toDouble, summary.shuffleReadMetrics.localBlocksFetched) - assertQuantiles(_.shuffleReadMetrics.fetchWaitTime, + assertQuantiles(_.shuffleReadMetrics.fetchWaitTime.toDouble, summary.shuffleReadMetrics.fetchWaitTime) - assertQuantiles(_.shuffleReadMetrics.remoteBytesRead, + assertQuantiles(_.shuffleReadMetrics.remoteBytesRead.toDouble, summary.shuffleReadMetrics.remoteBytesRead) - assertQuantiles(_.shuffleReadMetrics.remoteBytesReadToDisk, + assertQuantiles(_.shuffleReadMetrics.remoteBytesReadToDisk.toDouble, summary.shuffleReadMetrics.remoteBytesReadToDisk) assertQuantiles( - t => t.shuffleReadMetrics.localBytesRead + t.shuffleReadMetrics.remoteBytesRead, + t => t.shuffleReadMetrics.localBytesRead + t.shuffleReadMetrics.remoteBytesRead.toDouble, summary.shuffleReadMetrics.readBytes) assertQuantiles( - t => t.shuffleReadMetrics.localBlocksFetched + t.shuffleReadMetrics.remoteBlocksFetched, + t => t.shuffleReadMetrics.localBlocksFetched + + t.shuffleReadMetrics.remoteBlocksFetched.toDouble, summary.shuffleReadMetrics.totalBlocksFetched) - assertQuantiles(_.shuffleWriteMetrics.bytesWritten, summary.shuffleWriteMetrics.writeBytes) - 
assertQuantiles(_.shuffleWriteMetrics.writeTime, summary.shuffleWriteMetrics.writeTime) - assertQuantiles(_.shuffleWriteMetrics.recordsWritten, + assertQuantiles(_.shuffleWriteMetrics.bytesWritten.toDouble, + summary.shuffleWriteMetrics.writeBytes) + assertQuantiles(_.shuffleWriteMetrics.writeTime.toDouble, + summary.shuffleWriteMetrics.writeTime) + assertQuantiles(_.shuffleWriteMetrics.recordsWritten.toDouble, summary.shuffleWriteMetrics.writeRecords) } finally { appStore.close() diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index d7099c5c953c1..bc6fab45810eb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -87,7 +87,7 @@ object SVDPlusPlus { val gJoinT0 = g.outerJoinVertices(t0) { (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), msg: Option[(Long, Double)]) => - (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1)) + (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1.toDouble)) }.cache() materialize(gJoinT0) g.unpersist() diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala index caa2fdcdf5d2b..666790958c353 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala @@ -321,7 +321,7 @@ class PageRankSuite extends SparkFunSuite with LocalSparkContext { val rank = if (vid < source) { 0.0 } else { - a * Math.pow(1 - resetProb, vid - source) + a * Math.pow(1 - resetProb, vid.toDouble - source) } vid -> rank } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 6e26a78e9c7e6..aa39a3e177eeb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1418,7 +1418,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( case Row(label: Double, pred: Double, weight: Double) => (label, pred, weight) } - family.aic(t, deviance, numInstances, weightSum) + 2 * rank + family.aic(t, deviance, numInstances.toDouble, weightSum) + 2 * rank } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala index d7b13f1bf25f3..482bb7fdc2105 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala @@ -224,7 +224,7 @@ private[ml] object ANOVATest { // mean square within val msw = sswn / dfwn val fValue = msb / msw - val pValue = 1 - new FDistribution(dfbn, dfwn).cumulativeProbability(fValue) + val pValue = 1 - new FDistribution(dfbn.toDouble, dfwn.toDouble).cumulativeProbability(fValue) val degreesOfFreedom = dfbn + dfwn (pValue, degreesOfFreedom, fValue) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala index 89579dfcbb0c3..e2ce6cf7214f7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala @@ -135,7 +135,7 @@ 
private[ml] object FValueTest { } else Iterator.empty }.reduceByKey(_ + _ ).mapPartitions { iter => - val fd = new FDistribution(1, degreesOfFreedom) + val fd = new FDistribution(1.0, degreesOfFreedom.toDouble) iter.map { case (col, sumForCov) => // Cov(X,Y) = Sum(((Xi - Avg(X)) * ((Yi-Avg(Y))) / (N-1) val covariance = sumForCov / (numSamples - 1) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index dbcf9017f1748..234ecbc460638 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -525,7 +525,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer with Logging { updateLambda(batchResult, batchSize) logphatOption.foreach(_ /= nonEmptyDocsN.toDouble) - logphatOption.foreach(updateAlpha(_, nonEmptyDocsN)) + logphatOption.foreach(updateAlpha(_, nonEmptyDocsN.toDouble)) this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index ed6e3ea966b26..17b28ed3eba5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -106,7 +106,7 @@ class StreamingKMeansModel @Since("1.2.0") ( val numNewPoints = pointStats.iterator.map { case (_, (_, n)) => n }.sum - math.pow(decayFactor, numNewPoints) + math.pow(decayFactor, numNewPoints.toDouble) } // apply discount to weights diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 06c7754691953..79f482347289a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -91,8 +91,8 @@ class AssociationRules private[fpm] ( .map { case (antecedent, ((consequent, freqUnion), freqAntecedent)) => new Rule(antecedent.toArray, consequent.toArray, - freqUnion, - freqAntecedent, + freqUnion.toDouble, + freqAntecedent.toDouble, // the consequent contains always only one element itemSupport.get(consequent.head)) }.filter(_.confidence >= minConfidence) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 2bd4877ffc72e..37bf9d45f6646 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -633,7 +633,7 @@ class RowMatrix @Since("1.0.0") ( val gamma = if (threshold < 1e-6) { Double.PositiveInfinity } else { - 10 * math.log(numCols()) / threshold + 10 * math.log(numCols().toDouble) / threshold } val summary = Statistics.colStats(rows.map((_, 1.0)), Seq("normL2")) @@ -823,7 +823,8 @@ class RowMatrix @Since("1.0.0") ( + s"as it's bigger than maxResultSize ($maxDriverResultSizeInBytes Bytes)") val numerator = math.log(rows.getNumPartitions) - val denominator = math.log(maxDriverResultSizeInBytes) - math.log(aggregatedObjectSizeInBytes) + val denominator = math.log(maxDriverResultSizeInBytes.toDouble) - + math.log(aggregatedObjectSizeInBytes.toDouble) val desiredTreeDepth = math.ceil(numerator / denominator) if (desiredTreeDepth > 4) { diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala index aa0bf51ebcd25..28c2b5d5027ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala @@ -70,7 +70,7 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging { val output = flush() preCol = j preVal = v - startRank = rank + startRank = rank.toDouble cachedUids += uid output } else { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index ead9f887fe811..d42df3e2f0ddf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -201,7 +201,7 @@ private[spark] object ChiSqTest extends Logging { counts.foreach { case ((label, value), c) => val i = value2Index(value) val j = label2Index(label) - contingency.update(i, j, c) + contingency.update(i, j, c.toDouble) } ChiSqTest.chiSquaredMatrix(contingency, methodName) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala index 8f3d0f8b3214c..cf0fd388fa749 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala @@ -131,7 +131,7 @@ private[stat] object StudentTTest extends StreamingTestMethod with Logging { statsA: StatCounter, statsB: StatCounter): StreamingTestResult = { def studentDF(sample1: StatisticalSummaryValues, sample2: StatisticalSummaryValues): Double = - sample1.getN + sample2.getN - 2 + sample1.getN + sample2.getN - 2.0 new StreamingTestResult( tTester.get.homoscedasticTTest(statsA, statsB), diff --git a/pom.xml b/pom.xml index ac096a19804db..6ed16d88b0dc4 100644 --- a/pom.xml +++ b/pom.xml @@ -2978,7 +2978,7 @@ TODO(SPARK-33805): Undo the corresponding deprecated usage suppression rule after fixed. --> -Wconf:msg=^(?=.*?method|value|type|object|trait|inheritance)(?=.*?deprecated)(?=.*?since 2.13).+$:e - -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:s + -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:e -Wconf:cat=deprecation&msg=Auto-application to \`\(\)\` is deprecated:e 2.6 - 3.13.0 + 3.14.0 2.11.1 4.1.17 From 86971665ed4786a5d4269c9371d3e2d6751a49d2 Mon Sep 17 00:00:00 2001 From: jdesjean Date: Tue, 28 Nov 2023 08:31:53 +0900 Subject: [PATCH 25/40] [SPARK-45957][CONNECT] Avoid generating execution plan for non-executable commands ### What changes were proposed in this pull request? Remove the metric response for non executable commands (and the executedPlan generation) ### Why are the changes needed? SQL command can be of 2 types: 1) Executable (i.e. `show tables`). They are eagerly executed and return a response. The execution can generate metrics that should be returned to the user. 2) Non executable. They are lazy and are not executed. As such they should not generate metrics. We currently generate a executedPlan for both command & relations to attach the metrics. 
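As a rough illustration of the two cases (hypothetical user code assuming an existing `spark` session, not part of this patch):

```scala
// Executable command: eagerly run on the server, so execution metrics exist
// and can be attached to the SqlCommandResult.
val tables = spark.sql("SHOW TABLES")

// Non-executable relation: lazy, so nothing should be planned or run here and
// there are no metrics to report at SQL-command time.
val df = spark.sql("SELECT 1")
df.collect() // execution (and its metrics) only happens when an action runs
```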
This is a performance concern for relations as generating the optimized & physical can take some time. Furthermore, streaming SQL relations cannot generate a physical plan in the same way (i.e. they need to use read / write stream). ### Does this PR introduce _any_ user-facing change? Yes, SQL non-executable commands will no longer return a metric response. This is a backward compatible change. ### How was this patch tested? Unit ### Was this patch authored or co-authored using generative AI tooling? No Closes #43851 from jdesjean/SPARK-45957. Authored-by: jdesjean Signed-off-by: Hyukjin Kwon --- .../connect/planner/SparkConnectPlanner.scala | 21 ++++++++++++---- .../sql/connect/utils/MetricGenerator.scala | 6 +++++ .../sql/tests/streaming/test_streaming.py | 24 +++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 3ac093b5e0b42..abfc063139056 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2563,6 +2563,8 @@ class SparkConnectPlanner( // To avoid explicit handling of the result on the client, we build the expected input // of the relation on the server. The client has to simply forward the result. val result = SqlCommandResult.newBuilder() + // Only filled when isCommand + val metrics = ExecutePlanResponse.Metrics.newBuilder() if (isCommand) { // Convert the results to Arrow. val schema = df.schema @@ -2596,10 +2598,10 @@ class SparkConnectPlanner( proto.LocalRelation .newBuilder() .setData(ByteString.copyFrom(bytes)))) + metrics.addAllMetrics(MetricGenerator.transformPlan(df).asJava) } else { - // Trigger assertExecutedPlanPrepared to ensure post ReadyForExecution before finished - // executedPlan is currently called by createMetricsResponse below - df.queryExecution.assertExecutedPlanPrepared() + // No execution triggered for relations. Manually set ready + tracker.setReadyForExecution() result.setRelation( proto.Relation .newBuilder() @@ -2622,8 +2624,17 @@ class SparkConnectPlanner( .setSqlCommandResult(result) .build()) - // Send Metrics - responseObserver.onNext(MetricGenerator.createMetricsResponse(sessionHolder, df)) + // Send Metrics when isCommand (i.e. show tables) which is eagerly executed & has metrics + // Skip metrics when !isCommand (i.e. 
select 1) which is not executed & doesn't have metrics + if (isCommand) { + responseObserver.onNext( + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionHolder.sessionId) + .setServerSideSessionId(sessionHolder.serverSessionId) + .setMetrics(metrics.build) + .build) + } } private def handleRegisterUserDefinedFunction( diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala index c9bba653e8a8f..e2e4128311871 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala @@ -51,6 +51,12 @@ private[connect] object MetricGenerator extends AdaptiveSparkPlanHelper { allChildren(p).flatMap(c => transformPlan(c, p.id)) } + private[connect] def transformPlan( + rows: DataFrame): Seq[ExecutePlanResponse.Metrics.MetricObject] = { + val executedPlan = rows.queryExecution.executedPlan + transformPlan(executedPlan, executedPlan.id) + } + private def transformPlan( p: SparkPlan, parentId: Int): Seq[ExecutePlanResponse.Metrics.MetricObject] = { diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index a905a87a3b4d6..2b9072c34befe 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -382,6 +382,30 @@ def test_streaming_write_to_table(self): result = self.spark.sql("SELECT value FROM output_table").collect() self.assertTrue(len(result) > 0) + def test_streaming_with_temporary_view(self): + """ + This verifies createOrReplaceTempView() works with a streaming dataframe. An SQL + SELECT query on such a table results in a streaming dataframe and the streaming query works + as expected. + """ + with self.table("input_table", "this_query"): + self.spark.sql("CREATE TABLE input_table (value string) USING parquet") + self.spark.sql("INSERT INTO input_table VALUES ('a'), ('b'), ('c')") + df = self.spark.readStream.table("input_table") + self.assertTrue(df.isStreaming) + # Create a temp view + df.createOrReplaceTempView("test_view") + # Create a select query + view_df = self.spark.sql("SELECT CONCAT('view_', value) as vv from test_view") + self.assertTrue(view_df.isStreaming) + q = view_df.writeStream.format("memory").queryName("this_query").start() + q.processAllAvailable() + q.stop() + result = self.spark.sql("SELECT * FROM this_query ORDER BY vv").collect() + self.assertEqual( + set([Row(value="view_a"), Row(value="view_b"), Row(value="view_c")]), set(result) + ) + class StreamingTests(StreamingTestsMixin, ReusedSQLTestCase): pass From 11ac856919815f7ef2e534e205d1ed83398de136 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Tue, 28 Nov 2023 08:46:39 +0900 Subject: [PATCH 26/40] [SPARK-46103][FOLLOWUP] Keep Sphinx version consistency in spark-rm ### What changes were proposed in this pull request? The pr aims to keep Sphinx version consistency in `spark-rm`. ### Why are the changes needed? To avoid unexpected behavior in published documents. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44032 from panbingkun/SPARK-46103_FOLLOWUP. 
Authored-by: panbingkun Signed-off-by: Hyukjin Kwon --- dev/create-release/spark-rm/Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index dbb851d74a565..9cfe78570421e 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -37,12 +37,7 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. -# See also https://github.com/sphinx-doc/sphinx/issues/7551. -# We should use the latest Sphinx version once this is fixed. -# TODO(SPARK-35375): Jinja2 3.0.0+ causes error when building with Sphinx. -# See also https://issues.apache.org/jira/browse/SPARK-35375. -ARG PIP_PKGS="sphinx==3.0.4 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.8.0 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==2.11.3 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==3.0.0 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.59.3 protobuf==4.21.6 grpcio-status==1.59.3 googleapis-common-protos==1.56.4" +ARG PIP_PKGS="sphinx==4.2.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==3.0.0 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.59.3 protobuf==4.21.6 grpcio-status==1.59.3 googleapis-common-protos==1.56.4" ARG GEM_PKGS="bundler:2.3.8" # Install extra needed repos and refresh. From 2430e87ac93952ae7e296faf49734f65af29f9ed Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Tue, 28 Nov 2023 08:47:46 +0900 Subject: [PATCH 27/40] [SPARK-46115][SQL] Restrict charsets in `encode()` ### What changes were proposed in this pull request? In the PR, I propose to restrict the supported charsets in the `encode()` functions by the list from [the doc](https://spark.apache.org/docs/latest/api/sql/#encode): ``` 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' ``` and introduce the SQL config `spark.sql.legacy.javaCharsets` for restoring the previous behaviour. ### Why are the changes needed? Currently the list of supported charsets in `encode()` is not stable and fully depends on the used JDK version. So, sometimes user code might not work because a devop changed Java version in Spark cluster. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? By running new checks: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44020 from MaxGekk/restrict-charsets-in-encode-2. 
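For illustration, a short hypothetical spark-shell session showing the resulting behaviour (assuming an existing `spark` session; not taken from the patch itself):

```scala
// One of the six documented charsets: accepted with the default settings.
spark.sql("SELECT encode('hello', 'UTF-8')").show()

// Any other JDK charset now raises an invalid-charset error by default.
spark.sql("SELECT encode('hello', 'WINDOWS-1252')").show()

// Setting the new legacy config restores the old JDK-charset behaviour.
spark.conf.set("spark.sql.legacy.javaCharsets", "true")
spark.sql("SELECT encode('hello', 'WINDOWS-1252')").show()
```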
Authored-by: Max Gekk Signed-off-by: Hyukjin Kwon --- .../explain-results/function_encode.explain | 2 +- .../function_to_binary_with_format.explain | 2 +- docs/sql-migration-guide.md | 1 + .../sql/tests/pandas/test_pandas_map.py | 2 +- .../expressions/stringExpressions.scala | 25 ++++++- .../apache/spark/sql/internal/SQLConf.scala | 11 ++++ .../ansi/string-functions.sql.out | 54 +++++++++++++-- .../analyzer-results/string-functions.sql.out | 54 +++++++++++++-- .../typeCoercion/native/concat.sql.out | 18 ++--- .../typeCoercion/native/elt.sql.out | 8 +-- .../sql-tests/inputs/string-functions.sql | 6 ++ .../results/ansi/string-functions.sql.out | 66 +++++++++++++++++++ .../results/string-functions.sql.out | 66 +++++++++++++++++++ .../org/apache/spark/sql/ExplainSuite.scala | 8 +-- 14 files changed, 288 insertions(+), 35 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 56da919abf4c5..2f65436059230 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8) AS encode(g, UTF-8)#0] +Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index e9513f0103c81..b62ccccc0c15e 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8) AS to_binary(g, utf-8)#0] +Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 5c00ce6558513..664bccf26651b 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -29,6 +29,7 @@ license: | - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead. - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value. +- Since Spark 4.0, the `encode()` function supports only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. 
## Upgrading from Spark SQL 3.4 to 3.5 diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py b/python/pyspark/sql/tests/pandas/test_pandas_map.py index 304b78049b20f..ec9f208d08f9b 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_map.py @@ -110,7 +110,7 @@ def func(iterator): df = ( self.spark.range(10, numPartitions=3) .select(col("id").cast("string").alias("str")) - .withColumn("bin", encode(col("str"), "utf8")) + .withColumn("bin", encode(col("str"), "utf-8")) ) actual = df.mapInPandas(func, "str string, bin binary").collect() expected = df.collect() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 0d3239423b22c..90cfd13875d0c 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2685,18 +2685,26 @@ case class StringDecode(bin: Expression, charset: Expression) since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class Encode(value: Expression, charset: Expression) +case class Encode(value: Expression, charset: Expression, legacyCharsets: Boolean) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + def this(value: Expression, charset: Expression) = + this(value, charset, SQLConf.get.legacyJavaCharsets) + override def left: Expression = value override def right: Expression = charset override def dataType: DataType = BinaryType override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + private val supportedCharsets = Set( + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") + protected override def nullSafeEval(input1: Any, input2: Any): Any = { val toCharset = input2.asInstanceOf[UTF8String].toString try { - input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { + input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + } else throw new UnsupportedEncodingException } catch { case _: UnsupportedEncodingException => throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) @@ -2706,10 +2714,17 @@ case class Encode(value: Expression, charset: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, (string, charset) => { val toCharset = ctx.freshName("toCharset") + val sc = JavaCode.global( + ctx.addReferenceObj("supportedCharsets", supportedCharsets), + supportedCharsets.getClass) s""" String $toCharset = $charset.toString(); try { - ${ev.value} = $string.toString().getBytes($toCharset); + if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) { + ${ev.value} = $string.toString().getBytes($toCharset); + } else { + throw new java.io.UnsupportedEncodingException(); + } } catch (java.io.UnsupportedEncodingException e) { throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); }""" @@ -2720,6 +2735,10 @@ case class Encode(value: Expression, charset: Expression) newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft, charset = newRight) } +object Encode { + def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) +} + /** * Converts the input expression to a binary value 
based on the supplied format. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5133c40bc6faa..d4e5c6a3d1e04 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4584,6 +4584,15 @@ object SQLConf { .checkValue(_ > 0, "The number of stack traces in the DataFrame context must be positive.") .createWithDefault(1) + val LEGACY_JAVA_CHARSETS = buildConf("spark.sql.legacy.javaCharsets") + .internal() + .doc("When set to true, the functions like `encode()` can use charsets from JDK while " + + "encoding or decoding string values. If it is false, such functions support only one of " + + "the charsets: 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * @@ -5474,6 +5483,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def stackTracesInDataFrameContext: Int = getConf(SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT) + def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 9c210a713de3d..9d8705e3e8620 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false 
+-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 9c210a713de3d..9d8705e3e8620 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query 
+select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index 676737a4fea8e..1b19753b1f6de 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,7 +11,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x] + +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x] +- Range (0, 10, step=1, splits=None) @@ -29,7 +29,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1, splits=None) @@ -46,7 +46,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as 
string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -67,7 +67,7 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -84,7 +84,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -101,7 +101,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -122,7 +122,7 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -139,7 +139,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -156,7 +156,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), 
concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index 5a9b5ddbafa39..4d897a329cfe1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,7 +13,7 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1, splits=None) @@ -30,7 +30,7 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -51,7 +51,7 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project [elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 0fbf211ec5c5e..645f6bcb8327c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -118,6 +118,12 @@ SELECT rpad('abc', 5, x'57'); SELECT rpad(x'57', 5, 'abc'); -- encode +set spark.sql.legacy.javaCharsets=true; +select encode('hello', 'WINDOWS-1252'); +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.javaCharsets=false; +select encode('hello', 'WINDOWS-1252'); +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); select encode('hello', 'Windows-xxx'); select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 082ff03efacb3..89bb20fc1bff4 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -803,6 +803,72 @@ struct Wabca +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct +-- !query output +hello + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select encode('hello', 'Windows-xxx') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 7914092037887..6d90a50915788 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -735,6 +735,72 @@ struct Wabca +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct +-- !query output +hello + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output 
+org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select encode('hello', 'Windows-xxx') -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index 8b5ffe560a1fa..da04674b99205 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -193,8 +193,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df2, "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") val df3 = sql( """ @@ -209,8 +209,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df3, "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") } } From 753b2f23206464006c45f8d9b2747e56a09808a0 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Tue, 28 Nov 2023 08:48:41 +0900 Subject: [PATCH 28/40] [SPARK-46121][PYTHON][DOCS] Refine docstring of `concat/array_position/element_at/try_element_at` ### What changes were proposed in this pull request? This pr refine docstring of `concat/array_position/element_at/try_element_at` and add some new examples. ### Why are the changes needed? To improve PySpark documentation ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #44039 from LuciferYang/SPARK-46121. Authored-by: yangjie01 Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions/builtin.py | 241 +++++++++++++++++++++--- 1 file changed, 210 insertions(+), 31 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 8723c5fc4b9d4..d985b9e6138f5 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -12368,7 +12368,7 @@ def array_join( @_try_remote_functions def concat(*cols: "ColumnOrName") -> Column: """ - Concatenates multiple input columns together into a single column. + Collection function: Concatenates multiple input columns together into a single column. The function works with strings, numeric, binary and compatible array columns. .. 
versionadded:: 1.5.0 @@ -12392,19 +12392,61 @@ def concat(*cols: "ColumnOrName") -> Column: Examples -------- + Example 1: Concatenating string columns + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) - >>> df = df.select(concat(df.s, df.d).alias('s')) - >>> df.collect() - [Row(s='abcd123')] - >>> df - DataFrame[s: string] + >>> df.select(sf.concat(df.s, df.d)).show() + +------------+ + |concat(s, d)| + +------------+ + | abcd123| + +------------+ + + Example 2: Concatenating array columns + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) - >>> df = df.select(concat(df.a, df.b, df.c).alias("arr")) - >>> df.collect() - [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)] - >>> df - DataFrame[arr: array] + >>> df.select(sf.concat(df.a, df.b, df.c)).show() + +---------------+ + |concat(a, b, c)| + +---------------+ + |[1, 2, 3, 4, 5]| + | NULL| + +---------------+ + + Example 3: Concatenating numeric columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c']) + >>> df.select(sf.concat(df.a, df.b, df.c)).show() + +---------------+ + |concat(a, b, c)| + +---------------+ + | 123| + +---------------+ + + Example 4: Concatenating binary columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytearray(b'abc'), bytearray(b'def'))], ['a', 'b']) + >>> df.select(sf.concat(df.a, df.b)).show() + +-------------------+ + | concat(a, b)| + +-------------------+ + |[61 62 63 64 65 66]| + +-------------------+ + + Example 5: Concatenating mixed types of columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,"abc",3,"def")], ['a','b','c','d']) + >>> df.select(sf.concat(df.a, df.b, df.c, df.d)).show() + +------------------+ + |concat(a, b, c, d)| + +------------------+ + | 1abc3def| + +------------------+ """ return _invoke_function_over_seq_of_columns("concat", cols) @@ -12412,7 +12454,7 @@ def concat(*cols: "ColumnOrName") -> Column: @_try_remote_functions def array_position(col: "ColumnOrName", value: Any) -> Column: """ - Collection function: Locates the position of the first occurrence of the given value + Array function: Locates the position of the first occurrence of the given value in the given array. Returns null if either of the arguments are null. .. 
versionadded:: 2.4.0 @@ -12439,9 +12481,62 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: Examples -------- - >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data']) - >>> df.select(array_position(df.data, "a")).collect() - [Row(array_position(data, a)=3), Row(array_position(data, a)=0)] + Example 1: Finding the position of a string in an array of strings + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 3| + +-----------------------+ + + Example 2: Finding the position of a string in an empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType + >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 0| + +-----------------------+ + + Example 3: Finding the position of an integer in an array of integers + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_position(df.data, 2)).show() + +-----------------------+ + |array_position(data, 2)| + +-----------------------+ + | 2| + +-----------------------+ + + Example 4: Finding the position of a non-existing value in an array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "d")).show() + +-----------------------+ + |array_position(data, d)| + +-----------------------+ + | 0| + +-----------------------+ + + Example 5: Finding the position of a value in an array with nulls + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([None, "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 3| + +-----------------------+ """ return _invoke_function("array_position", _to_java_column(col), value) @@ -12449,10 +12544,14 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: @_try_remote_functions def element_at(col: "ColumnOrName", extraction: Any) -> Column: """ - Collection function: Returns element of array at given index in `extraction` if col is array. - Returns value for the given key in `extraction` if col is map. If position is negative - then location of the element will start from end, if number is outside the - array boundaries then None will be returned. + Collection function: + (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will + throw an error. If index < 0, accesses elements from the last to the first. + If 'spark.sql.ansi.enabled' is set to true, an exception will be thrown if the index is out + of array boundaries instead of returning NULL. + + (map, key) - Returns value for given key in `extraction` if col is map. The function always + returns NULL if the key is not contained in the map. .. 
versionadded:: 2.4.0 @@ -12481,15 +12580,49 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: Examples -------- + Example 1: Getting the first element of an array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) - >>> df.select(element_at(df.data, 1)).collect() - [Row(element_at(data, 1)='a')] - >>> df.select(element_at(df.data, -1)).collect() - [Row(element_at(data, -1)='c')] + >>> df.select(sf.element_at(df.data, 1)).show() + +-------------------+ + |element_at(data, 1)| + +-------------------+ + | a| + +-------------------+ + + Example 2: Getting the last element of an array using negative index + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.element_at(df.data, -1)).show() + +--------------------+ + |element_at(data, -1)| + +--------------------+ + | c| + +--------------------+ + + Example 3: Getting a value from a map using a key + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) + >>> df.select(sf.element_at(df.data, sf.lit("a"))).show() + +-------------------+ + |element_at(data, a)| + +-------------------+ + | 1.0| + +-------------------+ + + Example 4: Getting a non-existing value from a map using a key + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) - >>> df.select(element_at(df.data, lit("a"))).collect() - [Row(element_at(data, a)=1.0)] + >>> df.select(sf.element_at(df.data, sf.lit("c"))).show() + +-------------------+ + |element_at(data, c)| + +-------------------+ + | NULL| + +-------------------+ """ return _invoke_function_over_columns("element_at", col, lit(extraction)) @@ -12497,6 +12630,7 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: @_try_remote_functions def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: """ + Collection function: (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will throw an error. If index < 0, accesses elements from the last to the first. The function always returns NULL if the index exceeds the length of the array. 
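To make the behavior described in the docstrings above concrete (element_at raising on an out-of-range index when ANSI mode is on, try_element_at returning NULL instead), here is a minimal sketch, not part of the patch, that assumes an active SparkSession bound to `spark` and toggles `spark.sql.ansi.enabled` at runtime:

from pyspark.sql import functions as sf

df = spark.createDataFrame([(["a", "b", "c"],)], ["data"])

# try_element_at never fails on an out-of-range index; it returns NULL.
df.select(sf.try_element_at(df.data, sf.lit(4))).show()

# element_at also returns NULL while ANSI mode is off, but raises an error
# for the same index once spark.sql.ansi.enabled is set to true.
spark.conf.set("spark.sql.ansi.enabled", "true")
try:
    df.select(sf.element_at(df.data, 4)).show()
except Exception as e:
    print(type(e).__name__)
finally:
    spark.conf.set("spark.sql.ansi.enabled", "false")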
@@ -12515,15 +12649,60 @@ def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: Examples -------- + Example 1: Getting the first element of an array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) - >>> df.select(try_element_at(df.data, lit(1)).alias('r')).collect() - [Row(r='a')] - >>> df.select(try_element_at(df.data, lit(-1)).alias('r')).collect() - [Row(r='c')] + >>> df.select(sf.try_element_at(df.data, sf.lit(1))).show() + +-----------------------+ + |try_element_at(data, 1)| + +-----------------------+ + | a| + +-----------------------+ + + Example 2: Getting the last element of an array using negative index + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit(-1))).show() + +------------------------+ + |try_element_at(data, -1)| + +------------------------+ + | c| + +------------------------+ + + Example 3: Getting a value from a map using a key + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) - >>> df.select(try_element_at(df.data, lit("a")).alias('r')).collect() - [Row(r=1.0)] + >>> df.select(sf.try_element_at(df.data, sf.lit("a"))).show() + +-----------------------+ + |try_element_at(data, a)| + +-----------------------+ + | 1.0| + +-----------------------+ + + Example 4: Getting a non-existing element from an array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit(4))).show() + +-----------------------+ + |try_element_at(data, 4)| + +-----------------------+ + | NULL| + +-----------------------+ + + Example 5: Getting a non-existing value from a map using a key + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit("c"))).show() + +-----------------------+ + |try_element_at(data, c)| + +-----------------------+ + | NULL| + +-----------------------+ """ return _invoke_function_over_columns("try_element_at", col, extraction) From ec7d07c635c487fb19e04c48ebdffa7752015330 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Tue, 28 Nov 2023 10:38:29 +0900 Subject: [PATCH 29/40] [SPARK-46111][DOCS][PYTHON] Add copyright to the PySpark official documentation ### What changes were proposed in this pull request? This PR proposes to add the Apache Spark Foundation copyright notice to the bottom of the PySpark official documentation. ### Why are the changes needed? Our current documentation is missing the copyright. The addition of the copyright notice is necessary to ensure compliance with the Apache Software Foundation's requirements for project documentation. ### Does this PR introduce _any_ user-facing change? No API changes, but users will now see the Apache Spark Foundation copyright notice at the bottom of each page of the PySpark documentation as below: ## Before Screenshot 2023-11-27 at 11 49 38 AM ## After Screenshot 2023-11-27 at 11 35 35 AM ### How was this patch tested? Manually build the docs and confirm. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44026 from itholic/add_copyright. 
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/docs/source/_templates/spark_footer.html | 3 +++ python/docs/source/conf.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 python/docs/source/_templates/spark_footer.html diff --git a/python/docs/source/_templates/spark_footer.html b/python/docs/source/_templates/spark_footer.html new file mode 100644 index 0000000000000..684482b0c2cdf --- /dev/null +++ b/python/docs/source/_templates/spark_footer.html @@ -0,0 +1,3 @@ +

diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index 81083c007b346..de7ab953c5386 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -12,6 +12,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. +from datetime import datetime import sys import os import shutil @@ -124,7 +125,8 @@ # General information about the project. project = 'PySpark' -copyright = '' +# We have our custom "spark_footer.html" template, using copyright for the current year. +copyright = f"Copyright @ {datetime.now().year}" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -194,6 +196,7 @@ # further. For a list of options available for each theme, see the # documentation. html_theme_options = { + "footer_start": ["spark_footer", "sphinx-version"], "navbar_end": ["version-switcher", "theme-switcher"], "logo": { "image_light": "_static/spark-logo-light.png", From 4f59e1b663812a47ec1906b40dc59f6ed5342e50 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 10:46:28 +0900 Subject: [PATCH 30/40] [SPARK-46126][PYTHON][TESTS] Fix the doctest in pyspark.pandas.frame.DataFrame.to_dict (Python 3.12) ### What changes were proposed in this pull request? This PR proposes to fix doctest, `pyspark.pandas.frame.DataFrame.to_dict`, compatible with Python 3.12. ``` File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 2515, in pyspark.pandas.frame.DataFrame.to_dict Failed example: df.to_dict(into=OrderedDict) Expected: OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) Got: OrderedDict({'col1': OrderedDict({'row1': 1, 'row2': 2}), 'col2': OrderedDict({'row1': 0.5, 'row2': 0.75})}) ``` ### Why are the changes needed? For the proper test for Python 3.12. It is failing, see https://github.com/apache/spark/actions/runs/7006848931/job/19059702970 ### Does this PR introduce _any_ user-facing change? No. A bit of user-facing doc change but very trival. ### How was this patch tested? Fixed unittests. Manually tested via: ```bash python/run-tests --python-executable=python3 --testnames 'pyspark.pandas.frame' ... Tests passed in 721 seconds ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44042 from HyukjinKwon/SPARK-46126. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/frame.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 4ecc85ce8f795..b53f5adfbaa81 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -2512,9 +2512,8 @@ def to_dict(self, orient: str = "dict", into: Type = dict) -> Union[List, Mappin You can also specify the mapping type. >>> from collections import OrderedDict, defaultdict - >>> df.to_dict(into=OrderedDict) - OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), \ -('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + >>> df.to_dict(into=OrderedDict) # doctest: +ELLIPSIS + OrderedDict(...) 
If you want a `defaultdict`, you need to initialize it: From dbc8756bdac823be42ed10bc011415f405905497 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 28 Nov 2023 11:04:14 +0800 Subject: [PATCH 31/40] [SPARK-46006][YARN][FOLLOWUP] YarnAllocator set target executor number to 0 to cancel pending allocate request when driver stop ### What changes were proposed in this pull request? YarnAllocator set target executor number to 0 to cancel pending allocate request when driver stop Now for this issue we do: 1. AllocationFailure should not be treated as exitCausedByApp when driver is shutting down https://github.com/apache/spark/pull/38622 2. Avoid new allocation requests when sc.stop stuck https://github.com/apache/spark/pull/43906 3. Cancel pending allocation request, this pr https://github.com/apache/spark/pull/44036 ### Why are the changes needed? Avoid unnecessary allocate request ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? MT ### Was this patch authored or co-authored using generative AI tooling? No Closes #44036 from AngersZhuuuu/SPARK-46006-FOLLOWUP. Authored-by: Angerszhuuuu Signed-off-by: Kent Yao --- .../scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index f8afbc81c1211..5d24870bbcda3 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -385,7 +385,10 @@ private[yarn] class YarnAllocator( this.hostToLocalTaskCountPerResourceProfileId = hostToLocalTaskCountPerResourceProfileId if (resourceProfileToTotalExecs.isEmpty) { - targetNumExecutorsPerResourceProfileId.clear() + // Set target executor number to 0 to cancel pending allocate request. + targetNumExecutorsPerResourceProfileId.keys.foreach { rp => + targetNumExecutorsPerResourceProfileId(rp) = 0 + } allocatorNodeHealthTracker.setSchedulerExcludedNodes(excludedNodes) true } else { From 2800b5849309645657b9d308557009b31e14084e Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Tue, 28 Nov 2023 11:07:14 +0800 Subject: [PATCH 32/40] [SPARK-46116][DOCS][PYTHON] Adding "Stack Overflow" and "Mailing Lists" link into PySpark doc homepage ### What changes were proposed in this pull request? This PR proposes to enhance the PySpark documentation by adding more items for a "Useful links"including "Stack Overflow", "Dev Mailing List" and the "User Mailing List". ### Why are the changes needed? It is aimed at improving user engagement and providing quick access to community support and discussions. This approach is inspired by the [Pandas documentation](https://pandas.pydata.org/docs/index.html), which effectively uses a similar section for community engagement. The "Stack Overflow" will lead users to a curated list of StackOverflow questions tagged with `pyspark`, while the mailing lists will offer platforms for deeper discussions and insights within the Spark community. ### Does this PR introduce _any_ user-facing change? 
No API change, but the main page of the PySpark documentation will be updated to include a new "Useful links"as below: Screenshot 2023-11-27 at 5 29 19 PM ## Linked pages for each items ### Stack Overflow Screenshot 2023-11-27 at 2 59 31 PM ### Dev Mailing List Screenshot 2023-11-27 at 3 00 23 PM ### User Mailing List Screenshot 2023-11-27 at 3 00 44 PM ### How was this patch tested? Manually build doc & verify each links. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44033 from itholic/improve_useful_links. Authored-by: Haejoon Lee Signed-off-by: Kent Yao --- python/docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index b3233744c5eb1..72a846290fe9e 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -24,7 +24,7 @@ PySpark Overview **Date**: |today| **Version**: |release| **Useful links**: -|binder|_ | `GitHub `_ | `Issues `_ | |examples|_ | `Community `_ +|binder|_ | `GitHub `_ | `Issues `_ | |examples|_ | `Community `_ | `Stack Overflow `_ | `Dev Mailing List `_ | `User Mailing List `_ PySpark is the Python API for Apache Spark. It enables you to perform real-time, large-scale data processing in a distributed environment using Python. It also provides a PySpark From 158f87621570b82206178d1847d84749538baa04 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 14:40:04 +0900 Subject: [PATCH 33/40] [SPARK-46127][PYTHON][TESTS] Disable pyspark.tests.test_worker.WorkerSegfaultNonDaemonTest.test_python_segfault with Python 3.12 ### What changes were proposed in this pull request? This PR disables `pyspark.tests.test_worker.WorkerSegfaultNonDaemonTest.test_python_segfault` with Python 3.12 for now. ### Why are the changes needed? This test is flaky, and stops the tests run till the end, e.g., see https://github.com/apache/spark/actions/runs/7006848931/job/19059701743 How `faulthandler` is used is correct, as documented in the standard Python documentation. So I do believe this is a bug from Python 3.12. I will track separately in Python side. ### Does this PR introduce _any_ user-facing change? No, test-only. ### How was this patch tested? Manually: ```bash python/run-tests --python-executable=python3 --testnames 'pyspark.tests.test_worker WorkerSegfaultNonDaemonTest.test_python_segfault' ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44044 from HyukjinKwon/SPARK-46127. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/tests/test_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index 2d675811fb9bc..bab853967b9e1 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -230,6 +230,7 @@ def conf(cls): _conf.set("spark.python.worker.faulthandler.enabled", "true") return _conf + @unittest.skipIf(sys.version_info < (3, 12), "SPARK-46130: Flaky with Python 3.12") def test_python_segfault(self): try: From 984e797e02ca245b684fb18614f3378b9a559ab5 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 14:40:25 +0900 Subject: [PATCH 34/40] [SPARK-46131][PYTHON][INFRA] Install torchvision for Python 3.12 build ### What changes were proposed in this pull request? This PR adds `torchvision` into the testing image for Python 3.12. ### Why are the changes needed? To continue Python 3.12 build, and see what are failing. 
Currently it fails as below: https://github.com/apache/spark/actions/runs/7006848931/job/19059702169#step:12:4236 ``` ====================================================================== ERROR [0.001s]: test_end_to_end_run_distributedly (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorDistributedUnitTestsOnConnect.test_end_to_end_run_distributedly) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 495, in test_end_to_end_run_distributedly train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ====================================================================== ERROR [0.001s]: test_end_to_end_run_locally (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorLocalUnitTestsIIOnConnect.test_end_to_end_run_locally) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 402, in test_end_to_end_run_locally train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ====================================================================== ERROR [0.001s]: test_end_to_end_run_locally (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorLocalUnitTestsOnConnect.test_end_to_end_run_locally) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 402, in test_end_to_end_run_locally train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ---------------------------------------------------------------------- Ran 23 tests in 50.860s ``` and this pr fixes it ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually tested. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44045 from HyukjinKwon/SPARK-46131. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- dev/infra/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 10ae49b71665f..7348c6af1e059 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -138,4 +138,5 @@ RUN python3.12 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 'pandas<=2.1 RUN python3.12 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1' 'googleapis-common-protos==1.56.4' # TODO(SPARK-46078) Use official one instead of nightly build when it's ready RUN python3.12 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu +RUN python3.12 -m pip install torchvision --index-url https://download.pytorch.org/whl/cpu RUN python3.12 -m pip install torcheval From bfb08823b490c17943d75b0fa24a6838ef1e2634 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 28 Nov 2023 15:13:07 +0900 Subject: [PATCH 35/40] [SPARK-32407][SPARK-35375][INFRA][DOCS] Delete comments on `Sphinx` and `Jinja2` ### What changes were proposed in this pull request? Delete comments on `Sphinx` and `Jinja2` ### Why are the changes needed? they had been upgraded in https://github.com/apache/spark/pull/44012 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44046 from zhengruifeng/infra_Sphinx_nit. Authored-by: Ruifeng Zheng Signed-off-by: Hyukjin Kwon --- .github/workflows/build_and_test.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ccc437269bfa7..01e5458340af2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -692,10 +692,6 @@ jobs: - name: Install Python linter dependencies if: inputs.branch != 'branch-3.3' && inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' run: | - # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. - # See also https://github.com/sphinx-doc/sphinx/issues/7551. - # Jinja2 3.0.0+ causes error when building with Sphinx. - # See also https://issues.apache.org/jira/browse/SPARK-35375. python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1' python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.59.3' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - name: Python linter @@ -745,10 +741,6 @@ jobs: Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - name: Install dependencies for documentation generation run: | - # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. - # See also https://github.com/sphinx-doc/sphinx/issues/7551. - # Jinja2 3.0.0+ causes error when building with Sphinx. - # See also https://issues.apache.org/jira/browse/SPARK-35375. # Pin the MarkupSafe to 2.0.1 to resolve the CI error. # See also https://issues.apache.org/jira/browse/SPARK-38279. 
python3.9 -m pip install 'sphinx==4.2.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 'markupsafe==2.0.1' 'pyzmq<24.0.0' From 486439334702439807ad83fd4dc54884ede4f6eb Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 16:34:37 +0900 Subject: [PATCH 36/40] [SPARK-46127][TESTS][PYTHON][FOLLOW-UP] Fix skip condition from " < (3, 12)" to " > (3, 11)" ### What changes were proposed in this pull request? This PR proposes to fix the test condition from " < (3, 12)" to " > (3, 11)". ### Why are the changes needed? Incorrect condition. The test has to be skipped with Python 3.12+ ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44050 from HyukjinKwon/SPARK-46127-followup. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/tests/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index bab853967b9e1..9e8a05d18347c 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -230,7 +230,7 @@ def conf(cls): _conf.set("spark.python.worker.faulthandler.enabled", "true") return _conf - @unittest.skipIf(sys.version_info < (3, 12), "SPARK-46130: Flaky with Python 3.12") + @unittest.skipIf(sys.version_info > (3, 11), "SPARK-46130: Flaky with Python 3.12") def test_python_segfault(self): try: From 04c9583cecdf929e8ac57eb07a8c53f488c47671 Mon Sep 17 00:00:00 2001 From: Jungtaek Lim Date: Tue, 28 Nov 2023 17:05:11 +0900 Subject: [PATCH 37/40] [SPARK-45833][SS][DOCS] Document the new introduction of state data source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR proposes to add a new doc page describing the new data source, `state data source`. Worth noting that we explicitly mention the data source as experimental, so that we do not close the opportunity to improve further if it's backward incompatible. ### Why are the changes needed? The data source is an user-facing one and would be evolved over time, hence we'll need to document and publicize it. ### Does this PR introduce _any_ user-facing change? Yes, doc change. ### How was this patch tested? Built the docs directory with jekyll. Here are relevant screenshots. 스크린샷 2023-11-28 오후 5 02 47 ![structured-streaming-state-data-source-1](https://github.com/apache/spark/assets/1317309/c65cef9f-750b-4c00-a289-c705386a538b) ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43920 from HeartSaVioR/SPARK-45833. Authored-by: Jungtaek Lim Signed-off-by: Jungtaek Lim --- .../structured-streaming-programming-guide.md | 8 + .../structured-streaming-state-data-source.md | 248 ++++++++++++++++++ 2 files changed, 256 insertions(+) create mode 100644 docs/structured-streaming-state-data-source.md diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 547834c7f9e3a..33b9453a18c37 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -2452,6 +2452,14 @@ Specifically for built-in HDFS state store provider, users can check the state s it is best if cache missing count is minimized that means Spark won't waste too much time on loading checkpointed state. 
User can increase Spark locality waiting configurations to avoid loading state store providers in different executors across batches. +#### State Data Source (Experimental) + +Apache Spark provides a streaming state related data source that provides the ability to manipulate state stores in the checkpoint. Users can run the batch query with State Data Source to get the visibility of the states for existing streaming query. + +As of Spark 4.0, the data source only supports read feature. See [State Data Source Integration Guide](structured-streaming-state-data-source.html) for more details. + +NOTE: this data source is currently marked as experimental - source options and the behavior (output) might be subject to change. + ## Starting Streaming Queries Once you have defined the final result DataFrame/Dataset, all that is left is for you to start the streaming computation. To do that, you have to use the `DataStreamWriter` ([Python](api/python/reference/pyspark.ss/api/pyspark.sql.streaming.DataStreamWriter.html#pyspark.sql.streaming.DataStreamWriter)/[Scala](api/scala/org/apache/spark/sql/streaming/DataStreamWriter.html)/[Java](api/java/org/apache/spark/sql/streaming/DataStreamWriter.html) docs) diff --git a/docs/structured-streaming-state-data-source.md b/docs/structured-streaming-state-data-source.md new file mode 100644 index 0000000000000..a9353861c532c --- /dev/null +++ b/docs/structured-streaming-state-data-source.md @@ -0,0 +1,248 @@ +--- +layout: global +displayTitle: State Data Source Integration Guide +title: State Data Source Integration Guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +State data source Guide in Structured Streaming (Experimental) + +## Overview + +State data source provides functionality to manipulate the state from the checkpoint. + +As of Spark 4.0, state data source provides the read functionality with a batch query. Additional functionalities including write is on the future roadmap. + +NOTE: this data source is currently marked as experimental - source options and the behavior (output) might be subject to change. + +## Reading state key-values from the checkpoint + +State data source enables reading key-value pairs from the state store in the checkpoint, via running a separate batch query. +Users can leverage the functionality to cover two major use cases described below: + +* Construct a test checking both output and the state. It is non-trivial to deduce the key-value of the state from the output, and having visibility of the state would be a huge win on testing. +* Investigate an incident against stateful streaming query. If users observe the incorrect output and want to track how it came up, having visibility of the state would be required. 
+
+Users can read an instance of state store, which is matched to a single stateful operator in most cases. This means, users can expect that they can read the entire key-value pairs in the state for a single stateful operator.
+
+Note that there could be an exception, e.g. stream-stream join, which leverages multiple state store instances internally. The data source abstracts the internal representation away from users and
+provides a user-friendly approach to read the state. See the section for stream-stream join for more details.
+
+### Creating a State store for Batch Queries (all defaults)
+
+<div class="codetabs">
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+df = spark \
+.read \
+.format("statestore") \
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+val df = spark
+.read
+.format("statestore")
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+Dataset<Row> df = spark
+.read()
+.format("statestore")
+.load("<checkpointLocation>");
+
+{% endhighlight %}
+</div>
+
+</div>
+
+Each row in the source has the following schema:
+
+| Column | Type | Note |
+| ------ | ---- | ---- |
+| key | struct (depends on the type for state key) | |
+| value | struct (depends on the type for state value) | |
+| _partition_id | int | metadata column (hidden unless specified with SELECT) |
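+
+For illustration, a minimal sketch of inspecting and flattening the nested key/value structs; the checkpoint path below is an assumed placeholder, not part of the original examples:
+
+{% highlight scala %}
+// Hypothetical checkpoint location of a stateful streaming query.
+val stateDf = spark.read.format("statestore").load("/tmp/checkpoint")
+
+// Check the nested struct layout before querying it.
+stateDf.printSchema()
+
+// Flatten the nested key/value structs into top-level columns for easier inspection.
+stateDf.selectExpr("key.*", "value.*", "_partition_id").show(false)
+{% endhighlight %}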
+
+The nested columns for key and value heavily depend on the input schema of the stateful operator as well as the type of the operator.
+Users are encouraged to check the schema via df.schema() / df.printSchema() first to understand the type of output.
+
+The following option must be set for the source.
+
+| Option | Value | Meaning |
+| ------ | ----- | ------- |
+| path | string | Specify the root directory of the checkpoint location. You can either specify the path via option("path", `path`) or load(`path`). |
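+
+For example, a small sketch (the path is an assumed placeholder) showing that both forms point the source at the same checkpoint:
+
+{% highlight scala %}
+// Equivalent ways to specify the root directory of the checkpoint location.
+val viaLoad = spark.read.format("statestore").load("/tmp/checkpoint")
+val viaOption = spark.read.format("statestore").option("path", "/tmp/checkpoint").load()
+{% endhighlight %}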
+
+The following configurations are optional:
+
+| Option | Value | Default | Meaning |
+| ------ | ----- | ------- | ------- |
+| batchId | numeric value | latest committed batch | Represents the target batch to read from. This option is used when users want to perform time-travel. The batch should be committed but not yet cleaned up. |
+| operatorId | numeric value | 0 | Represents the target operator to read from. This option is used when the query is using multiple stateful operators. |
+| storeName | string | DEFAULT | Represents the target state store name to read from. This option is used when the stateful operator uses multiple state store instances. It is not required except for stream-stream join. |
+| joinSide | string ("left" or "right") | (none) | Represents the target side to read from. This option is used when users want to read the state from stream-stream join. |
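+
+As a hedged illustration of combining these options (the batch ID, operator ID, and checkpoint path below are assumed placeholders), a time-travel read against a specific stateful operator might look like:
+
+{% highlight scala %}
+// Read the state as of batch 5 for the stateful operator with ID 1.
+val df = spark.read
+  .format("statestore")
+  .option("batchId", 5L)     // must be a committed batch that has not been cleaned up yet
+  .option("operatorId", 1L)  // only needed when the query has multiple stateful operators
+  .load("/tmp/checkpoint")
+
+// For stream-stream join state, specify .option("joinSide", "left") or "right"
+// instead of pointing at an internal store via storeName.
+{% endhighlight %}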
+
+### Reading state for Stream-stream join
+
+Structured Streaming implements the stream-stream join feature by leveraging multiple state store instances internally.
+These instances logically compose buffers that store the input rows for the left and right sides.
+
+Since this is easier for users to reason about, the data source provides the option 'joinSide' to read the buffered input for a specific side of the join.
+To still enable reading an internal state store instance directly, we also allow specifying the option 'storeName', with the restriction that 'storeName' and 'joinSide' cannot be specified together.
+
+## State metadata source
+
+Before querying the state from an existing checkpoint via the state data source, users may want to understand the information for the checkpoint, especially about the stateful operators. This includes which operators and state store instances are available in the checkpoint, the available range of batch IDs, etc.
+
+Structured Streaming provides a data source named "State metadata source" that provides the state-related metadata information from the checkpoint.
+
+Note: The metadata is constructed when the streaming query is run with Spark 4.0+. An existing checkpoint which has only been run with a lower Spark version does not have the metadata and cannot be queried with this metadata source. Running the streaming query against the existing checkpoint with Spark 4.0+ is required to construct the metadata before querying it.
+
+### Creating a State metadata store for Batch Queries
+
+<div class="codetabs">
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+df = spark \
+.read \
+.format("state-metadata") \
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+val df = spark
+.read
+.format("state-metadata")
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+Dataset<Row> df = spark
+.read()
+.format("state-metadata")
+.load("<checkpointLocation>");
+
+{% endhighlight %}
+</div>
+
+</div>
+
+Each row in the source has the following schema:
+
+| Column | Type | Note |
+| ------ | ---- | ---- |
+| operatorId | int | |
+| operatorName | string | |
+| stateStoreName | string | |
+| numPartitions | int | |
+| minBatchId | int | The minimum batch ID available for querying state. The value could be invalid if the streaming query taking the checkpoint is running, as cleanup would run. |
+| maxBatchId | int | The maximum batch ID available for querying state. The value could be invalid if the streaming query taking the checkpoint is running, as the query will commit further batches. |
+| _numColsPrefixKey | int | metadata column (hidden unless specified with SELECT) |
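+
+For illustration, a minimal sketch (the checkpoint path is an assumed placeholder) of using the metadata output to locate which operator and state store to query:
+
+{% highlight scala %}
+// List the stateful operators and their state stores recorded in the checkpoint.
+val metadataDf = spark.read.format("state-metadata").load("/tmp/checkpoint")
+metadataDf.select("operatorId", "operatorName", "stateStoreName", "minBatchId", "maxBatchId").show()
+{% endhighlight %}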
+
+One of the major use cases of this data source is to identify the operatorId to query when the query has multiple stateful operators, e.g. stream-stream join followed by deduplication.
+The column 'operatorName' helps users identify the operatorId for the given operator.
+
+Additionally, if users want to query an internal state store instance of a stateful operator (e.g. stream-stream join), the column 'stateStoreName' is useful for determining the target.

From b5c94f1c02c66d422956260af6eba9527588ecf8 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Tue, 28 Nov 2023 16:54:52 +0800
Subject: [PATCH 38/40] [SPARK-46103][INFRA][FOLLOWUP] Unpin jinja2 in Python linter

### What changes were proposed in this pull request?

https://github.com/apache/spark/pull/44012 unpinned jinja2 in the doc build; this PR unpins it in the Python linter as well.

This PR is only for the master branch and won't affect the branch-3.x daily builds.

### Why are the changes needed?

To be consistent with dev/requirements.txt.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #44051 from zhengruifeng/infra_linter_jinja.

Authored-by: Ruifeng Zheng
Signed-off-by: Ruifeng Zheng
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 01e5458340af2..3bfd1abb48d9c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -692,7 +692,7 @@ jobs:
     - name: Install Python linter dependencies
       if: inputs.branch != 'branch-3.3' && inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
       run: |
-        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1'
+        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc jinja2 'black==23.9.1'
         python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.59.3' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
     - name: Python linter
       run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python

From f2ea75f24690e14c1cccdd469159e16e443b3418 Mon Sep 17 00:00:00 2001
From: Haejoon Lee
Date: Tue, 28 Nov 2023 18:13:52 +0900
Subject: [PATCH 39/40] [SPARK-32407][SPARK-35375][FOLLOWUP] Delete remaining comments on `Sphinx` and `Jinja2`

### What changes were proposed in this pull request?

This is a followup for https://github.com/apache/spark/pull/44046 to remove the remaining comments.

### Why are the changes needed?

We don't need those comments anymore since the issue is fixed.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #44052 from itholic/SPARK-35375.
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- docs/README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/docs/README.md b/docs/README.md index 87d68c2f86499..99ccf69dbaee5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -52,13 +52,6 @@ Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to rep To generate SQL and Python API docs, you'll need to install these libraries: - Run the following command from $SPARK_HOME: ```sh $ pip install --upgrade -r dev/requirements.txt From a6cda2302c2962072af104c5d012329b06cbf166 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 28 Nov 2023 12:53:13 +0100 Subject: [PATCH 40/40] [SPARK-45760][SQL][FOLLOWUP] Inline With inside conditional branches ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43623 to fix a regression. For `With` inside conditional branches, they may not be evaluated at all and we should not pull out the common expressions into a `Project`, but just inline. ### Why are the changes needed? avoid perf regression ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? No Closes #43978 from cloud-fan/with. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/Expression.scala | 5 + .../expressions/conditionalExpressions.scala | 19 ++- .../expressions/nullExpressions.scala | 8 ++ .../optimizer/RewriteWithExpression.scala | 119 ++++++++++++------ .../RewriteWithExpressionSuite.scala | 79 +++++++++++- 5 files changed, 185 insertions(+), 45 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 0dc70c6c3947c..2cc813bd30556 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -513,6 +513,11 @@ trait ConditionalExpression extends Expression { */ def alwaysEvaluatedInputs: Seq[Expression] + /** + * Return a copy of itself with a new `alwaysEvaluatedInputs`. + */ + def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): ConditionalExpression + /** * Return groups of branches. For each group, at least one branch will be hit at runtime, * so that we can eagerly evaluate the common expressions of a group. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 28a7db51621fd..9ee2f2bb41417 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -56,6 +56,10 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi */ override def alwaysEvaluatedInputs: Seq[Expression] = predicate :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): If = { + copy(predicate = alwaysEvaluatedInputs.head) + } + override def branchGroups: Seq[Seq[Expression]] = Seq(Seq(trueValue, falseValue)) final override val nodePatterns : Seq[TreePattern] = Seq(IF) @@ -165,8 +169,15 @@ case class CaseWhen( final override val nodePatterns : Seq[TreePattern] = Seq(CASE_WHEN) - override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = - super.legacyWithNewChildren(newChildren) + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): CaseWhen = { + if (newChildren.length % 2 == 0) { + copy(branches = newChildren.grouped(2).map { case Seq(a, b) => (a, b) }.toSeq) + } else { + copy( + branches = newChildren.dropRight(1).grouped(2).map { case Seq(a, b) => (a, b) }.toSeq, + elseValue = newChildren.lastOption) + } + } // both then and else expressions should be considered. @transient @@ -213,6 +224,10 @@ case class CaseWhen( */ override def alwaysEvaluatedInputs: Seq[Expression] = children.head :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): CaseWhen = { + withNewChildrenInternal(alwaysEvaluatedInputs.toIndexedSeq ++ children.drop(1)) + } + override def branchGroups: Seq[Seq[Expression]] = { // We look at subexpressions in conditions and values of `CaseWhen` separately. It is // because a subexpression in conditions will be run no matter which condition is matched diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 0e9e375b8acf8..4ccb369f5e2b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -70,6 +70,10 @@ case class Coalesce(children: Seq[Expression]) */ override def alwaysEvaluatedInputs: Seq[Expression] = children.head :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): Coalesce = { + withNewChildrenInternal(alwaysEvaluatedInputs.toIndexedSeq ++ children.drop(1)) + } + override def branchGroups: Seq[Seq[Expression]] = if (children.length > 1) { // If there is only one child, the first child is already covered by // `alwaysEvaluatedInputs` and we should exclude it here. 
@@ -290,6 +294,10 @@ case class NaNvl(left: Expression, right: Expression) */ override def alwaysEvaluatedInputs: Seq[Expression] = left :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): NaNvl = { + copy(left = alwaysEvaluatedInputs.head) + } + override def branchGroups: Seq[Seq[Expression]] = Seq(children) override def eval(input: InternalRow): Any = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala index c5bd71b4a7d1f..cf2c77069a195 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable -import org.apache.spark.sql.catalyst.expressions.{Alias, CommonExpressionDef, CommonExpressionRef, Expression, With} +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, WITH_EXPRESSION} @@ -35,56 +36,92 @@ object RewriteWithExpression extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { plan.transformWithPruning(_.containsPattern(WITH_EXPRESSION)) { case p if p.expressions.exists(_.containsPattern(WITH_EXPRESSION)) => - var newChildren = p.children - var newPlan: LogicalPlan = p.transformExpressionsUp { - case With(child, defs) => - val refToExpr = mutable.HashMap.empty[Long, Expression] - val childProjections = Array.fill(newChildren.size)(mutable.ArrayBuffer.empty[Alias]) + val inputPlans = p.children.toArray + var newPlan: LogicalPlan = p.mapExpressions { expr => + rewriteWithExprAndInputPlans(expr, inputPlans) + } + newPlan = newPlan.withNewChildren(inputPlans.toIndexedSeq) + if (p.output == newPlan.output) { + newPlan + } else { + Project(p.output, newPlan) + } + } + } + + private def rewriteWithExprAndInputPlans( + e: Expression, + inputPlans: Array[LogicalPlan]): Expression = { + if (!e.containsPattern(WITH_EXPRESSION)) return e + e match { + case w: With => + // Rewrite nested With expressions first + val child = rewriteWithExprAndInputPlans(w.child, inputPlans) + val defs = w.defs.map(rewriteWithExprAndInputPlans(_, inputPlans)) + val refToExpr = mutable.HashMap.empty[Long, Expression] + val childProjections = Array.fill(inputPlans.length)(mutable.ArrayBuffer.empty[Alias]) + + defs.zipWithIndex.foreach { case (CommonExpressionDef(child, id), index) => + if (child.containsPattern(COMMON_EXPR_REF)) { + throw SparkException.internalError( + "Common expression definition cannot reference other Common expression definitions") + } - defs.zipWithIndex.foreach { case (CommonExpressionDef(child, id), index) => - if (CollapseProject.isCheap(child)) { - refToExpr(id) = child - } else { - val childProjectionIndex = newChildren.indexWhere( - c => child.references.subsetOf(c.outputSet) - ) - if (childProjectionIndex == -1) { - // When we cannot rewrite the common expressions, force to inline them so that the - // query can still run. This can happen if the join condition contains `With` and - // the common expression references columns from both join sides. 
- // TODO: things can go wrong if the common expression is nondeterministic. We - // don't fix it for now to match the old buggy behavior when certain - // `RuntimeReplaceable` did not use the `With` expression. - // TODO: we should calculate the ref count and also inline the common expression - // if it's ref count is 1. - refToExpr(id) = child - } else { - val alias = Alias(child, s"_common_expr_$index")() - childProjections(childProjectionIndex) += alias - refToExpr(id) = alias.toAttribute - } - } + if (CollapseProject.isCheap(child)) { + refToExpr(id) = child + } else { + val childProjectionIndex = inputPlans.indexWhere( + c => child.references.subsetOf(c.outputSet) + ) + if (childProjectionIndex == -1) { + // When we cannot rewrite the common expressions, force to inline them so that the + // query can still run. This can happen if the join condition contains `With` and + // the common expression references columns from both join sides. + // TODO: things can go wrong if the common expression is nondeterministic. We + // don't fix it for now to match the old buggy behavior when certain + // `RuntimeReplaceable` did not use the `With` expression. + // TODO: we should calculate the ref count and also inline the common expression + // if it's ref count is 1. + refToExpr(id) = child + } else { + val alias = Alias(child, s"_common_expr_$index")() + childProjections(childProjectionIndex) += alias + refToExpr(id) = alias.toAttribute } + } + } + + for (i <- inputPlans.indices) { + val projectList = childProjections(i) + if (projectList.nonEmpty) { + inputPlans(i) = Project(inputPlans(i).output ++ projectList, inputPlans(i)) + } + } - newChildren = newChildren.zip(childProjections).map { case (child, projections) => - if (projections.nonEmpty) { - Project(child.output ++ projections, child) - } else { - child - } + child.transformWithPruning(_.containsPattern(COMMON_EXPR_REF)) { + case ref: CommonExpressionRef => + if (!refToExpr.contains(ref.id)) { + throw SparkException.internalError("Undefined common expression id " + ref.id) } + refToExpr(ref.id) + } + case c: ConditionalExpression => + val newAlwaysEvaluatedInputs = c.alwaysEvaluatedInputs.map( + rewriteWithExprAndInputPlans(_, inputPlans)) + val newExpr = c.withNewAlwaysEvaluatedInputs(newAlwaysEvaluatedInputs) + // Use transformUp to handle nested With. + newExpr.transformUpWithPruning(_.containsPattern(WITH_EXPRESSION)) { + case With(child, defs) => + // For With in the conditional branches, they may not be evaluated at all and we can't + // pull the common expressions into a project which will always be evaluated. Inline it. 
+ val refToExpr = defs.map(d => d.id -> d.child).toMap child.transformWithPruning(_.containsPattern(COMMON_EXPR_REF)) { case ref: CommonExpressionRef => refToExpr(ref.id) } } - newPlan = newPlan.withNewChildren(newChildren) - if (p.output == newPlan.output) { - newPlan - } else { - Project(p.output, newPlan) - } + case other => other.mapChildren(rewriteWithExprAndInputPlans(_, inputPlans)) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala index c625379eb5ffd..a386e9bf4efe6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, CommonExpressionDef, CommonExpressionRef, With} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Coalesce, CommonExpressionDef, CommonExpressionRef, With} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -57,7 +58,7 @@ class RewriteWithExpressionSuite extends PlanTest { ) } - test("nested WITH expression") { + test("nested WITH expression in the definition expression") { val a = testRelation.output.head val commonExprDef = CommonExpressionDef(a + a) val ref = new CommonExpressionRef(commonExprDef) @@ -85,6 +86,57 @@ class RewriteWithExpressionSuite extends PlanTest { ) } + test("nested WITH expression in the main expression") { + val a = testRelation.output.head + val commonExprDef = CommonExpressionDef(a + a) + val ref = new CommonExpressionRef(commonExprDef) + val innerExpr = With(ref + ref, Seq(commonExprDef)) + val innerCommonExprName = "_common_expr_0" + + val b = testRelation.output.last + val outerCommonExprDef = CommonExpressionDef(b + b) + val outerRef = new CommonExpressionRef(outerCommonExprDef) + val outerExpr = With(outerRef * outerRef + innerExpr, Seq(outerCommonExprDef)) + val outerCommonExprName = "_common_expr_0" + + val plan = testRelation.select(outerExpr.as("col")) + val rewrittenInnerExpr = (a + a).as(innerCommonExprName) + val rewrittenOuterExpr = (b + b).as(outerCommonExprName) + val finalExpr = rewrittenOuterExpr.toAttribute * rewrittenOuterExpr.toAttribute + + (rewrittenInnerExpr.toAttribute + rewrittenInnerExpr.toAttribute) + comparePlans( + Optimizer.execute(plan), + testRelation + .select((testRelation.output :+ rewrittenInnerExpr): _*) + .select((testRelation.output :+ rewrittenInnerExpr.toAttribute :+ rewrittenOuterExpr): _*) + .select(finalExpr.as("col")) + .analyze + ) + } + + test("correlated nested WITH expression is not supported") { + val b = testRelation.output.last + val outerCommonExprDef = CommonExpressionDef(b + b) + val outerRef = new CommonExpressionRef(outerCommonExprDef) + + val a = testRelation.output.head + // The inner expression definition references the outer expression + val commonExprDef1 = CommonExpressionDef(a + a + outerRef) + val ref1 = new CommonExpressionRef(commonExprDef1) + val innerExpr1 = With(ref1 + ref1, Seq(commonExprDef1)) + + val 
outerExpr1 = With(outerRef + innerExpr1, Seq(outerCommonExprDef)) + intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr1.as("col")))) + + val commonExprDef2 = CommonExpressionDef(a + a) + val ref2 = new CommonExpressionRef(commonExprDef2) + // The inner main expression references the outer expression + val innerExpr2 = With(ref2 + outerRef, Seq(commonExprDef1)) + + val outerExpr2 = With(outerRef + innerExpr2, Seq(outerCommonExprDef)) + intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr2.as("col")))) + } + test("WITH expression in filter") { val a = testRelation.output.head val commonExprDef = CommonExpressionDef(a + a) @@ -154,4 +206,27 @@ class RewriteWithExpressionSuite extends PlanTest { ) ) } + + test("WITH expression inside conditional expression") { + val a = testRelation.output.head + val commonExprDef = CommonExpressionDef(a + a) + val ref = new CommonExpressionRef(commonExprDef) + val expr = Coalesce(Seq(a, With(ref * ref, Seq(commonExprDef)))) + val inlinedExpr = Coalesce(Seq(a, (a + a) * (a + a))) + val plan = testRelation.select(expr.as("col")) + // With in the conditional branches is always inlined. + comparePlans(Optimizer.execute(plan), testRelation.select(inlinedExpr.as("col"))) + + val expr2 = Coalesce(Seq(With(ref * ref, Seq(commonExprDef)), a)) + val plan2 = testRelation.select(expr2.as("col")) + val commonExprName = "_common_expr_0" + // With in the always-evaluated branches can still be optimized. + comparePlans( + Optimizer.execute(plan2), + testRelation + .select((testRelation.output :+ (a + a).as(commonExprName)): _*) + .select(Coalesce(Seq(($"$commonExprName" * $"$commonExprName"), a)).as("col")) + .analyze + ) + } }