From 441e15ef5f113c7f8e89ef89074493c35006fd42 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Fri, 24 Nov 2023 01:30:59 -0800 Subject: [PATCH 01/40] [SPARK-46087][PYTHON] Sync PySpark dependencies in docs and dev requirements ### What changes were proposed in this pull request? This PR proposes to synchronize the versions of dependencies listed in the [PySpark documentation](https://spark.apache.org/docs/latest/api/python/getting_started/install.html#dependencies) with those specified in the [dev/requirements.txt](https://github.com/apache/spark/blob/master/dev/requirements.txt) file. ### Why are the changes needed? Aligning the versions of dependencies ensures that the development environment reflects the actual user environment more accurately. ### Does this PR introduce _any_ user-facing change? No API changes. ### How was this patch tested? Build the documents from latest master branch manually and sync the version of dependencies: Screenshot 2023-11-24 at 2 49 09 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44000 from itholic/req-sync. Authored-by: Haejoon Lee Signed-off-by: Dongjoon Hyun --- dev/requirements.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dev/requirements.txt b/dev/requirements.txt index 66a74471377dd..7de55ec24968a 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -1,11 +1,11 @@ # PySpark dependencies (required) -py4j +py4j>=0.10.9.7 # PySpark dependencies (optional) -numpy -pyarrow +numpy>=1.21 +pyarrow>=4.0.0 six==1.16.0 -pandas +pandas>=1.4.4 scipy plotly mlflow>=2.3.1 @@ -52,8 +52,8 @@ black==23.9.1 py # Spark Connect (required) -grpcio==1.59.3 -grpcio-status==1.59.3 +grpcio>=1.59.3 +grpcio-status>=1.59.3 protobuf==4.25.1 googleapis-common-protos>=1.56.4 From 132bb63a897f4f4049f34deefc065ed3eac6a90f Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Fri, 24 Nov 2023 19:38:31 +0900 Subject: [PATCH 02/40] [SPARK-46016][DOCS][PS] Fix pandas API support list properly ### What changes were proposed in this pull request? This PR proposes to fix a critical issue in the [Supported pandas API documentation](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/supported_pandas_api.html) where many essential APIs such as `DataFrame.max`, `DataFrame.min`, `DataFrame.mean`, `and DataFrame.median`, etc. were incorrectly marked as not implemented - marked as "N" - as below: Screenshot 2023-11-24 at 12 37 49 PM The root cause of this issue was that the script used to generate the support list excluded functions inherited from parent classes. For instance, `CategoricalIndex.max` is actually supported by inheriting the `Index` class but was not directly implemented in `CategoricalIndex`, leading to it being marked as unsupported: Screenshot 2023-11-24 at 12 30 08 PM ### Why are the changes needed? The current documentation inaccurately represents the state of supported pandas API, which could significantly hinder user experience and adoption. By correcting these inaccuracies, we ensure that the documentation reflects the true capabilities of Pandas API on Spark, providing users with reliable and accurate information. ### Does this PR introduce _any_ user-facing change? No. This PR only updates the documentation to accurately reflect the current state of supported pandas API. ### How was this patch tested? 
Manually build documentation, and check if the supported pandas API list is correctly generated as below: Screenshot 2023-11-24 at 12 36 31 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43996 from itholic/fix_supported_api_gen. Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/supported_api_gen.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index a83731db8fc16..27d5cd4b37f9d 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -138,23 +138,11 @@ def _create_supported_by_module( # module not implemented return {} - pd_funcs = dict( - [ - m - for m in getmembers(pd_module, isfunction) - if not m[0].startswith("_") and m[0] in pd_module.__dict__ - ] - ) + pd_funcs = dict([m for m in getmembers(pd_module, isfunction) if not m[0].startswith("_")]) if not pd_funcs: return {} - ps_funcs = dict( - [ - m - for m in getmembers(ps_module, isfunction) - if not m[0].startswith("_") and m[0] in ps_module.__dict__ - ] - ) + ps_funcs = dict([m for m in getmembers(ps_module, isfunction) if not m[0].startswith("_")]) return _organize_by_implementation_status( module_name, pd_funcs, ps_funcs, pd_module_group, ps_module_group From 2f6a38cfcb384b4f504e1c08264887ae90d441bc Mon Sep 17 00:00:00 2001 From: Alice Sayutina Date: Sat, 25 Nov 2023 09:52:27 +0900 Subject: [PATCH 03/40] [SPARK-45922][CONNECT][CLIENT] Minor retries refactoring (follow-up to multiple policies) ### What changes were proposed in this pull request? Follow up to https://github.com/apache/spark/pull/43591. Refactor default policy arguments into being an arguments on the class, not within core.py ### Why are the changes needed? General refactoring, also makes it easier for other policies to derive. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing coverage ### Was this patch authored or co-authored using generative AI tooling? No Closes #43800 from cdkrot/SPARK-45922. 
Authored-by: Alice Sayutina Signed-off-by: Hyukjin Kwon --- .../sql/connect/client/RetryPolicy.scala | 2 +- python/pyspark/sql/connect/client/core.py | 19 +--------- python/pyspark/sql/connect/client/retries.py | 37 ++++++++++++++++--- .../sql/tests/connect/client/test_client.py | 3 +- 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala index cb5b97f2e4aff..8c8472d780dbc 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/RetryPolicy.scala @@ -55,7 +55,7 @@ object RetryPolicy { def defaultPolicy(): RetryPolicy = RetryPolicy( name = "DefaultPolicy", // Please synchronize changes here with Python side: - // pyspark/sql/connect/client/core.py + // pyspark/sql/connect/client/retries.py // // Note: these constants are selected so that the maximum tolerated wait is guaranteed // to be at least 10 minutes diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py index 58b48bd69ba43..5d8db69c641ff 100644 --- a/python/pyspark/sql/connect/client/core.py +++ b/python/pyspark/sql/connect/client/core.py @@ -595,23 +595,8 @@ def __init__( self._user_id = None self._retry_policies: List[RetryPolicy] = [] - default_policy_args = { - # Please synchronize changes here with Scala side - # GrpcRetryHandler.scala - # - # Note: the number of retries is selected so that the maximum tolerated wait - # is guaranteed to be at least 10 minutes - "max_retries": 15, - "backoff_multiplier": 4.0, - "initial_backoff": 50, - "max_backoff": 60000, - "jitter": 500, - "min_jitter_threshold": 2000, - } - if retry_policy: - default_policy_args.update(retry_policy) - - default_policy = DefaultPolicy(**default_policy_args) + retry_policy_args = retry_policy or dict() + default_policy = DefaultPolicy(**retry_policy_args) self.set_retry_policies([default_policy]) if self._builder.session_id is None: diff --git a/python/pyspark/sql/connect/client/retries.py b/python/pyspark/sql/connect/client/retries.py index 6aa959e09b5b0..26aa6893dfae5 100644 --- a/python/pyspark/sql/connect/client/retries.py +++ b/python/pyspark/sql/connect/client/retries.py @@ -185,6 +185,9 @@ def __init__( self._done = False def can_retry(self, exception: BaseException) -> bool: + if isinstance(exception, RetryException): + return True + return any(policy.can_retry(exception) for policy in self._policies) def accept_exception(self, exception: BaseException) -> bool: @@ -204,8 +207,12 @@ def _last_exception(self) -> BaseException: def _wait(self) -> None: exception = self._last_exception() - # Attempt to find a policy to wait with + if isinstance(exception, RetryException): + # Considered immediately retriable + logger.debug(f"Got error: {repr(exception)}. 
Retrying.") + return + # Attempt to find a policy to wait with for policy in self._policies: if not policy.can_retry(exception): continue @@ -244,12 +251,34 @@ def __iter__(self) -> Generator[AttemptManager, None, None]: class RetryException(Exception): """ An exception that can be thrown upstream when inside retry and which is always retryable + even without policies """ class DefaultPolicy(RetryPolicy): - def __init__(self, **kwargs): # type: ignore[no-untyped-def] - super().__init__(**kwargs) + # Please synchronize changes here with Scala side in + # org.apache.spark.sql.connect.client.RetryPolicy + # + # Note: the number of retries is selected so that the maximum tolerated wait + # is guaranteed to be at least 10 minutes + + def __init__( + self, + max_retries: Optional[int] = 15, + backoff_multiplier: float = 4.0, + initial_backoff: int = 50, + max_backoff: Optional[int] = 60000, + jitter: int = 500, + min_jitter_threshold: int = 2000, + ): + super().__init__( + max_retries=max_retries, + backoff_multiplier=backoff_multiplier, + initial_backoff=initial_backoff, + max_backoff=max_backoff, + jitter=jitter, + min_jitter_threshold=min_jitter_threshold, + ) def can_retry(self, e: BaseException) -> bool: """ @@ -267,8 +296,6 @@ def can_retry(self, e: BaseException) -> bool: True if the exception can be retried, False otherwise. """ - if isinstance(e, RetryException): - return True if not isinstance(e, grpc.RpcError): return False diff --git a/python/pyspark/sql/tests/connect/client/test_client.py b/python/pyspark/sql/tests/connect/client/test_client.py index 580ebc3965bb5..12e690c3a3099 100644 --- a/python/pyspark/sql/tests/connect/client/test_client.py +++ b/python/pyspark/sql/tests/connect/client/test_client.py @@ -31,7 +31,6 @@ from pyspark.sql.connect.client.retries import ( Retrying, DefaultPolicy, - RetryException, RetriesExceeded, ) from pyspark.sql.connect.client.reattach import ExecutePlanResponseReattachableIterator @@ -111,7 +110,7 @@ def sleep(t): try: for attempt in Retrying(client._retry_policies, sleep=sleep): with attempt: - raise RetryException() + raise TestException("Retryable error", grpc.StatusCode.UNAVAILABLE) except RetriesExceeded: pass From 50f189b3f48aad21307d52ad0c90ff4d9ac5e06d Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Fri, 24 Nov 2023 17:26:43 -0800 Subject: [PATCH 04/40] [SPARK-46066][SQL] Use the Separators API instead of the String API to construct the `DefaultPrettyPrinter` ### What changes were proposed in this pull request? This pr use the `Separators` API instead of the `String` API to construct Jackson `DefaultPrettyPrinter` due to the `String` API has been marked as deprecated in Jackson 2.16.0: ```java /** * Constructor that specifies separator String to use between root values; * if null, no separator is printed. *

* Note: simply constructs a {link SerializedString} out of parameter, * calls {link #DefaultPrettyPrinter(SerializableString)} * * param rootSeparator String to use as root value separator * deprecated in 2.16. Use the Separators API instead. */ Deprecated public DefaultPrettyPrinter(String rootSeparator) { this((rootSeparator == null) ? null : new SerializedString(rootSeparator)); } ``` ### Why are the changes needed? Clean up deprecated Jackson API usage. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #43973 from LuciferYang/jackson-216-Deprecated. Lead-authored-by: yangjie01 Co-authored-by: YangJie Signed-off-by: Dongjoon Hyun --- .../org/apache/spark/sql/catalyst/json/JacksonGenerator.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala index e02b286061861..e01457ff10255 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonGenerator.scala @@ -75,7 +75,8 @@ class JacksonGenerator( private val gen = { val generator = new JsonFactory().createGenerator(writer).setRootValueSeparator(null) if (options.pretty) { - generator.setPrettyPrinter(new DefaultPrettyPrinter("")) + generator.setPrettyPrinter( + new DefaultPrettyPrinter(PrettyPrinter.DEFAULT_SEPARATORS.withRootSeparator(""))) } if (options.writeNonAsciiCharacterAsCodePoint) { generator.setHighestNonEscapedChar(0x7F) From a694a8a0be540e5d60d7f462e0761c4ba3b8b3e6 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 24 Nov 2023 17:31:22 -0800 Subject: [PATCH 05/40] [SPARK-46095][DOCS] Document `REST API` for Spark Standalone Cluster MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR aims to document `REST API` for Spark Standalone Cluster. ### Why are the changes needed? To help the users to understand Apache Spark features. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual review. `REST API` Section is added newly. **AFTER** Screenshot 2023-11-24 at 4 13 53 PM ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44007 from dongjoon-hyun/SPARK-46095. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- docs/spark-standalone.md | 80 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index ce739cb90b531..2ab68d2a8049f 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -518,6 +518,8 @@ Spark applications supports the following configuration properties specific to s # Launching Spark Applications +## Spark Protocol + The [`spark-submit` script](submitting-applications.html) provides the most straightforward way to submit a compiled Spark application to the cluster. For standalone clusters, Spark currently supports two deploy modes. In `client` mode, the driver is launched in the same process as the @@ -540,6 +542,84 @@ failing repeatedly, you may do so through: You can find the driver ID through the standalone Master web UI at `http://:8080`. 
+## REST API + +If `spark.master.rest.enabled` is enabled, Spark master provides additional REST API +via http://[host:port]/[version]/submissions/[action] where +host is the master host, and +port is the port number specified by `spark.master.rest.port` (default: 6066), and +version is a protocol version, v1 as of today, and +action is one of the following supported actions. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<table class="table">
+  <thead><tr><th>Command</th><th>Description</th><th>HTTP METHOD</th><th>Since Version</th></tr></thead>
+  <tr><td>create</td><td>Create a Spark driver via cluster mode.</td><td>POST</td><td>1.3.0</td></tr>
+  <tr><td>kill</td><td>Kill a single Spark driver.</td><td>POST</td><td>1.3.0</td></tr>
+  <tr><td>killall</td><td>Kill all running Spark drivers.</td><td>POST</td><td>4.0.0</td></tr>
+  <tr><td>status</td><td>Check the status of a Spark job.</td><td>GET</td><td>1.3.0</td></tr>
+  <tr><td>clear</td><td>Clear the completed drivers and applications.</td><td>POST</td><td>4.0.0</td></tr>
+</table>
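Not part of the committed doc, but for orientation: the `status` and `kill` actions in the table above take the submission ID as the final path segment of the same `/[version]/submissions/[action]` URL pattern. A minimal sketch, reusing the doc's `IP:PORT` placeholder and the submission ID that appears in the response example further below:

```bash
# Query the status of an existing submission (GET).
$ curl http://IP:PORT/v1/submissions/status/driver-20231124153531-0000

# Kill a single submission (POST).
$ curl -XPOST http://IP:PORT/v1/submissions/kill/driver-20231124153531-0000
```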
+ +The following is a curl CLI command example with the `pi.py` and REST API. + +```bash +$ curl -XPOST http://IP:PORT/v1/submissions/create \ +--header "Content-Type:application/json;charset=UTF-8" \ +--data '{ + "appResource": "", + "sparkProperties": { + "spark.master": "spark://master:7077", + "spark.app.name": "Spark Pi", + "spark.driver.memory": "1g", + "spark.driver.cores": "1", + "spark.jars": "" + }, + "clientSparkVersion": "", + "mainClass": "org.apache.spark.deploy.SparkSubmit", + "environmentVariables": { }, + "action": "CreateSubmissionRequest", + "appArgs": [ "/opt/spark/examples/src/main/python/pi.py", "10" ] +}' +``` + +The following is the response from the REST API for the above create request. + +```bash +{ + "action" : "CreateSubmissionResponse", + "message" : "Driver successfully submitted as driver-20231124153531-0000", + "serverSparkVersion" : "4.0.0", + "submissionId" : "driver-20231124153531-0000", + "success" : true +} +``` + + # Resource Scheduling The standalone cluster mode currently only supports a simple FIFO scheduler across applications. From 5211f6b140a74bd28f7e05934508bdafdbe7f237 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 24 Nov 2023 17:52:23 -0800 Subject: [PATCH 06/40] [SPARK-46085][CONNECT] Dataset.groupingSets in Scala Spark Connect client ### What changes were proposed in this pull request? This PR proposes to add `Dataset.groupingsets` API added from https://github.com/apache/spark/pull/43813 to Scala Spark Connect cleint. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? Yes, it adds a new API to Scala Spark Connect client. ### How was this patch tested? Unittest was added. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43995 from HyukjinKwon/SPARK-46085. Authored-by: Hyukjin Kwon Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/sql/Dataset.scala | 35 ++++++++++++ .../spark/sql/RelationalGroupedDataset.scala | 8 ++- .../spark/sql/PlanGenerationTestSuite.scala | 6 +++ .../explain-results/groupingSets.explain | 4 ++ .../query-tests/queries/groupingSets.json | 50 ++++++++++++++++++ .../queries/groupingSets.proto.bin | Bin 0 -> 106 bytes 6 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json create mode 100644 connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala index a1e57226e530f..d760c9d97693b 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1532,6 +1532,41 @@ class Dataset[T] private[sql] ( proto.Aggregate.GroupType.GROUP_TYPE_CUBE) } + /** + * Create multi-dimensional aggregation for the current Dataset using the specified grouping + * sets, so we can run aggregation on them. See [[RelationalGroupedDataset]] for all the + * available aggregate functions. + * + * {{{ + * // Compute the average for all numeric columns group by specific grouping sets. 
+ * ds.groupingSets(Seq(Seq($"department", $"group"), Seq()), $"department", $"group").avg() + * + * // Compute the max age and average salary, group by specific grouping sets. + * ds.groupingSets(Seq($"department", $"gender"), Seq()), $"department", $"group").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 4.0.0 + */ + @scala.annotation.varargs + def groupingSets(groupingSets: Seq[Seq[Column]], cols: Column*): RelationalGroupedDataset = { + val groupingSetMsgs = groupingSets.map { groupingSet => + val groupingSetMsg = proto.Aggregate.GroupingSets.newBuilder() + for (groupCol <- groupingSet) { + groupingSetMsg.addGroupingSet(groupCol.expr) + } + groupingSetMsg.build() + } + new RelationalGroupedDataset( + toDF(), + cols, + proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS, + groupingSets = Some(groupingSetMsgs)) + } + /** * (Scala-specific) Aggregates on the entire Dataset without groups. * {{{ diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala index 5ed97e45c7701..776a6231eaecd 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -39,7 +39,8 @@ class RelationalGroupedDataset private[sql] ( private[sql] val df: DataFrame, private[sql] val groupingExprs: Seq[Column], groupType: proto.Aggregate.GroupType, - pivot: Option[proto.Aggregate.Pivot] = None) { + pivot: Option[proto.Aggregate.Pivot] = None, + groupingSets: Option[Seq[proto.Aggregate.GroupingSets]] = None) { private[this] def toDF(aggExprs: Seq[Column]): DataFrame = { df.sparkSession.newDataFrame { builder => @@ -60,6 +61,11 @@ class RelationalGroupedDataset private[sql] ( builder.getAggregateBuilder .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_PIVOT) .setPivot(pivot.get) + case proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS => + assert(groupingSets.isDefined) + val aggBuilder = builder.getAggregateBuilder + .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_GROUPING_SETS) + groupingSets.get.foreach(aggBuilder.addGroupingSets) case g => throw new UnsupportedOperationException(g.toString) } } diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index 5cc63bc45a04a..c5c917ebfa955 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -3017,6 +3017,12 @@ class PlanGenerationTestSuite simple.groupBy(Column("id")).pivot("a").agg(functions.count(Column("b"))) } + test("groupingSets") { + simple + .groupingSets(Seq(Seq(fn.col("a")), Seq.empty[Column]), fn.col("a")) + .agg("a" -> "max", "a" -> "count") + } + test("width_bucket") { simple.select(fn.width_bucket(fn.col("b"), fn.col("b"), fn.col("b"), fn.col("a"))) } diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain new file mode 100644 index 0000000000000..1e3fe1a987ef5 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/groupingSets.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, spark_grouping_id#0L], [a#0, max(a#0) AS max(a)#0, count(a#0) AS count(a)#0L] ++- Expand [[id#0L, a#0, b#0, a#0, 0], [id#0L, a#0, b#0, null, 1]], [id#0L, a#0, b#0, a#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json new file mode 100644 index 0000000000000..6e84824ec7a3a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.json @@ -0,0 +1,50 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPING_SETS", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }], + "groupingSets": [{ + "groupingSet": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + }, { + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupingSets.proto.bin new file mode 100644 index 0000000000000000000000000000000000000000..ce0294096706ecc7e0528e34bde6f5e438ef0d37 GIT binary patch literal 106 zcmd;L5@7U7;nLt@5@3i@5>hBGDJo4avB^xaO3F;n%q!7Jv;vVyRw? Date: Sat, 25 Nov 2023 14:38:34 -0600 Subject: [PATCH 07/40] [SPARK-46100][CORE][PYTHON] Reduce stack depth by replace (string|array).size with (string|array).length ### What changes were proposed in this pull request? There are a lot of `[string|array].size` called. In fact, the size calls the underlying length, this behavior increase the stack length. We should call `[string|array].length` directly. We also get the compile waring `Replace .size with .length on arrays and strings` This PR just improve the core module. ### Why are the changes needed? Reduce stack depth by replace (string|array).size with (string|array).length ### Does this PR introduce _any_ user-facing change? 'No'. ### How was this patch tested? Exists test cases. ### Was this patch authored or co-authored using generative AI tooling? 'No'. Closes #44011 from beliefer/SPARK-46100. 
Authored-by: Jiaan Geng Signed-off-by: Sean Owen --- .../spark/api/python/PythonRunner.scala | 2 +- .../spark/deploy/master/ui/MasterPage.scala | 4 +- .../spark/executor/ExecutorMetrics.scala | 2 +- .../apache/spark/resource/ResourceUtils.scala | 2 +- .../spark/scheduler/TaskDescription.scala | 2 +- .../spark/scheduler/TaskSchedulerImpl.scala | 4 +- .../apache/spark/ui/ConsoleProgressBar.scala | 2 +- .../org/apache/spark/util/HadoopFSUtils.scala | 2 +- .../util/io/ChunkedByteBufferFileRegion.scala | 2 +- .../org/apache/spark/CheckpointSuite.scala | 16 ++-- .../org/apache/spark/DistributedSuite.scala | 16 ++-- .../scala/org/apache/spark/FileSuite.scala | 2 +- .../apache/spark/MapOutputTrackerSuite.scala | 4 +- .../org/apache/spark/PartitioningSuite.scala | 4 +- .../scala/org/apache/spark/ShuffleSuite.scala | 2 +- .../deploy/DecommissionWorkerSuite.scala | 2 +- .../spark/deploy/SparkSubmitSuite.scala | 4 +- .../StandaloneDynamicAllocationSuite.scala | 22 ++--- .../spark/deploy/client/AppClientSuite.scala | 6 +- .../history/FsHistoryProviderSuite.scala | 20 ++--- .../rest/StandaloneRestSubmitSuite.scala | 2 +- .../WholeTextFileRecordReaderSuite.scala | 4 +- .../plugin/PluginContainerSuite.scala | 4 +- .../spark/rdd/AsyncRDDActionsSuite.scala | 2 +- .../spark/rdd/LocalCheckpointSuite.scala | 2 +- .../spark/rdd/PairRDDFunctionsSuite.scala | 44 +++++----- .../org/apache/spark/rdd/PipedRDDSuite.scala | 10 +-- .../scala/org/apache/spark/rdd/RDDSuite.scala | 80 +++++++++---------- .../org/apache/spark/rdd/SortingSuite.scala | 6 +- .../spark/rdd/ZippedPartitionsSuite.scala | 4 +- .../spark/resource/ResourceProfileSuite.scala | 2 +- .../spark/resource/ResourceUtilsSuite.scala | 6 +- .../spark/scheduler/AQEShuffledRDD.scala | 2 +- .../CoarseGrainedSchedulerBackendSuite.scala | 2 +- .../spark/scheduler/DAGSchedulerSuite.scala | 32 ++++---- .../spark/scheduler/MapStatusSuite.scala | 2 +- .../OutputCommitCoordinatorSuite.scala | 8 +- .../scheduler/TaskSchedulerImplSuite.scala | 12 +-- .../spark/scheduler/TaskSetManagerSuite.scala | 4 +- .../KryoSerializerDistributedSuite.scala | 2 +- .../sort/IndexShuffleBlockResolverSuite.scala | 2 +- .../apache/spark/storage/DiskStoreSuite.scala | 2 +- .../apache/spark/util/FileAppenderSuite.scala | 4 +- .../util/collection/SizeTrackerSuite.scala | 2 +- 44 files changed, 180 insertions(+), 180 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala index d6363182606d9..e6d5a750ea325 100644 --- a/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala +++ b/core/src/main/scala/org/apache/spark/api/python/PythonRunner.scala @@ -378,7 +378,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT]( resources.foreach { case (k, v) => PythonRDD.writeUTF(k, dataOut) PythonRDD.writeUTF(v.name, dataOut) - dataOut.writeInt(v.addresses.size) + dataOut.writeInt(v.addresses.length) v.addresses.foreach { case addr => PythonRDD.writeUTF(addr, dataOut) } diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala index cb325b37958ec..b2f35984d37f8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala +++ b/core/src/main/scala/org/apache/spark/deploy/master/ui/MasterPage.scala @@ -83,13 +83,13 @@ private[ui] class MasterPage(parent: MasterWebUI) extends WebUIPage("") { .flatMap(_.iterator) .groupBy(_._1) // group by resource name .map { case 
(rName, rInfoArr) => - rName -> rInfoArr.map(_._2.addresses.size).sum + rName -> rInfoArr.map(_._2.addresses.length).sum } val usedInfo = aliveWorkers.map(_.resourcesInfoUsed) .flatMap(_.iterator) .groupBy(_._1) // group by resource name .map { case (rName, rInfoArr) => - rName -> rInfoArr.map(_._2.addresses.size).sum + rName -> rInfoArr.map(_._2.addresses.length).sum } formatResourcesUsed(totalInfo, usedInfo) } diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala index 486e59652218b..8c474e9b76c6a 100644 --- a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala +++ b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala @@ -46,7 +46,7 @@ class ExecutorMetrics private[spark] extends Serializable { private[spark] def this(metrics: Array[Long]) = { this() - Array.copy(metrics, 0, this.metrics, 0, Math.min(metrics.size, this.metrics.size)) + Array.copy(metrics, 0, this.metrics, 0, Math.min(metrics.length, this.metrics.length)) } private[spark] def this(metrics: AtomicLongArray) = { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 9080be01a9e66..00c655f4a4f4d 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -303,7 +303,7 @@ private[spark] object ResourceUtils extends Logging { allocations: Map[String, ResourceInformation], execReqs: Map[String, ExecutorResourceRequest]): Unit = { execReqs.foreach { case (rName, req) => - require(allocations.contains(rName) && allocations(rName).addresses.size >= req.amount, + require(allocations.contains(rName) && allocations(rName).addresses.length >= req.amount, s"Resource: ${rName}, with addresses: " + s"${allocations(rName).addresses.mkString(",")} " + s"is less than what the user requested: ${req.amount})") diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala index 6e6507782a49e..75032086ead72 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskDescription.scala @@ -80,7 +80,7 @@ private[spark] object TaskDescription { map.foreach { case (key, value) => dataOut.writeUTF(key) dataOut.writeUTF(value.name) - dataOut.writeInt(value.addresses.size) + dataOut.writeInt(value.addresses.length) value.addresses.foreach(dataOut.writeUTF(_)) } } diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala index 41f6b3ad64bf5..15ae2fef221d1 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala @@ -434,7 +434,7 @@ private[spark] class TaskSchedulerImpl( // addresses are the same as that we allocated in taskResourceAssignments since it's // synchronized. We don't remove the exact addresses allocated because the current // approach produces the identical result with less time complexity. 
- availableResources(i)(rName).remove(0, rInfo.addresses.size) + availableResources(i)(rName).remove(0, rInfo.addresses.length) } } } catch { @@ -752,7 +752,7 @@ private[spark] class TaskSchedulerImpl( .mkString(",") addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr)) - logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for " + + logInfo(s"Successfully scheduled all the ${addressesWithDescs.length} tasks for " + s"barrier stage ${taskSet.stageId}.") } taskSet.barrierPendingLaunchTasks.clear() diff --git a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala index dff94b4e875de..b5473e076946b 100644 --- a/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala +++ b/core/src/main/scala/org/apache/spark/ui/ConsoleProgressBar.scala @@ -74,7 +74,7 @@ private[spark] class ConsoleProgressBar(sc: SparkContext) extends Logging { * the progress bar, then progress bar will be showed in next line without overwrite logs. */ private def show(now: Long, stages: Seq[StageData]): Unit = { - val width = TerminalWidth / stages.size + val width = TerminalWidth / stages.length val bar = stages.map { s => val total = s.numTasks val header = s"[Stage ${s.stageId}:" diff --git a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala index 3245a528b74cf..4c7b12f60cc8d 100644 --- a/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala @@ -245,7 +245,7 @@ private[spark] object HadoopFSUtils extends Logging { val allLeafStatuses = { val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory) val filteredNestedFiles: Seq[FileStatus] = contextOpt match { - case Some(context) if dirs.size > parallelismThreshold => + case Some(context) if dirs.length > parallelismThreshold => parallelListLeafFilesInternal( context, dirs.map(_.getPath).toImmutableArraySeq, diff --git a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala index 23fc0f88f0b93..ec74ce0473efd 100644 --- a/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala +++ b/core/src/main/scala/org/apache/spark/util/io/ChunkedByteBufferFileRegion.scala @@ -69,7 +69,7 @@ private[io] class ChunkedByteBufferFileRegion( if (keepGoing) { // advance to the next chunk (if there are any more) currentChunkIdx += 1 - if (currentChunkIdx == chunks.size) { + if (currentChunkIdx == chunks.length) { keepGoing = false } else { currentChunk = chunks(currentChunkIdx) diff --git a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala index c425596eb0433..874f4896bb01e 100644 --- a/core/src/test/scala/org/apache/spark/CheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/CheckpointSuite.scala @@ -170,10 +170,10 @@ trait RDDCheckpointTester { self: SparkFunSuite => * upon checkpointing. Ignores the checkpointData field, which may grow when we checkpoint. 
*/ private def getSerializedSizes(rdd: RDD[_]): (Int, Int) = { - val rddSize = Utils.serialize(rdd).size - val rddCpDataSize = Utils.serialize(rdd.checkpointData).size - val rddPartitionSize = Utils.serialize(rdd.partitions).size - val rddDependenciesSize = Utils.serialize(rdd.dependencies).size + val rddSize = Utils.serialize(rdd).length + val rddCpDataSize = Utils.serialize(rdd.checkpointData).length + val rddPartitionSize = Utils.serialize(rdd.partitions).length + val rddDependenciesSize = Utils.serialize(rdd.dependencies).length // Print detailed size, helps in debugging logInfo("Serialized sizes of " + rdd + @@ -339,7 +339,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS runTest("ParallelCollectionRDD") { reliableCheckpoint: Boolean => val parCollection = sc.makeRDD(1 to 4, 2) - val numPartitions = parCollection.partitions.size + val numPartitions = parCollection.partitions.length checkpoint(parCollection, reliableCheckpoint) assert(parCollection.dependencies === Nil) val result = parCollection.collect() @@ -358,7 +358,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS val blockManager = SparkEnv.get.blockManager blockManager.putSingle(blockId, "test", StorageLevel.MEMORY_ONLY) val blockRDD = new BlockRDD[String](sc, Array(blockId)) - val numPartitions = blockRDD.partitions.size + val numPartitions = blockRDD.partitions.length checkpoint(blockRDD, reliableCheckpoint) val result = blockRDD.collect() if (reliableCheckpoint) { @@ -507,7 +507,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS runTest("CheckpointRDD with zero partitions") { reliableCheckpoint: Boolean => val rdd = new BlockRDD[Int](sc, Array.empty[BlockId]) - assert(rdd.partitions.size === 0) + assert(rdd.partitions.length === 0) assert(rdd.isCheckpointed === false) assert(rdd.isCheckpointedAndMaterialized === false) checkpoint(rdd, reliableCheckpoint) @@ -516,7 +516,7 @@ class CheckpointSuite extends SparkFunSuite with RDDCheckpointTester with LocalS assert(rdd.count() === 0) assert(rdd.isCheckpointed) assert(rdd.isCheckpointedAndMaterialized) - assert(rdd.partitions.size === 0) + assert(rdd.partitions.length === 0) } runTest("checkpointAllMarkedAncestors") { reliableCheckpoint: Boolean => diff --git a/core/src/test/scala/org/apache/spark/DistributedSuite.scala b/core/src/test/scala/org/apache/spark/DistributedSuite.scala index e156533be15ca..a2b09f0ef3c3a 100644 --- a/core/src/test/scala/org/apache/spark/DistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/DistributedSuite.scala @@ -80,7 +80,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex sc = new SparkContext(clusterUrl, "test") val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 5) val groups = pairs.groupByKey(5).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -264,8 +264,8 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex sc = new SparkContext(clusterUrl, "test") val data = sc.parallelize(Seq(true, true), 2) assert(data.count() === 2) // force executors to start - assert(data.map(markNodeIfIdentity).collect().size === 2) - assert(data.map(failOnMarkedIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) + assert(data.map(failOnMarkedIdentity).collect().length === 
2) } test("recover from repeated node failures during shuffle-map") { @@ -275,7 +275,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex for (i <- 1 to 3) { val data = sc.parallelize(Seq(true, false), 2) assert(data.count() === 2) - assert(data.map(markNodeIfIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) assert(data.map(failOnMarkedIdentity).map(x => x -> x).groupByKey().count() === 2) } } @@ -287,7 +287,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex for (i <- 1 to 3) { val data = sc.parallelize(Seq(true, true), 2) assert(data.count() === 2) - assert(data.map(markNodeIfIdentity).collect().size === 2) + assert(data.map(markNodeIfIdentity).collect().length === 2) // This relies on mergeCombiners being used to perform the actual reduce for this // test to actually be testing what it claims. val grouped = data.map(x => x -> x).combineByKey( @@ -295,7 +295,7 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex (x: Boolean, y: Boolean) => x, (x: Boolean, y: Boolean) => failOnMarkedIdentity(x) ) - assert(grouped.collect().size === 1) + assert(grouped.collect().length === 1) } } @@ -310,8 +310,8 @@ class DistributedSuite extends SparkFunSuite with Matchers with LocalSparkContex data.persist(StorageLevel.MEMORY_ONLY_2) assert(data.count() === 4) - assert(data.map(markNodeIfIdentity).collect().size === 4) - assert(data.map(failOnMarkedIdentity).collect().size === 4) + assert(data.map(markNodeIfIdentity).collect().length === 4) + assert(data.map(failOnMarkedIdentity).collect().length === 4) // Create a new replicated RDD to make sure that cached peer information doesn't cause // problems. diff --git a/core/src/test/scala/org/apache/spark/FileSuite.scala b/core/src/test/scala/org/apache/spark/FileSuite.scala index 4a2b2339159cb..7750db6020887 100644 --- a/core/src/test/scala/org/apache/spark/FileSuite.scala +++ b/core/src/test/scala/org/apache/spark/FileSuite.scala @@ -236,7 +236,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext { // Try reading the output back as an object file val ct = reflect.ClassTag[Any](Utils.classForName(className, noSparkClassLoader = true)) val output = sc.objectFile[Any](outputDir) - assert(output.collect().size === 3) + assert(output.collect().length === 3) assert(output.collect().head.getClass.getName === className) } } diff --git a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala index dde30aee82878..5d635011d2ec6 100644 --- a/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/MapOutputTrackerSuite.scala @@ -237,13 +237,13 @@ class MapOutputTrackerSuite extends SparkFunSuite with LocalSparkContext { // as it has 4 out of 7 bytes of output. val topLocs50 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.5) assert(topLocs50.nonEmpty) - assert(topLocs50.get.size === 1) + assert(topLocs50.get.length === 1) assert(topLocs50.get.head === BlockManagerId("a", "hostA", 1000)) // When the threshold is 20%, both hosts should be returned as preferred locations. 
val topLocs20 = tracker.getLocationsWithLargestOutputs(10, 0, 1, 0.2) assert(topLocs20.nonEmpty) - assert(topLocs20.get.size === 2) + assert(topLocs20.get.length === 2) assert(topLocs20.get.toSet === Seq(BlockManagerId("a", "hostA", 1000), BlockManagerId("b", "hostB", 1000)).toSet) diff --git a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala index 28fa9f5e23e79..3447ba8c1765e 100644 --- a/core/src/test/scala/org/apache/spark/PartitioningSuite.scala +++ b/core/src/test/scala/org/apache/spark/PartitioningSuite.scala @@ -77,7 +77,7 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva for (element <- 1 to 1000) { val partition = partitioner.getPartition(element) if (numPartitions > 1) { - if (partition < rangeBounds.size) { + if (partition < rangeBounds.length) { assert(element <= rangeBounds(partition)) } if (partition > 0) { @@ -111,7 +111,7 @@ class PartitioningSuite extends SparkFunSuite with SharedSparkContext with Priva assert(count === rdd.count()) sketched.foreach { case (idx, n, sample) => assert(n === idx) - assert(sample.size === math.min(n, sampleSizePerPartition)) + assert(sample.length === math.min(n, sampleSizePerPartition)) } } diff --git a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala index a92d532907adf..ac10a00d98e04 100644 --- a/core/src/test/scala/org/apache/spark/ShuffleSuite.scala +++ b/core/src/test/scala/org/apache/spark/ShuffleSuite.scala @@ -51,7 +51,7 @@ abstract class ShuffleSuite extends SparkFunSuite with Matchers with LocalRootDi sc = new SparkContext("local", "test", myConf) val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1)), 4) val groups = pairs.groupByKey(4).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 diff --git a/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala b/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala index 3b3bcff0c5a3f..20993df718a3b 100644 --- a/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/DecommissionWorkerSuite.scala @@ -439,7 +439,7 @@ class DecommissionWorkerSuite val appId = sc.applicationId eventually(timeout(1.minute), interval(1.seconds)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.getExecutorLimit === Int.MaxValue) } diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index a032e9aa16be9..553d001285b2d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -1736,7 +1736,7 @@ object SimpleApplicationTest { .map(x => SparkEnv.get.conf.get(config)) .collect() .distinct - if (executorValues.size != 1) { + if (executorValues.length != 1) { throw new SparkException(s"Inconsistent values for $config: " + s"${executorValues.mkString("values(", ", ", ")")}") } @@ -1795,7 +1795,7 @@ class TestFileSystem extends org.apache.hadoop.fs.LocalFileSystem { class TestSparkApplication extends SparkApplication with Matchers { override def start(args: Array[String], conf: SparkConf): Unit = { - 
assert(args.size === 1) + assert(args.length === 1) assert(args(0) === "hello") assert(conf.get("spark.test.hello") === "world") assert(sys.props.get("spark.test.hello") === None) diff --git a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala index 01995ca3632d2..5ecc551c16b8c 100644 --- a/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/StandaloneDynamicAllocationSuite.scala @@ -69,7 +69,7 @@ class StandaloneDynamicAllocationSuite workers = makeWorkers(10, 2048) // Wait until all workers register with master successfully eventually(timeout(1.minute), interval(10.milliseconds)) { - assert(getMasterState.workers.size === numWorkers) + assert(getMasterState.workers.length === numWorkers) } } @@ -93,7 +93,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -140,7 +140,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.executors.values.map(_.cores).toArray === Array(4, 4)) @@ -195,7 +195,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.executors.values.map(_.cores).toArray === Array(8, 8)) @@ -248,7 +248,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 10) // 20 cores total assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -302,7 +302,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 4) // 8 cores total assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -360,7 +360,7 @@ class StandaloneDynamicAllocationSuite sc.requestExecutors(2) eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === 2) @@ -385,7 +385,7 @@ class StandaloneDynamicAllocationSuite sc.requestExecutors(2) eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === 2) @@ -425,7 +425,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - 
assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) @@ -465,7 +465,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === initialExecutorLimit) assert(apps.head.getExecutorLimit === initialExecutorLimit) @@ -477,7 +477,7 @@ class StandaloneDynamicAllocationSuite val appId = sc.applicationId eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() - assert(apps.size === 1) + assert(apps.length === 1) assert(apps.head.id === appId) assert(apps.head.executors.size === 2) assert(apps.head.getExecutorLimit === Int.MaxValue) diff --git a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala index d109ed8442d44..3555faf5c2cb9 100644 --- a/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/client/AppClientSuite.scala @@ -71,7 +71,7 @@ class AppClientSuite workers = makeWorkers(10, 2048) // Wait until all workers register with master successfully eventually(timeout(1.minute), interval(10.milliseconds)) { - assert(getMasterState.workers.size === numWorkers) + assert(getMasterState.workers.length === numWorkers) } } @@ -99,7 +99,7 @@ class AppClientSuite eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() assert(ci.listener.connectedIdList.size === 1, "client listener should have one connection") - assert(apps.size === 1, "master should have 1 registered app") + assert(apps.length === 1, "master should have 1 registered app") } // Send message to Master to request Executors, verify request by change in executor limit @@ -176,7 +176,7 @@ class AppClientSuite eventually(timeout(10.seconds), interval(10.millis)) { val apps = getApplications() assert(ci.listener.connectedIdList.size === 1, "client listener should have one connection") - assert(apps.size === 1, "master should have 1 registered app") + assert(apps.length === 1, "master should have 1 registered app") } // Send message to Master to request Executors with multiple resource profiles. diff --git a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala index d16e904bdcf13..3013a5bf4a294 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/FsHistoryProviderSuite.scala @@ -1113,13 +1113,13 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === logCount) + assert(new File(testDir.toURI).listFiles().length === logCount) // Move the clock forward 1 day and scan the files again. They should still be there. clock.advance(TimeUnit.DAYS.toMillis(1)) provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === logCount) + assert(new File(testDir.toURI).listFiles().length === logCount) // Update the slow app to contain valid info. Code should detect the change and not clean // it up. 
@@ -1133,7 +1133,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P clock.advance(TimeUnit.DAYS.toMillis(2)) provider.checkForLogs() provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === validLogCount) + assert(new File(testDir.toURI).listFiles().length === validLogCount) } test("always find end event for finished apps") { @@ -1414,12 +1414,12 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() // The invalid application log file would be cleaned by checkAndCleanLog(). - assert(new File(testDir.toURI).listFiles().size === 1) + assert(new File(testDir.toURI).listFiles().length === 1) clock.advance(1) // cleanLogs() would clean the valid application log file. provider.cleanLogs() - assert(new File(testDir.toURI).listFiles().size === 0) + assert(new File(testDir.toURI).listFiles().length === 0) } private def assertOptionAfterSerde(opt: Option[Long], expected: Option[Long]): Unit = { @@ -1556,7 +1556,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) provider.checkForLogs() provider.cleanLogs() - assert(dir.listFiles().size === 1) + assert(dir.listFiles().length === 1) assert(provider.getListing().length === 1) // Manually delete the appstatus file to make an invalid rolling event log @@ -1578,7 +1578,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P provider.checkForLogs() provider.cleanLogs() assert(provider.getListing().length === 1) - assert(dir.listFiles().size === 2) + assert(dir.listFiles().length === 2) // Make sure a new provider sees the valid application provider.stop() @@ -1615,7 +1615,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P // The 1st checkForLogs should scan/update app2 only since it is newer than app1 provider.checkForLogs() assert(provider.getListing().length === 1) - assert(dir.listFiles().size === 2) + assert(dir.listFiles().length === 2) assert(provider.getListing().map(e => e.id).contains("app2")) assert(!provider.getListing().map(e => e.id).contains("app1")) @@ -1630,7 +1630,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P // The 2nd checkForLogs should scan/update app3 only since it is newer than app1 provider.checkForLogs() assert(provider.getListing().length === 2) - assert(dir.listFiles().size === 3) + assert(dir.listFiles().length === 3) assert(provider.getListing().map(e => e.id).contains("app3")) assert(!provider.getListing().map(e => e.id).contains("app1")) @@ -1655,7 +1655,7 @@ abstract class FsHistoryProviderSuite extends SparkFunSuite with Matchers with P SparkListenerJobStart(1, 0, Seq.empty)), rollFile = false) provider.checkForLogs() provider.cleanLogs() - assert(dir.listFiles().size === 1) + assert(dir.listFiles().length === 1) assert(provider.getListing().length === 1) // Manually delete event log files and create event log file reader diff --git a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala index 2f645e69079a2..abe05a8055843 100644 --- a/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/rest/StandaloneRestSubmitSuite.scala @@ -289,7 +289,7 @@ class StandaloneRestSubmitSuite extends SparkFunSuite { val statusRequestPath = 
s"$httpUrl/$v/submissions/status" val goodJson = constructSubmitRequest(masterUrl).toJson val badJson1 = goodJson.replaceAll("action", "fraction") // invalid JSON - val badJson2 = goodJson.substring(goodJson.size / 2) // malformed JSON + val badJson2 = goodJson.substring(goodJson.length / 2) // malformed JSON val notJson = "\"hello, world\"" val (response1, code1) = sendHttpRequestWithResponse(submitRequestPath, "POST") // missing JSON val (response2, code2) = sendHttpRequestWithResponse(submitRequestPath, "POST", badJson1) diff --git a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala index e64ebe2a55142..0fc0b7536067e 100644 --- a/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala +++ b/core/src/test/scala/org/apache/spark/input/WholeTextFileRecordReaderSuite.scala @@ -97,7 +97,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite { val res = sc.wholeTextFiles(dir.toString, 3).collect() - assert(res.size === WholeTextFileRecordReaderSuite.fileNames.size, + assert(res.length === WholeTextFileRecordReaderSuite.fileNames.length, "Number of files read out does not fit with the actual value.") for ((filename, contents) <- res) { @@ -120,7 +120,7 @@ class WholeTextFileRecordReaderSuite extends SparkFunSuite { val res = sc.wholeTextFiles(dir.toString, 3).collect() - assert(res.size === WholeTextFileRecordReaderSuite.fileNames.size, + assert(res.length === WholeTextFileRecordReaderSuite.fileNames.length, "Number of files read out does not fit with the actual value.") for ((filename, contents) <- res) { diff --git a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala index ef214bd50d928..95b484d7176a5 100644 --- a/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala +++ b/core/src/test/scala/org/apache/spark/internal/plugin/PluginContainerSuite.scala @@ -214,11 +214,11 @@ class PluginContainerSuite extends SparkFunSuite with LocalSparkContext { } val execFiles = children.filter(_.getName.startsWith(NonLocalModeSparkPlugin.executorFileStr)) - assert(execFiles.size === 1) + assert(execFiles.length === 1) val allLines = Files.readLines(execFiles(0), StandardCharsets.UTF_8) assert(allLines.size === 1) val addrs = NonLocalModeSparkPlugin.extractGpuAddrs(allLines.get(0)) - assert(addrs.size === 2) + assert(addrs.length === 2) assert(addrs.sorted === Array("3", "4")) assert(NonLocalModeSparkPlugin.driverContext != null) diff --git a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala index 56783de1c13b4..4239180ba6c37 100644 --- a/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/AsyncRDDActionsSuite.scala @@ -91,7 +91,7 @@ class AsyncRDDActionsSuite extends SparkFunSuite with TimeLimits { val expected = input.take(num) val saw = rdd.takeAsync(num).get() assert(saw == expected, "incorrect result for rdd with %d partitions (expected %s, saw %s)" - .format(rdd.partitions.size, expected, saw)) + .format(rdd.partitions.length, expected, saw)) } val input = Range(1, 1000) diff --git a/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala b/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala index f644fee74a18b..591b8b4c0df7e 100644 --- 
a/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/LocalCheckpointSuite.scala @@ -159,7 +159,7 @@ class LocalCheckpointSuite extends SparkFunSuite with LocalSparkContext { test("missing checkpoint block fails with informative message") { val rdd = newRdd.localCheckpoint() - val numPartitions = rdd.partitions.size + val numPartitions = rdd.partitions.length val partitionIndices = rdd.partitions.map(_.index) val bmm = sc.env.blockManager.master diff --git a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala index 9b60d2eeeed1b..e436d98843411 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PairRDDFunctionsSuite.scala @@ -41,7 +41,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val pairs = sc.parallelize(Seq((1, 1), (1, 1), (3, 2), (5, 1), (5, 3)), 2) val sets = pairs.aggregateByKey(new HashSet[Int]())(_ += _, _ ++= _).collect() - assert(sets.size === 3) + assert(sets.length === 3) val valuesFor1 = sets.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1)) val valuesFor3 = sets.find(_._1 == 3).get._2 @@ -53,7 +53,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -63,7 +63,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with duplicates") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (1, 1), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -73,7 +73,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with negative key hash codes") { val pairs = sc.parallelize(Seq((-1, 1), (-1, 2), (-1, 3), (2, 1))) val groups = pairs.groupByKey().collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesForMinus1 = groups.find(_._1 == -1).get._2 assert(valuesForMinus1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -83,7 +83,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { test("groupByKey with many output partitions") { val pairs = sc.parallelize(Seq((1, 1), (1, 2), (1, 3), (2, 1))) val groups = pairs.groupByKey(10).collect() - assert(groups.size === 2) + assert(groups.length === 2) val valuesFor1 = groups.find(_._1 == 1).get._2 assert(valuesFor1.toList.sorted === List(1, 2, 3)) val valuesFor2 = groups.find(_._1 == 2).get._2 @@ -249,7 +249,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 4) + assert(joined.length === 4) assert(joined.toSet === Set( (1, (1, 'x')), (1, (2, 'x')), @@ -262,7 +262,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with 
SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (1, 3))) val rdd2 = sc.parallelize(Seq((1, 'x'), (1, 'y'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 6) + assert(joined.length === 6) assert(joined.toSet === Set( (1, (1, 'x')), (1, (1, 'y')), @@ -277,7 +277,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.leftOuterJoin(rdd2).collect() - assert(joined.size === 5) + assert(joined.length === 5) assert(joined.toSet === Set( (1, (1, Some('x'))), (1, (2, Some('x'))), @@ -296,7 +296,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd2 = sc.emptyRDD[(Int, Int)](intPairCT) val joined = rdd1.cogroup(rdd2).collect() - assert(joined.size > 0) + assert(joined.length > 0) } // See SPARK-9326 @@ -307,7 +307,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.emptyRDD[Int](intCT).groupBy((x) => 5) val joined = rdd1.cogroup(rdd2).collect() - assert(joined.size > 0) + assert(joined.length > 0) } // See SPARK-22465 @@ -377,7 +377,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.rightOuterJoin(rdd2).collect() - assert(joined.size === 5) + assert(joined.length === 5) assert(joined.toSet === Set( (1, (Some(1), 'x')), (1, (Some(2), 'x')), @@ -391,7 +391,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.fullOuterJoin(rdd2).collect() - assert(joined.size === 6) + assert(joined.length === 6) assert(joined.toSet === Set( (1, (Some(1), Some('x'))), (1, (Some(2), Some('x'))), @@ -406,14 +406,14 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((4, 'x'), (5, 'y'), (5, 'z'), (6, 'w'))) val joined = rdd1.join(rdd2).collect() - assert(joined.size === 0) + assert(joined.length === 0) } test("join with many output partitions") { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.join(rdd2, 10).collect() - assert(joined.size === 4) + assert(joined.length === 4) assert(joined.toSet === Set( (1, (1, 'x')), (1, (2, 'x')), @@ -426,7 +426,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd1 = sc.parallelize(Seq((1, 1), (1, 2), (2, 1), (3, 1))) val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val joined = rdd1.groupWith(rdd2).collect() - assert(joined.size === 4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList))).toSet assert(joinedSet === Set( (1, (List(1, 2), List('x'))), @@ -441,7 +441,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd2 = sc.parallelize(Seq((1, 'x'), (2, 'y'), (2, 'z'), (4, 'w'))) val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) val joined = rdd1.groupWith(rdd2, rdd3).collect() - assert(joined.size === 
4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList, x._2._3.toList))).toSet assert(joinedSet === Set( @@ -458,7 +458,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val rdd3 = sc.parallelize(Seq((1, 'a'), (3, 'b'), (4, 'c'), (4, 'd'))) val rdd4 = sc.parallelize(Seq((2, '@'))) val joined = rdd1.groupWith(rdd2, rdd3, rdd4).collect() - assert(joined.size === 4) + assert(joined.length === 4) val joinedSet = joined.map(x => (x._1, (x._2._1.toList, x._2._2.toList, x._2._3.toList, x._2._4.toList))).toSet assert(joinedSet === Set( @@ -492,14 +492,14 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = a.map(a => (a, (a * 2).toString)) // then a group by, and see we didn't revert to 2 partitions val c = b.groupByKey() - assert(c.partitions.size === 2000) + assert(c.partitions.length === 2000) } test("default partitioner uses largest partitioner") { val a = sc.makeRDD(Seq((1, "a"), (2, "b")), 2) val b = sc.makeRDD(Seq((1, "a"), (2, "b")), 2000) val c = a.join(b) - assert(c.partitions.size === 2000) + assert(c.partitions.length === 2000) } test("subtract") { @@ -507,7 +507,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = sc.parallelize(Array(2, 3, 4).toImmutableArraySeq, 4) val c = a.subtract(b) assert(c.collect().toSet === Set(1)) - assert(c.partitions.size === a.partitions.size) + assert(c.partitions.length === a.partitions.length) } test("subtract with narrow dependency") { @@ -531,7 +531,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { val b = sc.parallelize(Seq((2, 20), (3, 30), (4, 40)), 4) val c = a.subtractByKey(b) assert(c.collect().toSet === Set((1, "a"), (1, "a"))) - assert(c.partitions.size === a.partitions.size) + assert(c.partitions.length === a.partitions.length) } test("subtractByKey with narrow dependency") { @@ -795,7 +795,7 @@ class PairRDDFunctionsSuite extends SparkFunSuite with SharedSparkContext { assertBinomialSample(exact = exact, actual = v.toInt, trials = trials(k).toInt, p = samplingRate) } - assert(takeSample.size === takeSample.toSet.size) + assert(takeSample.length === takeSample.toSet.size) takeSample.foreach { x => assert(1 <= x._2 && x._2 <= n, s"elements not in [1, $n]") } } diff --git a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala index 3a097e5335a2a..7f12d8b624c84 100644 --- a/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/PipedRDDSuite.scala @@ -47,7 +47,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val piped = nums.pipe(Seq("cat")) val c = piped.collect() - assert(c.size === 4) + assert(c.length === 4) assert(c(0) === "1") assert(c(1) === "2") assert(c(2) === "3") @@ -61,7 +61,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall // verify that both RDD.pipe(command: String) and RDD.pipe(command: String, env) work good for (piped <- Seq(nums.pipe("wc -l"), nums.pipe("wc -l", Map[String, String]()))) { val c = piped.collect() - assert(c.size === 2) + assert(c.length === 2) assert(c(0).trim === "2") assert(c(1).trim === "2") } @@ -129,7 +129,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val c = piped.collect() - assert(c.size === 8) + assert(c.length === 8) assert(c(0) === "0") assert(c(1) === "\u0001") assert(c(2) === "1_") @@ 
-151,7 +151,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall f(e + "_") } }).collect() - assert(d.size === 8) + assert(d.length === 8) assert(d(0) === "0") assert(d(1) === "\u0001") assert(d(2) === "b\t2_") @@ -216,7 +216,7 @@ class PipedRDDSuite extends SparkFunSuite with SharedSparkContext with Eventuall val nums = sc.makeRDD(Array(1, 2, 3, 4).toImmutableArraySeq, 2) val piped = nums.pipe(Seq("cat"), separateWorkingDir = true) val c = piped.collect() - assert(c.size === 4) + assert(c.length === 4) assert(c(0) === "1") assert(c(1) === "2") assert(c(2) === "3") diff --git a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala index 32ba2053258eb..706ebfa936470 100644 --- a/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala @@ -322,7 +322,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { test("empty RDD") { val empty = new EmptyRDD[Int](sc) assert(empty.count() === 0) - assert(empty.collect().size === 0) + assert(empty.collect().length === 0) val thrown = intercept[UnsupportedOperationException]{ empty.reduce(_ + _) @@ -331,12 +331,12 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val emptyKv = new EmptyRDD[(Int, Int)](sc) val rdd = sc.parallelize(1 to 2, 2).map(x => (x, x)) - assert(rdd.join(emptyKv).collect().size === 0) - assert(rdd.rightOuterJoin(emptyKv).collect().size === 0) - assert(rdd.leftOuterJoin(emptyKv).collect().size === 2) - assert(rdd.fullOuterJoin(emptyKv).collect().size === 2) - assert(rdd.cogroup(emptyKv).collect().size === 2) - assert(rdd.union(emptyKv).collect().size === 2) + assert(rdd.join(emptyKv).collect().length === 0) + assert(rdd.rightOuterJoin(emptyKv).collect().length === 0) + assert(rdd.leftOuterJoin(emptyKv).collect().length === 2) + assert(rdd.fullOuterJoin(emptyKv).collect().length === 2) + assert(rdd.cogroup(emptyKv).collect().length === 2) + assert(rdd.union(emptyKv).collect().length === 2) } test("repartitioned RDDs") { @@ -348,7 +348,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // Coalesce partitions val repartitioned1 = data.repartition(2) - assert(repartitioned1.partitions.size == 2) + assert(repartitioned1.partitions.length == 2) val partitions1 = repartitioned1.glom().collect() assert(partitions1(0).length > 0) assert(partitions1(1).length > 0) @@ -356,7 +356,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // Split partitions val repartitioned2 = data.repartition(20) - assert(repartitioned2.partitions.size == 20) + assert(repartitioned2.partitions.length == 20) val partitions2 = repartitioned2.glom().collect() assert(partitions2(0).length > 0) assert(partitions2(19).length > 0) @@ -370,7 +370,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(input.toImmutableArraySeq, initialPartitions) val repartitioned1 = data.repartition(2) - assert(repartitioned1.partitions.size == 2) + assert(repartitioned1.partitions.length == 2) val partitions1 = repartitioned1.glom().collect() // some noise in balancing is allowed due to randomization assert(math.abs(partitions1(0).length - 500) < initialPartitions) @@ -380,7 +380,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { def testSplitPartitions(input: Seq[Int], initialPartitions: Int, finalPartitions: Int): Unit = { val 
data = sc.parallelize(input, initialPartitions) val repartitioned = data.repartition(finalPartitions) - assert(repartitioned.partitions.size === finalPartitions) + assert(repartitioned.partitions.length === finalPartitions) val partitions = repartitioned.glom().collect() // assert all elements are present assert(repartitioned.collect().sortWith(_ > _).toSeq === input.toSeq.sortWith(_ > _).toSeq) @@ -441,7 +441,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { // when shuffling, we can increase the number of partitions val coalesced6 = data.coalesce(20, shuffle = true) - assert(coalesced6.partitions.size === 20) + assert(coalesced6.partitions.length === 20) assert(coalesced6.collect().toSet === (1 to 10).toSet) } @@ -564,13 +564,13 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val coalesced2 = data2.coalesce(partitions) // test that we have 10000 partitions - assert(coalesced2.partitions.size == 10000, "Expected 10000 partitions, but got " + - coalesced2.partitions.size) + assert(coalesced2.partitions.length == 10000, "Expected 10000 partitions, but got " + + coalesced2.partitions.length) // test that we have 100 partitions val coalesced3 = data2.coalesce(numMachines * 2) - assert(coalesced3.partitions.size == 100, "Expected 100 partitions, but got " + - coalesced3.partitions.size) + assert(coalesced3.partitions.length == 100, "Expected 100 partitions, but got " + + coalesced3.partitions.length) // test that the groups are load balanced with 100 +/- 20 elements in each val maxImbalance3 = coalesced3.partitions @@ -613,9 +613,9 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(1 to 10, 10) // Note that split number starts from 0, so > 8 means only 10th partition left. 
val prunedRdd = new PartitionPruningRDD(data, splitNum => splitNum > 8) - assert(prunedRdd.partitions.size === 1) + assert(prunedRdd.partitions.length === 1) val prunedData = prunedRdd.collect() - assert(prunedData.size === 1) + assert(prunedData.length === 1) assert(prunedData(0) === 10) } @@ -626,7 +626,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { test("take") { var nums = sc.makeRDD(Range(1, 1000), 1) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -635,7 +635,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 2) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -644,7 +644,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 100) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -653,7 +653,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.makeRDD(Range(1, 1000), 1000) - assert(nums.take(0).size === 0) + assert(nums.take(0).length === 0) assert(nums.take(1) === Array(1)) assert(nums.take(3) === Array(1, 2, 3)) assert(nums.take(500) === (1 to 500).toArray) @@ -662,7 +662,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { assert(nums.take(1000) === (1 to 999).toArray) nums = sc.parallelize(1 to 2, 2) - assert(nums.take(2147483638).size === 2) + assert(nums.take(2147483638).length === 2) assert(nums.takeAsync(2147483638).get().size === 2) } @@ -670,7 +670,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Seq.range(1, 100000) val ints = sc.makeRDD(scala.util.Random.shuffle(nums), 2) val topK = ints.top(5) - assert(topK.size === 5) + assert(topK.length === 5) assert(topK === nums.reverse.take(5)) } @@ -679,7 +679,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { implicit val ord = implicitly[Ordering[String]].reverse val rdd = sc.makeRDD(words, 2) val topK = rdd.top(2) - assert(topK.size === 2) + assert(topK.length === 2) assert(topK.sorted === Array("b", "a")) } @@ -687,7 +687,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedLowerK = rdd.takeOrdered(5) - assert(sortedLowerK.size === 5) + assert(sortedLowerK.length === 5) assert(sortedLowerK === Array(1, 2, 3, 4, 5)) } @@ -695,7 +695,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val nums = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10) val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedLowerK = rdd.takeOrdered(0) - assert(sortedLowerK.size === 0) + assert(sortedLowerK.length === 0) } test("SPARK-40276: takeOrdered with empty RDDs") { @@ -708,7 +708,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { implicit val ord = implicitly[Ordering[Int]].reverse 
val rdd = sc.makeRDD(nums.toImmutableArraySeq, 2) val sortedTopK = rdd.takeOrdered(5) - assert(sortedTopK.size === 5) + assert(sortedTopK.length === 5) assert(sortedTopK === Array(10, 9, 8, 7, 6)) assert(sortedTopK === nums.sorted(ord).take(5)) } @@ -736,48 +736,48 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { for (num <- List(5, 20, 100)) { val sample = data.takeSample(withReplacement = false, num = num) - assert(sample.size === num) // Got exactly num elements + assert(sample.length === num) // Got exactly num elements assert(sample.toSet.size === num) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = false, 20, seed) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.toSet.size === 20) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = false, 100, seed) - assert(sample.size === 100) // Got only 100 elements + assert(sample.length === 100) // Got only 100 elements assert(sample.toSet.size === 100) // Elements are distinct assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, 20, seed) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } { val sample = data.takeSample(withReplacement = true, num = 20) - assert(sample.size === 20) // Got exactly 20 elements + assert(sample.length === 20) // Got exactly 20 elements assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } { val sample = data.takeSample(withReplacement = true, num = n) - assert(sample.size === n) // Got exactly n elements + assert(sample.length === n) // Got exactly n elements // Chance of getting all distinct elements is astronomically low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") assert(sample.forall(x => 1 <= x && x <= n), s"elements not in [1, $n]") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, n, seed) - assert(sample.size === n) // Got exactly n elements + assert(sample.length === n) // Got exactly n elements // Chance of getting all distinct elements is astronomically low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") } for (seed <- 1 to 5) { val sample = data.takeSample(withReplacement = true, 2 * n, seed) - assert(sample.size === 2 * n) // Got exactly 2 * n elements + assert(sample.length === 2 * n) // Got exactly 2 * n elements // Chance of getting all distinct elements is still quite low, so test we got < n assert(sample.toSet.size < n, "sampling with replacement returned all distinct elements") } @@ -794,7 +794,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { val data = sc.parallelize(1 to n, 2) for(seed <- 1 to 5) { val splits = data.randomSplit(Array(1.0, 2.0, 3.0), seed) - assert(splits.size == 3, "wrong number of splits") + assert(splits.length == 3, "wrong number of splits") assert(splits.flatMap(_.collect()).sorted.toList == data.collect().toList, "incomplete or wrong split") val s = splits.map(_.count()) @@ -1179,7 
+1179,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { sc.hadoopFile(outDir, classOf[TextInputFormat], classOf[LongWritable], classOf[Text]) val coalescedHadoopRDD = hadoopRDD.coalesce(2, partitionCoalescer = Option(new SizeBasedCoalescer(maxSplitSize))) - assert(coalescedHadoopRDD.partitions.size <= 10) + assert(coalescedHadoopRDD.partitions.length <= 10) var totalPartitionCount = 0L coalescedHadoopRDD.partitions.foreach(partition => { var splitSizeSum = 0L @@ -1256,7 +1256,7 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext with Eventually { .map(coalescedRDD.getPreferredLocations(_).head) .groupBy(identity) .view - .mapValues(_.size) + .mapValues(_.length) // Make sure the coalesced partitions are distributed fairly evenly between the two locations. // This should not become flaky since the DefaultPartitionsCoalescer uses a fixed seed. @@ -1357,7 +1357,7 @@ class SizeBasedCoalescer(val maxSize: Int) extends PartitionCoalescer with Seria totalSum += splitSize } - while (index < partitions.size) { + while (index < partitions.length) { val partition = partitions(index) val fileSplit = partition.asInstanceOf[HadoopPartition].inputSplit.value.asInstanceOf[FileSplit] diff --git a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala index 802889b047796..5771e99b64c69 100644 --- a/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/SortingSuite.scala @@ -35,7 +35,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey() - assert(sorted.partitions.size === 2) + assert(sorted.partitions.length === 2) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -44,7 +44,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey(true, 1) - assert(sorted.partitions.size === 1) + assert(sorted.partitions.length === 1) assert(sorted.collect() === pairArr.sortBy(_._1)) } @@ -53,7 +53,7 @@ class SortingSuite extends SparkFunSuite with SharedSparkContext with Matchers { val pairArr = Array.fill(1000) { (rand.nextInt(), rand.nextInt()) } val pairs = sc.parallelize(pairArr.toImmutableArraySeq, 2) val sorted = pairs.sortByKey(true, 20) - assert(sorted.partitions.size === 20) + assert(sorted.partitions.length === 20) assert(sorted.collect() === pairArr.sortBy(_._1)) } diff --git a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala index 7079b9ea8eadc..c04719eb9ea6f 100644 --- a/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala +++ b/core/src/test/scala/org/apache/spark/rdd/ZippedPartitionsSuite.scala @@ -21,7 +21,7 @@ import org.apache.spark.{SharedSparkContext, SparkFunSuite} object ZippedPartitionsSuite { def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = { - Iterator(i.toArray.size, s.toArray.size, d.toArray.size) + Iterator(i.toArray.length, s.toArray.length, d.toArray.length) } } @@ -35,7 +35,7 @@ class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext { val obtainedSizes = zippedRDD.collect() val 
expectedSizes = Array(2, 3, 1, 2, 3, 1) - assert(obtainedSizes.size == 6) + assert(obtainedSizes.length == 6) assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2)) } } diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala index fd7018f189e26..be38315cd75fe 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceProfileSuite.scala @@ -374,7 +374,7 @@ class ResourceProfileSuite extends SparkFunSuite with MockitoSugar { rprof.require(eReq) // Update this if new resource type added - assert(ResourceProfile.allSupportedExecutorResources.size === 5, + assert(ResourceProfile.allSupportedExecutorResources.length === 5, "Executor resources should have 5 supported resources") assert(rprof.build().getCustomExecutorResources().size === 1, "Executor resources should have 1 custom resource") diff --git a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala index 1ab9f7c5d2b0c..20d6cc7671582 100644 --- a/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/resource/ResourceUtilsSuite.scala @@ -101,13 +101,13 @@ class ResourceUtilsSuite extends SparkFunSuite val gpuValue = resources.get(GPU) assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") - assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") + assert(gpuValue.get.addresses.length == 2, "Should have 2 indexes") assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") val fpgaValue = resources.get(FPGA) assert(fpgaValue.nonEmpty, "Should have a gpu entry") assert(fpgaValue.get.name == "fpga", "name should be fpga") - assert(fpgaValue.get.addresses.size == 3, "Should have 3 indexes") + assert(fpgaValue.get.addresses.length == 3, "Should have 3 indexes") assert(fpgaValue.get.addresses.sameElements(Array("f1", "f2", "f3")), "should have f1,f2,f3 entries") } @@ -231,7 +231,7 @@ class ResourceUtilsSuite extends SparkFunSuite val gpuValue = resources.get(GPU) assert(gpuValue.nonEmpty, "Should have a gpu entry") assert(gpuValue.get.name == "gpu", "name should be gpu") - assert(gpuValue.get.addresses.size == 2, "Should have 2 indexes") + assert(gpuValue.get.addresses.length == 2, "Should have 2 indexes") assert(gpuValue.get.addresses.sameElements(Array("0", "1")), "should have 0,1 entries") } } diff --git a/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala b/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala index 3f8eaede6e799..84f9ef0d557e6 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/AQEShuffledRDD.scala @@ -48,7 +48,7 @@ class CoalescedPartitioner(val parent: Partitioner, val partitionStartIndices: A result } - override def numPartitions: Int = partitionStartIndices.size + override def numPartitions: Int = partitionStartIndices.length override def getPartition(key: Any): Int = { parentPartitionMapping(parent.getPartition(key)) diff --git a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala index bf5e9d96cd80e..e9b8ae4bffe6d 100644 --- 
a/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/CoarseGrainedSchedulerBackendSuite.scala @@ -62,7 +62,7 @@ class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkCo } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() - assert(smaller.size === 4) + assert(smaller.length === 4) } test("compute max number of concurrent tasks can be launched") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala index 0f7146bc7c150..c55f627075e8f 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/DAGSchedulerSuite.scala @@ -462,9 +462,9 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti /** Send the given CompletionEvent messages for the tasks in the TaskSet. */ private def complete(taskSet: TaskSet, taskEndInfos: Seq[(TaskEndReason, Any)]): Unit = { - assert(taskSet.tasks.size >= taskEndInfos.size) + assert(taskSet.tasks.length >= taskEndInfos.size) for ((result, i) <- taskEndInfos.zipWithIndex) { - if (i < taskSet.tasks.size) { + if (i < taskSet.tasks.length) { runEvent(makeCompletionEvent(taskSet.tasks(i), result._1, result._2)) } } @@ -474,9 +474,9 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti accumId: Long, taskSet: TaskSet, results: Seq[(TaskEndReason, Any)]): Unit = { - assert(taskSet.tasks.size >= results.size) + assert(taskSet.tasks.length >= results.size) for ((result, i) <- results.zipWithIndex) { - if (i < taskSet.tasks.size) { + if (i < taskSet.tasks.length) { runEvent(makeCompletionEvent( taskSet.tasks(i), result._1, @@ -1671,21 +1671,21 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 0) // should work because it's a non-failed host (so the available map outputs will increase) runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostB", reduceRdd.partitions.size))) + makeMapStatus("hostB", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 1) // should be ignored for being too old runEvent(makeCompletionEvent( taskSet.tasks(0), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 1) // should work because it's a new epoch, which will increase the number of available map @@ -1694,7 +1694,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti runEvent(makeCompletionEvent( taskSet.tasks(1), Success, - makeMapStatus("hostA", reduceRdd.partitions.size))) + makeMapStatus("hostA", reduceRdd.partitions.length))) assert(shuffleStage.numAvailableOutputs === 2) assert(mapOutputTracker.getMapSizesByExecutorId(shuffleId, 0).map(_._1).toSet === HashSet(makeBlockManagerId("hostB"), makeBlockManagerId("hostA"))) @@ -2081,7 +2081,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // stage complete), but the tasks that ran on HostA need to be re-run, so the DAGScheduler // should re-submit the stage with 
one task (the task that originally ran on HostA). assert(taskSets.size === 2) - assert(taskSets(1).tasks.size === 1) + assert(taskSets(1).tasks.length === 1) // Make sure that the stage that was re-submitted was the ShuffleMapStage (not the reduce // stage, which shouldn't be run until all of the tasks in the ShuffleMapStage complete on @@ -2735,7 +2735,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // Now complete tasks in the second task set val newTaskSet = taskSets(1) // 2 tasks should have been re-submitted, for tasks 0 and 1 (which ran on hostA). - assert(newTaskSet.tasks.size === 2) + assert(newTaskSet.tasks.length === 2) // Complete task 0 from the original task set (i.e., not the one that's currently active). // This should still be counted towards the job being complete (but there's still one // outstanding task). @@ -2878,7 +2878,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // failed hostA, so both should be resubmitted. Complete them on hostB successfully. scheduler.resubmitFailedStages() assert(taskSets(2).stageId === 0 && taskSets(2).stageAttemptId === 1 - && taskSets(2).tasks.size === 2) + && taskSets(2).tasks.length === 2) complete(taskSets(2), Seq( (Success, makeMapStatus("hostB", 2)), (Success, makeMapStatus("hostB", 2)))) @@ -2898,7 +2898,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // Task(stageId=1, stageAttemptId=1, partitionId=1) of this new active stage attempt // is still running. assert(taskSets(3).stageId === 1 && taskSets(3).stageAttemptId === 1 - && taskSets(3).tasks.size === 2) + && taskSets(3).tasks.length === 2) runEvent(makeCompletionEvent( taskSets(3).tasks(0), Success, makeMapStatus("hostB", 2))) @@ -2907,7 +2907,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // was ignored due to executor failure assert(taskSets.size === 5) assert(taskSets(4).stageId === 1 && taskSets(4).stageAttemptId === 2 - && taskSets(4).tasks.size === 1) + && taskSets(4).tasks.length === 1) // Complete task(stageId=1, stageAttempt=2, partitionId=1) successfully. runEvent(makeCompletionEvent( @@ -4445,7 +4445,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // a scenario where stage 0 needs to be resubmitted upon finishing all tasks. // Merge finalization should be scheduled in this case. for ((result, i) <- taskResults.zipWithIndex) { - if (i == taskSets(0).tasks.size - 1) { + if (i == taskSets(0).tasks.length - 1) { mapOutputTracker.removeOutputsOnHost("host0") } runEvent(makeCompletionEvent(taskSets(0).tasks(i), result._1, result._2)) @@ -4522,7 +4522,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti // a scenario where stage 0 needs to be resubmitted upon finishing all tasks. // Merge finalization should be scheduled in this case. for ((result, i) <- taskResults.zipWithIndex) { - if (i == taskSets(0).tasks.size - 1) { + if (i == taskSets(0).tasks.length - 1) { mapOutputTracker.removeOutputsOnHost("host0") } runEvent(makeCompletionEvent(taskSets(0).tasks(i), result._1, result._2)) @@ -4986,7 +4986,7 @@ class DAGSchedulerSuite extends SparkFunSuite with TempLocalSparkContext with Ti * Note that this checks only the host and not the executor ID. 
*/ private def assertLocations(taskSet: TaskSet, hosts: Seq[Seq[String]]): Unit = { - assert(hosts.size === taskSet.tasks.size) + assert(hosts.size === taskSet.tasks.length) for ((taskLocs, expectedLocs) <- taskSet.tasks.map(_.preferredLocations).zip(hosts)) { assert(taskLocs.map(_.host).toSet === expectedLocs.toSet) } diff --git a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala index cf2240a0511d7..13e7ff758ebaf 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/MapStatusSuite.scala @@ -268,7 +268,7 @@ class MapStatusSuite extends SparkFunSuite { "number of skewed block sizes") val smallAndUntrackedBlocks = - nonEmptyBlocks.slice(0, nonEmptyBlocks.size - trackedSkewedBlocksLength) + nonEmptyBlocks.slice(0, nonEmptyBlocks.length - trackedSkewedBlocksLength) val avg = smallAndUntrackedBlocks.sum / smallAndUntrackedBlocks.length val loc = BlockManagerId("a", "b", 10) diff --git a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala index 0533f9d7d8a49..f1a4b97c2981d 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala @@ -143,14 +143,14 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { val rdd = sc.parallelize(Seq(1), 1) sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully _, rdd.partitions.indices) - assert(tempDir.list().size === 1) + assert(tempDir.list().length === 1) } ignore("If commit fails, if task is retried it should not be locked, and will succeed.") { val rdd = sc.parallelize(Seq(1), 1) sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).failFirstCommitAttempt _, rdd.partitions.indices) - assert(tempDir.list().size === 1) + assert(tempDir.list().length === 1) } test("Job should not complete if all commits are denied") { @@ -161,13 +161,13 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter { def resultHandler(x: Int, y: Unit): Unit = {} val futureAction: SimpleFutureAction[Unit] = sc.submitJob[Int, Unit, Unit](rdd, OutputCommitFunctions(tempDir.getAbsolutePath).commitSuccessfully, - 0 until rdd.partitions.size, resultHandler, ()) + 0 until rdd.partitions.length, resultHandler, ()) // It's an error if the job completes successfully even though no committer was authorized, // so throw an exception if the job was allowed to complete. 
intercept[TimeoutException] { ThreadUtils.awaitResult(futureAction, 5.seconds) } - assert(tempDir.list().size === 0) + assert(tempDir.list().length === 0) } test("Only authorized committer failures can clear the authorized committer lock (SPARK-6614)") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala index f0ae7fc74112b..2ab7df0d9cfd3 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala @@ -1815,10 +1815,10 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext var has1Gpu = 0 for (tDesc <- taskDescriptions) { assert(tDesc.resources.contains(GPU)) - if (tDesc.resources(GPU).addresses.size == 2) { + if (tDesc.resources(GPU).addresses.length == 2) { has2Gpus += 1 } - if (tDesc.resources(GPU).addresses.size == 1) { + if (tDesc.resources(GPU).addresses.length == 1) { has1Gpu += 1 } } @@ -1836,7 +1836,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext taskDescriptions = taskScheduler.resourceOffers(workerOffers3).flatten assert(2 === taskDescriptions.length) assert(taskDescriptions.head.resources.contains(GPU)) - assert(2 == taskDescriptions.head.resources(GPU).addresses.size) + assert(2 == taskDescriptions.head.resources(GPU).addresses.length) } test("Scheduler works with task resource profiles") { @@ -1875,10 +1875,10 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext var has1Gpu = 0 for (tDesc <- taskDescriptions) { assert(tDesc.resources.contains(GPU)) - if (tDesc.resources(GPU).addresses.size == 2) { + if (tDesc.resources(GPU).addresses.length == 2) { has2Gpus += 1 } - if (tDesc.resources(GPU).addresses.size == 1) { + if (tDesc.resources(GPU).addresses.length == 1) { has1Gpu += 1 } } @@ -1896,7 +1896,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext taskDescriptions = taskScheduler.resourceOffers(workerOffers3).flatten assert(2 === taskDescriptions.length) assert(taskDescriptions.head.resources.contains(GPU)) - assert(2 == taskDescriptions.head.resources(GPU).addresses.size) + assert(2 == taskDescriptions.head.resources(GPU).addresses.length) } test("Calculate available tasks slots for task resource profiles") { diff --git a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala index 2fe50a486dbd6..2f8b6df8beac5 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/TaskSetManagerSuite.scala @@ -845,7 +845,7 @@ class TaskSetManagerSuite // multiple 1k result val r = sc.makeRDD(0 until 10, 10).map(genBytes(1024)).collect() - assert(10 === r.size) + assert(10 === r.length) // single 10M result val thrown = intercept[SparkException] {sc.makeRDD(genBytes(10 << 20)(0), 1).collect()} @@ -863,7 +863,7 @@ class TaskSetManagerSuite sc = new SparkContext("local", "test", conf) // final result is below limit. 
val r = sc.makeRDD(0 until 2000, 2000).distinct(10).filter(_ == 0).collect() - assert(1 === r.size) + assert(1 === r.length) } test("[SPARK-13931] taskSetManager should not send Resubmitted tasks after being a zombie") { diff --git a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala index 4acb4bbc779c3..25db9a5c68612 100644 --- a/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/KryoSerializerDistributedSuite.scala @@ -48,7 +48,7 @@ class KryoSerializerDistributedSuite extends SparkFunSuite with LocalSparkContex val shuffledRDD = cachedRDD.map { case (i, o) => (i * i * i - 10 * i * i, o)} // Join the two RDDs, and force evaluation - assert(shuffledRDD.join(cachedRDD).collect().size == 1) + assert(shuffledRDD.join(cachedRDD).collect().length == 1) } } diff --git a/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala b/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala index 8a9537b4f18d7..a9ca9135f38a9 100644 --- a/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala +++ b/core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala @@ -236,7 +236,7 @@ class IndexShuffleBlockResolverSuite extends SparkFunSuite { ShuffleMergedBlockId(shuffleId, shuffleMergeId, reduceId), dirs) assert(mergedBlockMeta.getNumChunks === 3) - assert(mergedBlockMeta.readChunkBitmaps().size === 3) + assert(mergedBlockMeta.readChunkBitmaps().length === 3) assert(mergedBlockMeta.readChunkBitmaps()(0).contains(1)) assert(mergedBlockMeta.readChunkBitmaps()(0).contains(2)) assert(!mergedBlockMeta.readChunkBitmaps()(0).contains(3)) diff --git a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala index be1b9be2d85d9..b644224652266 100644 --- a/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/storage/DiskStoreSuite.scala @@ -117,7 +117,7 @@ class DiskStoreSuite extends SparkFunSuite { val chunkedByteBuffer = blockData.toChunkedByteBuffer(ByteBuffer.allocate) val chunks = chunkedByteBuffer.chunks - assert(chunks.size === 2) + assert(chunks.length === 2) for (chunk <- chunks) { assert(chunk.limit() === 10 * 1024) } diff --git a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala index c377f2495d05d..35ef0587b9b4c 100644 --- a/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/FileAppenderSuite.scala @@ -192,9 +192,9 @@ class FileAppenderSuite extends SparkFunSuite with BeforeAndAfter { // verify whether the earliest file has been deleted val rolledOverFiles = allGeneratedFiles.filter { _ != testFile.toString }.toArray.sorted - logInfo(s"All rolled over files generated:${rolledOverFiles.size}\n" + + logInfo(s"All rolled over files generated:${rolledOverFiles.length}\n" + rolledOverFiles.mkString("\n")) - assert(rolledOverFiles.size > 2) + assert(rolledOverFiles.length > 2) val earliestRolledOverFile = rolledOverFiles.head val existingRolledOverFiles = RollingFileAppender.getSortedRolledOverFiles( testFile.getParentFile.toString, testFile.getName).map(_.toString) diff --git 
a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala index 8aa4be6c2ff8d..82a4c85b02fa0 100644 --- a/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/collection/SizeTrackerSuite.scala @@ -104,7 +104,7 @@ private object SizeTrackerSuite { * Run speed tests for size tracking collections. */ def main(args: Array[String]): Unit = { - if (args.size < 1) { + if (args.length < 1) { // scalastyle:off println println("Usage: SizeTrackerSuite [num elements]") // scalastyle:on println From d30c9a90c6cf9033c45f6f418864c8d7013911e5 Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Sun, 26 Nov 2023 14:10:27 +0100 Subject: [PATCH 08/40] [SPARK-45826][SQL] Add a SQL config for stack traces in DataFrame query context ### What changes were proposed in this pull request? In the PR, I propose to add new SQL config `spark.sql.stackTracesInDataFrameContext` which defines how many non-Spark stack traces should be captured into DataFrame query context. By default, the config is set to 1. ### Why are the changes needed? To improve user experience with Spark SQL. When users troubleshoot an issue, they might need more stack traces in the DataFrame context. For example: ```scala scala> spark.conf.set("spark.sql.ansi.enabled", true) scala> spark.conf.set("spark.sql.stackTracesInDataFrameContext", 3) scala> spark.range(1).select(lit(1) / lit(0)).collect() org.apache.spark.SparkArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012 == DataFrame == "div" was called from (:1) (:16) .(:1) ``` ### Does this PR introduce _any_ user-facing change? No, it doesn't change the default behaviour. ### How was this patch tested? By running the modified test suite: ``` $ build/sbt "test:testOnly *QueryContextSuite" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43695 from MaxGekk/df-context-slice-conf-2. Authored-by: Max Gekk Signed-off-by: Max Gekk --- .../scala/org/apache/spark/sql/internal/SQLConf.scala | 9 +++++++++ .../src/main/scala/org/apache/spark/sql/package.scala | 5 ++++- .../org/apache/spark/sql/errors/QueryContextSuite.scala | 7 +++++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 6a8e1f92fc510..5133c40bc6faa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4577,6 +4577,13 @@ object SQLConf { .booleanConf .createWithDefault(false) + val STACK_TRACES_IN_DATAFRAME_CONTEXT = buildConf("spark.sql.stackTracesInDataFrameContext") + .doc("The number of non-Spark stack traces in the captured DataFrame query context.") + .version("4.0.0") + .intConf + .checkValue(_ > 0, "The number of stack traces in the DataFrame context must be positive.") + .createWithDefault(1) + /** * Holds information about keys that have been deprecated. 
* @@ -5465,6 +5472,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def legacyRaiseErrorWithoutErrorClass: Boolean = getConf(SQLConf.LEGACY_RAISE_ERROR_WITHOUT_ERROR_CLASS) + def stackTracesInDataFrameContext: Int = getConf(SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/main/scala/org/apache/spark/sql/package.scala b/sql/core/src/main/scala/org/apache/spark/sql/package.scala index 96bef83af0a86..877d9906a1cff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/package.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/package.scala @@ -22,6 +22,7 @@ import java.util.regex.Pattern import org.apache.spark.annotation.{DeveloperApi, Unstable} import org.apache.spark.sql.catalyst.trees.{CurrentOrigin, Origin} import org.apache.spark.sql.execution.SparkStrategy +import org.apache.spark.sql.internal.SQLConf /** * Allows the execution of relational queries, including those expressed in SQL using Spark. @@ -103,7 +104,9 @@ package object sql { while (i < st.length && !sparkCode(st(i))) i += 1 // Stop at the end of the first Spark code traces while (i < st.length && sparkCode(st(i))) i += 1 - val origin = Origin(stackTrace = Some(st.slice(i - 1, i + 1))) + val origin = Origin(stackTrace = Some(st.slice( + from = i - 1, + until = i + SQLConf.get.stackTracesInDataFrameContext))) CurrentOrigin.withOrigin(origin)(f) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala index 7d57eeb01bfa1..426822da3c912 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryContextSuite.scala @@ -25,14 +25,17 @@ import org.apache.spark.sql.test.SharedSparkSession class QueryContextSuite extends QueryTest with SharedSparkSession { test("summary of DataFrame context") { - withSQLConf(SQLConf.ANSI_ENABLED.key -> "true") { + withSQLConf( + SQLConf.ANSI_ENABLED.key -> "true", + SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT.key -> "2") { val e = intercept[SparkArithmeticException] { spark.range(1).select(lit(1) / lit(0)).collect() } assert(e.getQueryContext.head.summary() == """== DataFrame == |"div" was called from - |org.apache.spark.sql.errors.QueryContextSuite.$anonfun$new$3(QueryContextSuite.scala:30) + |org.apache.spark.sql.errors.QueryContextSuite.$anonfun$new$3(QueryContextSuite.scala:32) + |org.scalatest.Assertions.intercept(Assertions.scala:749) |""".stripMargin) } } From ade861d19910df724d9233df98c059ff9d57f795 Mon Sep 17 00:00:00 2001 From: wforget <643348094@qq.com> Date: Sun, 26 Nov 2023 23:28:52 +0800 Subject: [PATCH 09/40] [SPARK-45974][SQL] Add scan.filterAttributes non-empty judgment for RowLevelOperationRuntimeGroupFiltering ### What changes were proposed in this pull request? Add scan.filterAttributes non-empty judgment for RowLevelOperationRuntimeGroupFiltering. ### Why are the changes needed? When scan.filterAttributes is empty, an invalid dynamic pruning condition will be generated in RowLevelOperationRuntimeGroupFiltering. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? added test case ### Was this patch authored or co-authored using generative AI tooling? No Closes #43869 from wForget/SPARK-45974. 
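For illustration, a minimal sketch of the trigger case described above, assuming the public DataSource V2 interfaces referenced by this patch; the class and column names below are hypothetical and not taken from the change itself. A `SupportsRuntimeV2Filtering` scan over a non-partitioned table can legitimately report no filterable attributes, which is exactly the case the new `scan.filterAttributes().nonEmpty` guard skips before building a dynamic pruning condition.

```scala
import org.apache.spark.sql.connector.expressions.NamedReference
import org.apache.spark.sql.connector.expressions.filter.Predicate
import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeV2Filtering}
import org.apache.spark.sql.types.{IntegerType, StructType}

// Hypothetical scan of a non-partitioned table (names are illustrative only).
class NonPartitionedScan extends Scan with SupportsRuntimeV2Filtering {
  override def readSchema(): StructType =
    new StructType().add("pk", IntegerType).add("salary", IntegerType)

  // An empty array here is the case the new nonEmpty guard checks for:
  // there are no attributes to derive group-filtering keys from.
  override def filterAttributes(): Array[NamedReference] = Array.empty

  // Runtime predicates are never pushed for this scan, so this is a no-op.
  override def filter(predicates: Array[Predicate]): Unit = {}
}
```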
Authored-by: wforget <643348094@qq.com> Signed-off-by: Wenchen Fan --- ...wLevelOperationRuntimeGroupFiltering.scala | 4 ++- .../connector/MergeIntoTableSuiteBase.scala | 32 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala index b8288c636c386..7c28f91ee1cc6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/RowLevelOperationRuntimeGroupFiltering.scala @@ -51,7 +51,8 @@ class RowLevelOperationRuntimeGroupFiltering(optimizeSubqueries: Rule[LogicalPla // apply special dynamic filtering only for group-based row-level operations case GroupBasedRowLevelOperation(replaceData, _, Some(cond), DataSourceV2ScanRelation(_, scan: SupportsRuntimeV2Filtering, _, _, _)) - if conf.runtimeRowLevelOperationGroupFilterEnabled && cond != TrueLiteral => + if conf.runtimeRowLevelOperationGroupFilterEnabled && cond != TrueLiteral + && scan.filterAttributes().nonEmpty => // use reference equality on scan to find required scan relations val newQuery = replaceData.query transformUp { @@ -116,6 +117,7 @@ class RowLevelOperationRuntimeGroupFiltering(optimizeSubqueries: Rule[LogicalPla matchingRowsPlan: LogicalPlan, buildKeys: Seq[Attribute], pruningKeys: Seq[Attribute]): Expression = { + assert(buildKeys.nonEmpty && pruningKeys.nonEmpty) val buildQuery = Aggregate(buildKeys, buildKeys, matchingRowsPlan) DynamicPruningExpression( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala index e7555c23fa4fc..5668e5981910c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/MergeIntoTableSuiteBase.scala @@ -32,6 +32,38 @@ abstract class MergeIntoTableSuiteBase extends RowLevelOperationSuiteBase { import testImplicits._ + test("SPARK-45974: merge into non filter attributes table") { + val tableName: String = "cat.ns1.non_partitioned_table" + withTable(tableName) { + withTempView("source") { + val sourceRows = Seq( + (1, 100, "hr"), + (2, 200, "finance"), + (3, 300, "hr")) + sourceRows.toDF("pk", "salary", "dep").createOrReplaceTempView("source") + + sql(s"CREATE TABLE $tableName (pk INT NOT NULL, salary INT, dep STRING)".stripMargin) + + val df = sql( + s"""MERGE INTO $tableName t + |USING (select * from source) s + |ON t.pk = s.pk + |WHEN MATCHED THEN + | UPDATE SET t.salary = s.salary + |WHEN NOT MATCHED THEN + | INSERT * + |""".stripMargin) + + checkAnswer( + sql(s"SELECT * FROM $tableName"), + Seq( + Row(1, 100, "hr"), // insert + Row(2, 200, "finance"), // insert + Row(3, 300, "hr"))) // insert + } + } + } + test("merge into empty table with NOT MATCHED clause") { withTempView("source") { createTable("pk INT NOT NULL, salary INT, dep STRING") From d5fad63810149a69527706bb16333baee06a4270 Mon Sep 17 00:00:00 2001 From: Niranjan Jayakar Date: Mon, 27 Nov 2023 08:47:31 +0900 Subject: [PATCH 10/40] [SPARK-46074][CONNECT][SCALA] Insufficient details in error message on UDF failure ### What changes were proposed in this pull request? 
Update the error message for 'FAILED_EXECUTE_UDF' with the underlying error message. ### Why are the changes needed? The Spark Connect client does not receive the underlying cause for a UDF failure. This means that a user needs to go into the driver logs to identify the cause for failure. Update the error message so that the underlying exception's message is included. ### Does this PR introduce _any_ user-facing change? Yes. This changes the error message that the user sees when a UDF fails. A new error parameter is added but the SQL state and existing parameters are unchanged and should cause no regressions. The error message prior to this change: ``` org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 0.0 failed 1 times, most recent failure: Lost task 3.0 in stage 0.0 (TID 3) (192.168.188.21 executor driver): org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] Failed to execute user defined function (` (cmd2$Helper$$Lambda$2170/0x000000f001d23000)`: (int) => int). SQLSTATE: 39000 ``` Sample of the new error message: ``` org.apache.spark.SparkException: [FAILED_EXECUTE_UDF] User defined function (` (cmd2$Helper$$Lambda$2422/0x0000007001ec1a10)`: (int) => int) failed due to: java.lang.NoClassDefFoundError: com/nija/test/MyClass. SQLSTATE: 39000 ``` ### How was this patch tested? Tested manually by running a [local connect server] and [connect client REPL] [local connect server]: https://github.com/apache/spark/blob/master/connector/connect/bin/spark-connect-shell [connect client REPL]: https://github.com/apache/spark/blob/master/connector/connect/bin/spark-connect-scala-client ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43983 from nija-at/udf-error-msg. Authored-by: Niranjan Jayakar Signed-off-by: Hyukjin Kwon --- .../src/main/resources/error/error-classes.json | 2 +- docs/sql-error-conditions.md | 2 +- .../spark/sql/errors/QueryExecutionErrors.scala | 3 ++- .../sql/catalyst/expressions/ScalaUDFSuite.scala | 6 ++++-- .../sql/errors/QueryExecutionErrorsSuite.scala | 6 ++++-- .../spark/sql/hive/execution/HiveUDFSuite.scala | 14 ++++++++++++-- 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 19b70307a1cdd..5b70edf249d14 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -1067,7 +1067,7 @@ }, "FAILED_EXECUTE_UDF" : { "message" : [ - "Failed to execute user defined function (: () => )." + "User defined function (: () => ) failed due to: ." ], "sqlState" : "39000" }, diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index c0f88bffa6e5b..71abf10da328b 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -643,7 +643,7 @@ Column expression `` cannot be sorted because its type `` is not [SQLSTATE: 39000](sql-error-conditions-sqlstates.html#class-39-external-routine-invocation-exception) -Failed to execute user defined function (``: (``) => ``). +User defined function (``: (``) => ``) failed due to: ``. 
### FAILED_FUNCTION_CALL diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala index 1aa25a51fa9c6..24332479f1937 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryExecutionErrors.scala @@ -190,7 +190,8 @@ private[sql] object QueryExecutionErrors extends QueryErrorsBase with ExecutionE messageParameters = Map( "functionName" -> toSQLId(functionName), "signature" -> inputTypes, - "result" -> outputType), + "result" -> outputType, + "reason" -> e.toString), cause = e) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala index 1b40e02aa8662..00fc9d462eb65 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ScalaUDFSuite.scala @@ -50,13 +50,15 @@ class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { Literal.create(null, StringType) :: Nil, Option(resolvedEncoder[String]()) :: Nil) + val pattern = "User defined function .+ failed due to: java.lang.NullPointerException".r + val e1 = intercept[SparkException](udf.eval()) - assert(e1.getMessage.contains("Failed to execute user defined function")) + assert(pattern.findFirstIn(e1.getMessage).isDefined) val e2 = intercept[SparkException] { checkEvaluationWithUnsafeProjection(udf, null) } - assert(e2.getMessage.contains("Failed to execute user defined function")) + assert(pattern.findFirstIn(e2.getMessage).isDefined) } test("SPARK-22695: ScalaUDF should not use global variables") { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala index a49352cbe5080..1e869bfd25aa9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryExecutionErrorsSuite.scala @@ -431,7 +431,8 @@ class QueryExecutionErrorsSuite parameters = Map( "functionName" -> functionNameRegex, "signature" -> "string, int", - "result" -> "string"), + "result" -> "string", + "reason" -> "java.lang.StringIndexOutOfBoundsException: begin 5, end 6, length 5"), matchPVals = true) } @@ -455,7 +456,8 @@ class QueryExecutionErrorsSuite errorClass = "FAILED_EXECUTE_UDF", parameters = Map("functionName" -> functionNameRegex, "signature" -> "string, int", - "result" -> "string"), + "result" -> "string", + "reason" -> "java.lang.StringIndexOutOfBoundsException: begin 5, end 6, length 5"), matchPVals = true) } diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala index 3813071b680c9..096b11feb9bcd 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveUDFSuite.scala @@ -754,7 +754,9 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { "functionName" -> "`org`.`apache`.`hadoop`.`hive`.`ql`.`udf`.`generic`.`GenericUDFAssertTrue`", "signature" -> "boolean", - "result" -> "void")) + "result" -> "void", + 
"reason" -> + "org.apache.hadoop.hive.ql.metadata.HiveException: ASSERT_TRUE(): assertion failed.")) } } } @@ -778,6 +780,13 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { withTable("HiveSimpleUDFTable") { sql(s"create table HiveSimpleUDFTable as select false as v") val df = sql("SELECT CodeGenHiveSimpleUDF(v) from HiveSimpleUDFTable") + + val reason = """ + |org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public + |boolean org.apache.spark.sql.hive.execution.SimpleUDFAssertTrue.evaluate(boolean) with + |arguments {false}:ASSERT_TRUE(): assertion failed.""" + .stripMargin.replaceAll("\n", " ").trim + checkError( exception = intercept[SparkException](df.collect()).getCause.asInstanceOf[SparkException], errorClass = "FAILED_EXECUTE_UDF", @@ -785,7 +794,8 @@ class HiveUDFSuite extends QueryTest with TestHiveSingleton with SQLTestUtils { "functionName" -> "`org`.`apache`.`spark`.`sql`.`hive`.`execution`.`SimpleUDFAssertTrue`", "signature" -> "boolean", - "result" -> "boolean" + "result" -> "boolean", + "reason" -> reason ) ) } From 400db88d00e50750513d733be697b6b2dd9043d3 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 08:49:18 +0900 Subject: [PATCH 11/40] [SPARK-46103][PYTHON][INFRA][BUILD][DOCS] Enhancing PySpark documentation ### What changes were proposed in this pull request? This PR proposes to enhance the PySpark documentation by leveraging modern Sphinx features and functionalities. The primary objective is to improve the overall user experience and readability of the documentation. To achieve this, the PR includes an upgrade of `Sphinx` and `Jinja2` to their newer/latest versions, enabling us to use the latest `pydata_sphinx_theme` features such as light/dark mode toggling. ### Why are the changes needed? Currently, the PySpark documentation is unable to utilize many of the advanced features available in recent `Sphinx` versions due to older package versions. This limitation hinders the documentation's visual appeal and usability, particularly when compared to other projects like Pandas which have already adopted these enhancements. For example: ## Pandas API reference (better layout / switching light & dark mode available) ### Dark mode Screenshot 2023-11-26 at 5 43 29 AM ### Light mode Screenshot 2023-11-26 at 5 45 01 AM ## PySpark API reference (less readable compare to pandas / no light & dark mode) Screenshot 2023-11-26 at 5 43 48 AM By updating the `Sphinx` and `Jinja2` versions, we can significantly improve the documentation's layout, design, and interactive features, thereby enhancing the end-user experience. ### Does this PR introduce _any_ user-facing change? No API changes, but users will notice a more modern and user-friendly interface in the PySpark documentation. New features like light/dark mode and improved page layouts will be available as below: ## Before Screenshot 2023-11-26 at 5 43 48 AM ## After ### Dark mode Screenshot 2023-11-26 at 6 17 13 AM ### Light mode Screenshot 2023-11-26 at 6 16 47 AM ### How was this patch tested? Manually built docs from local environment, and also tested combinations between various `Jinja2`, `Sphinx` and `pydata_sphinx_theme` versions for best document rendering. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44012 from itholic/upgrade_sphinx. 
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- .github/workflows/build_and_test.yml | 2 +- dev/requirements.txt | 6 +- .../docs/source/_static/spark-logo-dark.png | Bin 0 -> 23555 bytes .../docs/source/_static/spark-logo-light.png | Bin 0 -> 18773 bytes .../autosummary/accessor_attribute.rst | 6 ++ .../autosummary/accessor_method.rst | 6 ++ .../autosummary/class_with_docs.rst | 4 +- .../_templates/autosummary/plot_class.rst | 53 ++++++++++++++++++ python/docs/source/conf.py | 6 +- .../source/reference/pyspark.pandas/frame.rst | 8 ++- .../reference/pyspark.pandas/indexing.rst | 12 ++++ .../source/reference/pyspark.pandas/io.rst | 5 ++ .../reference/pyspark.pandas/series.rst | 22 +++++++- .../reference/pyspark.sql/spark_session.rst | 14 +++++ 14 files changed, 136 insertions(+), 8 deletions(-) create mode 100644 python/docs/source/_static/spark-logo-dark.png create mode 100644 python/docs/source/_static/spark-logo-light.png create mode 100644 python/docs/source/_templates/autosummary/accessor_attribute.rst create mode 100644 python/docs/source/_templates/autosummary/accessor_method.rst create mode 100644 python/docs/source/_templates/autosummary/plot_class.rst diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5033ab00601ab..a4c9ec3042582 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -751,7 +751,7 @@ jobs: # See also https://issues.apache.org/jira/browse/SPARK-35375. # Pin the MarkupSafe to 2.0.1 to resolve the CI error. # See also https://issues.apache.org/jira/browse/SPARK-38279. - python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme sphinx-copybutton nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' + python3.9 -m pip install 'sphinx==4.2.0' mkdocs 'pydata_sphinx_theme==0.13' sphinx-copybutton nbsphinx numpydoc jinja2 'markupsafe==2.0.1' 'pyzmq<24.0.0' python3.9 -m pip install ipython_genutils # See SPARK-38517 python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 diff --git a/dev/requirements.txt b/dev/requirements.txt index 7de55ec24968a..a7af0907c7264 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -31,12 +31,12 @@ pandas-stubs<1.2.0.54 mkdocs # Documentation (Python) -pydata_sphinx_theme +pydata_sphinx_theme==0.13 ipython nbsphinx numpydoc -jinja2<3.0.0 -sphinx<3.1.0 +jinja2 +sphinx==4.2.0 sphinx-plotly-directive sphinx-copybutton docutils<0.18.0 diff --git a/python/docs/source/_static/spark-logo-dark.png b/python/docs/source/_static/spark-logo-dark.png new file mode 100644 index 0000000000000000000000000000000000000000..7460faec37fc7923d11d968fe8bc9d90f72381fd GIT binary patch literal 23555 zcmZ^~1ymg0vM-DWhaka%LvRU$%itC~cyNc{u7g{EAi;vW1P|^`aJS&@KIp)}JpSk0 zchCLqdSCb2y{oJ0SJJ(Et!@2HRaq7bofI7o4h~CRPU`b({qkBuQBhv6H5@5-uN9J& zxRN*=-0wIHs0s4xJ(Zcmd$pKx$QPMNK0 z!mol*3mth&B_%lKR~i)#86FP~@s)yqJ>lR<;r_AqO2K`CC;M;uGd$ye=^((tMcBY0 z{g=*{*ZQAB?zQ}b`FD+&3;!RBxd{KQ{gR9L-}GyYaB#MT5Vh9|%}Gwz6%Gyq|DOU6 zm!3%s2Zu0XqoL!bqogQk=4j7uV(w^a!R}@6^bZP7*h}!0w6}0Gq4Kh~b8r>(5~2B* zhTtpx51WIA>R&2uwjwk-N~%^r7jsL&&r&k~1%JI0p|N&z za}wm>@bvU#_XMyzx>#|16c7;L;N;@q;$nN%U~~0$a5M2@b8w~ow~_yAN6Nz0%*DpZ z&BoDz>L0r%rjG7zA~ZDrIQsA7-*LLxSpLV8gX@2#^_n2ZKN60Q?3^6`FPMdw&HoSV zAIZOA|H|v%?u7p_CaC+_!qw5v{hzRad|bl+^6>v*{wKbF1OB70>0sjq{J)U@#s1$Y zo&U)H5%Bs1{^RF=Nd5~c%<<3L{vU7u z-%aaZ+}E}RL>K1x?{*4AX9^!qgM$-;la~_L@Pa?fLJQQFZn%H)nEVkIN*{<@t0?hK 
[... base85-encoded GIT binary patch data for python/docs/source/_static/spark-logo-dark.png and spark-logo-light.png elided ...]
z^X8IXx+9&+^lz_sNfXy#S^2lrdd2hdr?J14I$T>wy&|t7_%~x0I3f?;3|YhuAXBB! z0ms5H>uARL(lIJ6Y0iT?!z&}9^bji8ob~HBaEnaxr__U`k~w)C=B0yrz6-3xVSFd? zIMdvEgJ&#xKLY%JCSt}o&=ZjWnr3BC+D`a-R3KYLMQiy_eSO0*#6F#LUm$!_dOlM1 zAb+ZYOCA>GEojGxV;wea+Kh4Jp*DZ)*7KF?xoFT>L76y0veU|&!jH_>R=jdF9uB3?4a z7uoUULQ4GZWS)@~V%q5#6HbR1Iq_9?H6av`pGAWB1LHnxiC;&HC`SxG->cUiZ>O$x z&Q2-#TV?vl!LqVLhc)&xQu!9un4iDP!#K4c`S==Y)6q~~jh~FOxA4$U|*L)4dZ(p7sJQZo# zl6*-?u?Il=l01D(j#)5byozD!Z7P(MQ-*73NKEh2MH^6GpW&VJAr-V&uUR+MeAcWtq;V*TgYMTjHHj&? zD26w<=|KaHUo#Wny&j8MvCW$k(f~9&==)B$ZmXrvX+3-P{Cx4^#TUpsw-}F?Uz|n5 zd_j*Medy3;rF@F~+f0mVd;~`>Bdj8S2a%V3D7c=`!}3I3)I$y)&{m4E1egyJiNwd! zGpXYCgJU}JZ;W|PgMpr4(%aF%{FlNw))UAHkoD)AHf`D!Yv?Y->yAN)<-JBS5;1A$ ztTi0Gu?mK-Mj%QujtZiuRN+hkNzW&@|Hz(!&g6^p5cDSA*GD4puMDH(m*gp9daYDx zhj_LP4iqf(TO(Us|75)U^5mHa=MV*ds;Z9?eT-!8YLsFYJZz5Q|(zz`dlRB}v-a%yDK3e94gUJpaEc zE8AS`7ENFLVPMFC?S%u|gwV?zy*xtPj4U2(FZXq4Bz27(C&qv5)y&$E_3jHPJ&r-8us*lObWw74!{96M6_?m z`u*PAmKzHedbwb;_$`|4n+qXzAqPswfp#zS%+Jd``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.spark.frame DataFrame.spark.cache @@ -319,8 +320,8 @@ specific plotting methods of the form ``DataFrame.plot.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst - DataFrame.plot DataFrame.plot.area DataFrame.plot.barh DataFrame.plot.bar @@ -330,6 +331,10 @@ specific plotting methods of the form ``DataFrame.plot.``. DataFrame.plot.pie DataFrame.plot.scatter DataFrame.plot.density + +.. autosummary:: + :toctree: api/ + DataFrame.hist DataFrame.boxplot DataFrame.kde @@ -341,6 +346,7 @@ These can be accessed by ``DataFrame.pandas_on_spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.pandas_on_spark.apply_batch DataFrame.pandas_on_spark.transform_batch diff --git a/python/docs/source/reference/pyspark.pandas/indexing.rst b/python/docs/source/reference/pyspark.pandas/indexing.rst index 7ec4387bb679a..301e849ffe28a 100644 --- a/python/docs/source/reference/pyspark.pandas/indexing.rst +++ b/python/docs/source/reference/pyspark.pandas/indexing.rst @@ -129,8 +129,14 @@ in Spark. These can be accessed by ``Index.spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Index.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Index.spark.transform Sorting @@ -308,9 +314,15 @@ in Spark. These can be accessed by ``MultiIndex.spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst MultiIndex.spark.data_type MultiIndex.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + MultiIndex.spark.transform MultiIndex Sorting diff --git a/python/docs/source/reference/pyspark.pandas/io.rst b/python/docs/source/reference/pyspark.pandas/io.rst index 118dd49a4ada9..fd41a03699cac 100644 --- a/python/docs/source/reference/pyspark.pandas/io.rst +++ b/python/docs/source/reference/pyspark.pandas/io.rst @@ -69,6 +69,11 @@ Generic Spark I/O :toctree: api/ read_spark_io + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + DataFrame.spark.to_spark_io Flat File / CSV diff --git a/python/docs/source/reference/pyspark.pandas/series.rst b/python/docs/source/reference/pyspark.pandas/series.rst index 01fb5aa87fb15..88d1861c6ccf0 100644 --- a/python/docs/source/reference/pyspark.pandas/series.rst +++ b/python/docs/source/reference/pyspark.pandas/series.rst @@ -270,8 +270,14 @@ in Spark. These can be accessed by ``Series.spark.``. .. 
autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.spark.column + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Series.spark.transform Series.spark.apply @@ -304,6 +310,7 @@ Datetime Properties .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.dt.date Series.dt.year @@ -333,6 +340,7 @@ Datetime Methods .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.dt.normalize Series.dt.strftime @@ -353,6 +361,7 @@ like ``Series.str.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.str.capitalize Series.str.cat @@ -416,10 +425,16 @@ the ``Series.cat`` accessor. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_attribute.rst Series.cat.categories Series.cat.ordered Series.cat.codes + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + Series.cat.rename_categories Series.cat.reorder_categories Series.cat.add_categories @@ -438,8 +453,8 @@ specific plotting methods of the form ``Series.plot.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst - Series.plot Series.plot.area Series.plot.bar Series.plot.barh @@ -449,6 +464,10 @@ specific plotting methods of the form ``Series.plot.``. Series.plot.line Series.plot.pie Series.plot.kde + +.. autosummary:: + :toctree: api/ + Series.hist Serialization / IO / Conversion @@ -476,6 +495,7 @@ These can be accessed by ``Series.pandas_on_spark.``. .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.pandas_on_spark.transform_batch diff --git a/python/docs/source/reference/pyspark.sql/spark_session.rst b/python/docs/source/reference/pyspark.sql/spark_session.rst index f25dbab5f6b9b..f242e4439cf4c 100644 --- a/python/docs/source/reference/pyspark.sql/spark_session.rst +++ b/python/docs/source/reference/pyspark.sql/spark_session.rst @@ -29,12 +29,21 @@ See also :class:`SparkSession`. :toctree: api/ SparkSession.active + +.. autosummary:: + :toctree: api/ + :template: autosummary/accessor_method.rst + SparkSession.builder.appName SparkSession.builder.config SparkSession.builder.enableHiveSupport SparkSession.builder.getOrCreate SparkSession.builder.master SparkSession.builder.remote + +.. autosummary:: + :toctree: api/ + SparkSession.catalog SparkSession.conf SparkSession.createDataFrame @@ -58,8 +67,13 @@ Spark Connect Only .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst SparkSession.builder.create + +.. autosummary:: + :toctree: api/ + SparkSession.addArtifact SparkSession.addArtifacts SparkSession.copyFromLocalToFs From d971dc461f5c461fb4972e83fe70ae8b2ef27eeb Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 08:59:24 +0900 Subject: [PATCH 12/40] [SPARK-46084][PS][FOLLOWUP] More refactoring by using `create_map` ### What changes were proposed in this pull request? This PR follows-up for https://github.com/apache/spark/pull/43993 to make more refactoring for `CategoricalOps`. ### Why are the changes needed? To optimize performance/debuggability/readability by using official API ### Does this PR introduce _any_ user-facing change? No, it's internal refactoring ### How was this patch tested? The existing CI should pass. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44015 from itholic/refactor_remaining_create_map. 
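The diff below replaces a per-category chain of `F.when(...).otherwise(...)` with a single `F.create_map` lookup. A small standalone sketch of the same pattern (toy column and category names, not taken from the patch):

```python
# Toy sketch of the create_map pattern shown in the diff below:
# build one map<code, category> literal and index it with the code column,
# instead of chaining F.when(...).otherwise(...) once per category.
from itertools import chain
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
categories = ["hr", "finance", "sales"]  # hypothetical category labels
df = spark.createDataFrame([(0,), (2,), (1,)], "code int")

kvs = chain(*[(F.lit(code), F.lit(cat)) for code, cat in enumerate(categories)])
map_col = F.create_map(*kvs)             # single MapType literal column
df.select("code", map_col[F.col("code")].alias("category")).show()
```

The map literal stays one flat expression, while the chained `when`/`otherwise` nests one level per category, which is part of the readability/performance motivation cited above.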
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/data_type_ops/categorical_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyspark/pandas/data_type_ops/categorical_ops.py b/python/pyspark/pandas/data_type_ops/categorical_ops.py index bbaded42be905..824666b5819b3 100644 --- a/python/pyspark/pandas/data_type_ops/categorical_ops.py +++ b/python/pyspark/pandas/data_type_ops/categorical_ops.py @@ -15,6 +15,7 @@ # limitations under the License. # +from itertools import chain from typing import cast, Any, Union import pandas as pd @@ -134,7 +135,7 @@ def _to_cat(index_ops: IndexOpsLike) -> IndexOpsLike: if len(categories) == 0: scol = F.lit(None) else: - scol = F.lit(None) - for code, category in reversed(list(enumerate(categories))): - scol = F.when(index_ops.spark.column == F.lit(code), F.lit(category)).otherwise(scol) + kvs = chain(*[(F.lit(code), F.lit(category)) for code, category in enumerate(categories)]) + map_scol = F.create_map(*kvs) + scol = map_scol[index_ops.spark.column] return index_ops._with_new_scol(scol) From ef27b9b15687dad416b6353409b1b44bc1451885 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Mon, 27 Nov 2023 09:00:11 +0900 Subject: [PATCH 13/40] [SPARK-46099][PS][DOCS] Refactor "Supported pandas API" generation script ### What changes were proposed in this pull request? This PR proposes to refactor the script used to generate the [Supported pandas API](https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/supported_pandas_api.html) documentation. The script has been restructured for better readability and maintainability. The refactoring includes: - Simplifying complex functions and breaking them into smaller, more manageable pieces. - Improving variable and function naming for clarity. - Adding comprehensive docstrings in the NumPy docstyle. - Streamlining the flow of the script to enhance logical coherence. ### Why are the changes needed? The previous version of the script was hard to understand and maintain due to its complexity and lack of documentation. This refactoring makes the script more accessible to new contributors and easier to modify or extend in the future. It also ensures that the script adheres to best practices in Python coding, making it a more reliable tool for generating accurate and up-to-date documentation. ### Does this PR introduce _any_ user-facing change? No user-facing changes. This PR only affects the internal documentation generation process. ### How was this patch tested? Tested by generating the documentation manually and verifying that the output remains consistent with the previous version. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44010 from itholic/refactor_doc_gen_script. Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/supported_api_gen.py | 188 ++++++++++++++------- 1 file changed, 124 insertions(+), 64 deletions(-) diff --git a/python/pyspark/pandas/supported_api_gen.py b/python/pyspark/pandas/supported_api_gen.py index 27d5cd4b37f9d..1f893520d2cef 100644 --- a/python/pyspark/pandas/supported_api_gen.py +++ b/python/pyspark/pandas/supported_api_gen.py @@ -33,13 +33,11 @@ from pyspark.loose_version import LooseVersion from pyspark.pandas.exceptions import PandasNotImplementedError +# Constants MAX_MISSING_PARAMS_SIZE = 5 -COMMON_PARAMETER_SET = { - "kwargs", - "args", - "cls", -} # These are not counted as missing parameters. 
+COMMON_PARAMETER_SET = {"kwargs", "args", "cls"} MODULE_GROUP_MATCH = [(pd, ps), (pdw, psw), (pdg, psg)] +PANDAS_LATEST_VERSION = "2.1.3" RST_HEADER = """ ===================== @@ -73,6 +71,10 @@ @unique class Implemented(Enum): + """ + Enumeration of implementation statuses. + """ + IMPLEMENTED = "Y" NOT_IMPLEMENTED = "N" PARTIALLY_IMPLEMENTED = "P" @@ -80,7 +82,7 @@ class Implemented(Enum): class SupportedStatus(NamedTuple): """ - Defines a supported status for specific pandas API + Defines a supported status for specific pandas API. """ implemented: str @@ -89,47 +91,108 @@ class SupportedStatus(NamedTuple): def generate_supported_api(output_rst_file_path: str) -> None: """ - Generate supported APIs status dictionary. + Generate the supported APIs status dictionary and write it to an RST file. Parameters ---------- output_rst_file_path : str The path to the document file in RST format. + """ + _check_pandas_version() + all_supported_status = _collect_supported_status() + _write_rst(output_rst_file_path, all_supported_status) + - Write supported APIs documentation. +def _check_pandas_version() -> None: """ - pandas_latest_version = "2.1.3" - if LooseVersion(pd.__version__) != LooseVersion(pandas_latest_version): + Check if the installed pandas version matches the expected version. + """ + if LooseVersion(pd.__version__) != LooseVersion(PANDAS_LATEST_VERSION): msg = ( - "Warning: Latest version of pandas (%s) is required to generate the documentation; " - "however, your version was %s" % (pandas_latest_version, pd.__version__) + f"Warning: pandas {PANDAS_LATEST_VERSION} is required; your version is {pd.__version__}" ) warnings.warn(msg, UserWarning) raise ImportError(msg) + +def _collect_supported_status() -> Dict[Tuple[str, str], Dict[str, SupportedStatus]]: + """ + Collect the supported status across multiple module paths. + """ all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]] = {} for pd_module_group, ps_module_group in MODULE_GROUP_MATCH: pd_modules = _get_pd_modules(pd_module_group) _update_all_supported_status( all_supported_status, pd_modules, pd_module_group, ps_module_group ) - _write_rst(output_rst_file_path, all_supported_status) + return all_supported_status + + +def _get_pd_modules(pd_module_group: Any) -> List[str]: + """ + Get sorted list of pandas member names from a pandas module. + + Parameters + ---------- + pd_module_group : Any + Importable pandas module. + + Returns + ------- + List[str] + Sorted list of member names. + """ + return sorted(m[0] for m in getmembers(pd_module_group, isclass) if not m[0].startswith("_")) + + +def _update_all_supported_status( + all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], + pd_modules: List[str], + pd_module_group: Any, + ps_module_group: Any, +) -> None: + """ + Update the supported status dictionary with status from multiple modules. + + Parameters + ---------- + all_supported_status : Dict[Tuple[str, str], Dict[str, SupportedStatus]] + The dictionary to update with supported statuses. + pd_modules : List[str] + List of module names in pandas. + pd_module_group : Any + Importable pandas module group. + ps_module_group : Any + Corresponding pyspark.pandas module group. 
+ """ + pd_modules.append("") # Include General Function APIs + for module_name in pd_modules: + supported_status = _create_supported_by_module( + module_name, pd_module_group, ps_module_group + ) + if supported_status: + all_supported_status[(module_name, ps_module_group.__name__)] = supported_status def _create_supported_by_module( module_name: str, pd_module_group: Any, ps_module_group: Any ) -> Dict[str, SupportedStatus]: """ - Retrieves supported status of pandas module + Create a dictionary of supported status for a specific pandas module. Parameters ---------- module_name : str - Class name that exists in the path of the module. + Name of the module in pandas. pd_module_group : Any - Specific path of importable pandas module. - ps_module_group: Any - Specific path of importable pyspark.pandas module. + Importable pandas module. + ps_module_group : Any + Corresponding pyspark.pandas module. + + Returns + ------- + Dict[str, SupportedStatus] + Dictionary of supported status for the module. """ pd_module = getattr(pd_module_group, module_name) if module_name else pd_module_group try: @@ -157,7 +220,7 @@ def _organize_by_implementation_status( ps_module_group: Any, ) -> Dict[str, SupportedStatus]: """ - Check the implementation status and parameters of both modules. + Organize functions by implementation status between pandas and pyspark.pandas. Parameters ---------- @@ -171,6 +234,11 @@ def _organize_by_implementation_status( Specific path of importable pandas module. ps_module_group: Any Specific path of importable pyspark.pandas module. + + Returns + ------- + Dict[str, SupportedStatus] + Dictionary of implementation status. """ pd_dict = {} for pd_func_name, pd_func in pd_funcs.items(): @@ -214,7 +282,7 @@ def _transform_missing( ps_module_path: str, ) -> str: """ - Transform missing parameters into table information string. + Transform missing parameters into a formatted string for table display. Parameters ---------- @@ -229,6 +297,11 @@ def _transform_missing( ps_module_path : str Path string of pyspark.pandas module. + Returns + ------- + str + Formatted string representing missing parameters. + Examples -------- >>> _transform_missing("DataFrame", "add", {"axis", "fill_value", "level"}, @@ -251,47 +324,6 @@ def _transform_missing( return missing_str -def _get_pd_modules(pd_module_group: Any) -> List[str]: - """ - Returns sorted pandas member list from pandas module path. - - Parameters - ---------- - pd_module_group : Any - Specific path of importable pandas module. - """ - return sorted([m[0] for m in getmembers(pd_module_group, isclass) if not m[0].startswith("_")]) - - -def _update_all_supported_status( - all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], - pd_modules: List[str], - pd_module_group: Any, - ps_module_group: Any, -) -> None: - """ - Updates supported status across multiple module paths. - - Parameters - ---------- - all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]] - Data that stores the supported status across multiple module paths. - pd_modules: List[str] - Name list of pandas modules. - pd_module_group : Any - Specific path of importable pandas module. - ps_module_group: Any - Specific path of importable pyspark.pandas module. 
- """ - pd_modules += [""] # for General Function APIs - for module_name in pd_modules: - supported_status = _create_supported_by_module( - module_name, pd_module_group, ps_module_group - ) - if supported_status: - all_supported_status[(module_name, ps_module_group.__name__)] = supported_status - - def _write_table( module_name: str, module_path: str, @@ -299,7 +331,18 @@ def _write_table( w_fd: TextIO, ) -> None: """ - Write table by using Sphinx list-table directive. + Write the support status in a table format using Sphinx list-table directive. + + Parameters + ---------- + module_name : str + The name of the module whose support status is being documented. + module_path : str + The import path of the module in the documentation. + supported_status : Dict[str, SupportedStatus] + A dictionary mapping each function name to its support status. + w_fd : TextIO + An open file descriptor where the table will be written. """ lines = [] if module_name: @@ -336,7 +379,17 @@ def _write_table( def _escape_func_str(func_str: str) -> str: """ - Transforms which affecting rst data format. + Escape function names to conform to RST format. + + Parameters + ---------- + func_str : str + Function name to escape. + + Returns + ------- + str + Escaped function name. """ # TODO: Take into account that this function can create links incorrectly # We can create alias links or links to parent methods @@ -351,7 +404,14 @@ def _write_rst( all_supported_status: Dict[Tuple[str, str], Dict[str, SupportedStatus]], ) -> None: """ - Writes the documentation to the target file path. + Write the final RST file with the collected support status. + + Parameters + ---------- + output_rst_file_path : str + Path to the output RST file. + all_supported_status : Dict + Collected support status data. """ with open(output_rst_file_path, "w") as w_fd: w_fd.write(RST_HEADER) From e1a2255f99be88e776295f30f995b339c3e4b5af Mon Sep 17 00:00:00 2001 From: hannahkamundson Date: Mon, 27 Nov 2023 10:38:22 +0800 Subject: [PATCH 14/40] [SPARK-45699][BUILD][CORE][SQL][SS][CONNECT][MLLIB][ML][DSTREAM][GRAPHX][K8S][UI] Fixing all compilation warnings related to widening conversions ### What changes were proposed in this pull request? 1. Change the silencing of the widening conversion compilation warnings in the parent `pom.xml` and `SparkBuild` to throw an error 2. All widening conversion compilation warnings were removed. This almost exclusively involved adding `.toDouble` to longs. However, it also involved some `.toFloat` on ints and longs. ### Why are the changes needed? It allows us to upgrade to Scala 2.13 without adding a bunch of compilation issues. This is removing the following compilation error ```shell [error] /Users/yangjie01/SourceCode/git/spark-mine-sbt/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala:1207:60: Widening conversion from Long to Double is deprecated because it loses precision. Write `.toDouble` instead. [quickfixable] [error] Applicable -Wconf / nowarn filters for this fatal warning: msg=, cat=deprecation, site=org.apache.spark.scheduler.TaskSetManager.checkSpeculatableTasks [error] foundTasks = checkAndSubmitSpeculatableTasks(timeMs, threshold, customizedThreshold = true) ``` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? No tests were added. For every profile (including the base profile), I ran `mvn clean compile test-compile`. I then `grep`ed any lines that had the word `Wide` in it. I determined I was done when no output remained. 
Here is a script of what was run: ```shell mvn clean compile test-compile |& tee output.txt cat output.txt | grep .*Wide.* |& tee output-widening.txt mvn clean compile test-compile -Pspark-ganglia-lgpl |& tee output-spark-ganglia-lgpl.txt cat output-spark-ganglia-lgpl.txt | grep .*Wide.* |& tee output-spark-ganglia-lgpl-widening.txt mvn clean compile test-compile -Pkinesis-asl |& tee output-kinesis-asl.txt cat output-kinesis-asl.txt | grep .*Wide.* |& tee output-kinesis-asl-widening.txt mvn clean compile test-compile -Pdocker-integration-tests |& tee output-docker-integration-tests.txt cat output-docker-integration-tests.txt | grep .*Wide.* |& tee output-docker-integration-tests-widening.txt mvn clean compile test-compile -Pyarn \& tee output-yarn.txt cat output-yarn.txt | grep .*Wide.* |& tee output-yarn-widening.txt mvn clean compile test-compile -Pkubernetes |& tee output-kubernetes.txt cat output-kubernetes.txt | grep .*Wide.* |& tee output-kubernetes-widening.txt mvn clean compile test-compile -Pkubernetes-integration-tests |& tee output-kubernetes-integration-tests.txt cat output-integration-tests.txt | grep .*Wide.* |& tee output-integration-tests-widening.txt mvn clean compile test-compile -Phive-thriftserver |& tee output-hive-thriftserver.txt cat output-thriftserver.txt | grep .*Wide.* |& tee output-thriftserver-widening.txt mvn clean compile test-compile -Phadoop-cloud |& tee output-hadoop-cloud.txt cat output-hadoop-cloud.txt | grep .*Wide.* |& tee output-hadoop-cloud-widening.txt ``` ### Was this patch authored or co-authored using generative AI tooling? No Closes #43890 from hannahkamundson/SPARK-45699. Authored-by: hannahkamundson Signed-off-by: yangjie01 --- .../types/UTF8StringPropertyCheckSuite.scala | 4 +- .../client/arrow/ArrowVectorReader.scala | 6 +-- .../kafka010/DirectKafkaInputDStream.scala | 2 +- .../spark/streaming/kafka010/KafkaRDD.scala | 2 +- .../kafka010/DirectKafkaStreamSuite.scala | 2 +- .../input/FixedLengthBinaryInputFormat.scala | 2 +- .../spark/metrics/sink/StatsdReporter.scala | 4 +- .../apache/spark/partial/CountEvaluator.scala | 5 +- .../spark/partial/GroupedCountEvaluator.scala | 4 +- .../apache/spark/resource/ResourceUtils.scala | 2 +- .../apache/spark/scheduler/MapStatus.scala | 8 +-- .../spark/scheduler/TaskSetManager.scala | 7 +-- .../scala/org/apache/spark/util/Clock.scala | 2 +- .../util/random/StratifiedSamplingUtils.scala | 4 +- .../apache/spark/benchmark/Benchmark.scala | 2 +- .../deploy/history/EventLogTestHelper.scala | 2 +- .../spark/status/AppStatusStoreSuite.scala | 54 ++++++++++--------- .../apache/spark/graphx/lib/SVDPlusPlus.scala | 2 +- .../spark/graphx/lib/PageRankSuite.scala | 2 +- .../GeneralizedLinearRegression.scala | 2 +- .../org/apache/spark/ml/stat/ANOVATest.scala | 2 +- .../org/apache/spark/ml/stat/FValueTest.scala | 2 +- .../spark/mllib/clustering/LDAOptimizer.scala | 2 +- .../mllib/clustering/StreamingKMeans.scala | 2 +- .../spark/mllib/fpm/AssociationRules.scala | 4 +- .../mllib/linalg/distributed/RowMatrix.scala | 5 +- .../correlation/SpearmanCorrelation.scala | 2 +- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../mllib/stat/test/StreamingTestMethod.scala | 2 +- pom.xml | 2 +- project/SparkBuild.scala | 2 +- .../cluster/k8s/ExecutorRollPlugin.scala | 17 +++--- .../plans/logical/basicLogicalOperators.scala | 3 +- .../statsEstimation/EstimationUtils.scala | 4 +- .../sql/catalyst/util/QuantileSummaries.scala | 2 +- .../ui/StreamingQueryStatisticsPage.scala | 16 +++--- .../sql/StatisticsCollectionTestBase.scala | 8 
+-- .../PassThroughEncodingSuite.scala | 2 +- .../streaming/EventTimeWatermarkSuite.scala | 5 +- .../streaming/receiver/RateLimiter.scala | 4 +- .../spark/streaming/ui/StreamingPage.scala | 14 ++--- .../apache/spark/streaming/ui/UIUtils.scala | 8 +-- .../ExecutorAllocationManagerSuite.scala | 4 +- .../scheduler/RateControllerSuite.scala | 2 +- 44 files changed, 124 insertions(+), 110 deletions(-) diff --git a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala index ab488e18ba3f4..75c56451592e4 100644 --- a/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala +++ b/common/unsafe/src/test/scala/org/apache/spark/unsafe/types/UTF8StringPropertyCheckSuite.scala @@ -80,7 +80,9 @@ class UTF8StringPropertyCheckSuite extends AnyFunSuite with ScalaCheckDrivenProp test("compare") { forAll { (s1: String, s2: String) => - assert(Math.signum(toUTF8(s1).compareTo(toUTF8(s2))) === Math.signum(s1.compareTo(s2))) + assert(Math.signum { + toUTF8(s1).compareTo(toUTF8(s2)).toFloat + } === Math.signum(s1.compareTo(s2).toFloat)) } } diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala index 488208574809b..53d8d46e62689 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/arrow/ArrowVectorReader.scala @@ -134,7 +134,7 @@ private[arrow] class SmallIntVectorReader(v: SmallIntVector) private[arrow] class IntVectorReader(v: IntVector) extends TypedArrowVectorReader[IntVector](v) { override def getInt(i: Int): Int = vector.get(i) override def getLong(i: Int): Long = getInt(i) - override def getFloat(i: Int): Float = getInt(i) + override def getFloat(i: Int): Float = getInt(i).toFloat override def getDouble(i: Int): Double = getInt(i) override def getString(i: Int): String = String.valueOf(getInt(i)) override def getJavaDecimal(i: Int): JBigDecimal = JBigDecimal.valueOf(getInt(i)) @@ -143,8 +143,8 @@ private[arrow] class IntVectorReader(v: IntVector) extends TypedArrowVectorReade private[arrow] class BigIntVectorReader(v: BigIntVector) extends TypedArrowVectorReader[BigIntVector](v) { override def getLong(i: Int): Long = vector.get(i) - override def getFloat(i: Int): Float = getLong(i) - override def getDouble(i: Int): Double = getLong(i) + override def getFloat(i: Int): Float = getLong(i).toFloat + override def getDouble(i: Int): Double = getLong(i).toDouble override def getString(i: Int): String = String.valueOf(getLong(i)) override def getJavaDecimal(i: Int): JBigDecimal = JBigDecimal.valueOf(getLong(i)) override def getTimestamp(i: Int): Timestamp = toJavaTimestamp(getLong(i) * MICROS_PER_SECOND) diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala index f5967a74ad339..c412486ce197e 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala @@ -146,7 +146,7 @@ private[spark] class 
DirectKafkaInputDStream[K, V]( val maxRateLimitPerPartition = ppc.maxRatePerPartition(tp) val backpressureRate = lag / totalLag.toDouble * rate tp -> (if (maxRateLimitPerPartition > 0) { - Math.min(backpressureRate, maxRateLimitPerPartition)} else backpressureRate) + Math.min(backpressureRate, maxRateLimitPerPartition.toDouble)} else backpressureRate) } case None => offsets.map { case (tp, offset) => tp -> ppc.maxRatePerPartition(tp).toDouble } } diff --git a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala index 286b073125ff0..6c57091bc3c46 100644 --- a/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala +++ b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala @@ -98,7 +98,7 @@ private[spark] class KafkaRDD[K, V]( if (compacted) { super.countApprox(timeout, confidence) } else { - val c = count() + val c = count().toDouble new PartialResult(new BoundedDouble(c, 1.0, c, c), true) } diff --git a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala index faf114108fac5..28f0906258303 100644 --- a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala +++ b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala @@ -805,7 +805,7 @@ private[streaming] class ConstantEstimator(@volatile private var rate: Long) time: Long, elements: Long, processingDelay: Long, - schedulingDelay: Long): Option[Double] = Some(rate) + schedulingDelay: Long): Option[Double] = Some(rate.toDouble) } private[streaming] class ConstantRateController(id: Int, estimator: RateEstimator, rate: Long) diff --git a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala index 978afaffab30b..4897cf694ae8e 100644 --- a/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala +++ b/core/src/main/scala/org/apache/spark/input/FixedLengthBinaryInputFormat.scala @@ -74,7 +74,7 @@ private[spark] class FixedLengthBinaryInputFormat if (defaultSize < recordLength) { recordLength.toLong } else { - (Math.floor(defaultSize / recordLength) * recordLength).toLong + defaultSize / recordLength * recordLength } } diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala index 877f04b1adc01..189d390d37999 100644 --- a/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala +++ b/core/src/main/scala/org/apache/spark/metrics/sink/StatsdReporter.scala @@ -124,9 +124,9 @@ private[spark] class StatsdReporter( private def reportTimer(name: String, timer: Timer)(implicit socket: DatagramSocket): Unit = { val snapshot = timer.getSnapshot - send(fullName(name, "max"), format(convertDuration(snapshot.getMax)), TIMER) + send(fullName(name, "max"), format(convertDuration(snapshot.getMax.toDouble)), TIMER) send(fullName(name, "mean"), format(convertDuration(snapshot.getMean)), TIMER) - send(fullName(name, "min"), format(convertDuration(snapshot.getMin)), TIMER) + send(fullName(name, "min"), format(convertDuration(snapshot.getMin.toDouble)), TIMER) send(fullName(name, "stddev"), 
format(convertDuration(snapshot.getStdDev)), TIMER) send(fullName(name, "p50"), format(convertDuration(snapshot.getMedian)), TIMER) send(fullName(name, "p75"), format(convertDuration(snapshot.get75thPercentile)), TIMER) diff --git a/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala index cbee136871012..a974ca2f1a05b 100644 --- a/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/CountEvaluator.scala @@ -35,7 +35,7 @@ private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double) override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { - new BoundedDouble(sum, 1.0, sum, sum) + new BoundedDouble(sum.toDouble, 1.0, sum.toDouble, sum.toDouble) } else if (outputsMerged == 0 || sum == 0) { new BoundedDouble(0, 0.0, 0.0, Double.PositiveInfinity) } else { @@ -57,7 +57,8 @@ private[partial] object CountEvaluator { val low = dist.inverseCumulativeProbability((1 - confidence) / 2) val high = dist.inverseCumulativeProbability((1 + confidence) / 2) // Add 'sum' to each because distribution is just of remaining count, not observed - new BoundedDouble(sum + dist.getNumericalMean, confidence, sum + low, sum + high) + new BoundedDouble( + sum + dist.getNumericalMean, confidence, (sum + low).toDouble, (sum + high).toDouble) } diff --git a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala index d2b4187df5d50..7cd60815fadbe 100644 --- a/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala +++ b/core/src/main/scala/org/apache/spark/partial/GroupedCountEvaluator.scala @@ -41,7 +41,9 @@ private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, conf override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { - sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap + sums.map { case (key, sum) => + (key, new BoundedDouble(sum.toDouble, 1.0, sum.toDouble, sum.toDouble)) + }.toMap } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { diff --git a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala index 00c655f4a4f4d..fe08e8337f76f 100644 --- a/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala +++ b/core/src/main/scala/org/apache/spark/resource/ResourceUtils.scala @@ -476,7 +476,7 @@ private[spark] object ResourceUtils extends Logging { if (maxTaskPerExec < (execAmount * numParts / taskAmount)) { val origTaskAmount = treq.amount val taskReqStr = s"${origTaskAmount}/${numParts}" - val resourceNumSlots = Math.floor(execAmount * numParts / taskAmount).toInt + val resourceNumSlots = (execAmount * numParts / taskAmount).toInt val message = s"The configuration of resource: ${treq.resourceName} " + s"(exec = ${execAmount}, task = ${taskReqStr}, " + s"runnable tasks = ${resourceNumSlots}) will " + diff --git a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala index d10cf55ed0d10..113521453ad7b 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/MapStatus.scala @@ -95,7 +95,7 @@ private[spark] object MapStatus { } else if (size <= 1L) { 1 } else { - math.min(255, 
math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte + math.min(255, math.ceil(math.log(size.toDouble) / math.log(LOG_BASE)).toInt).toByte } } @@ -276,12 +276,12 @@ private[spark] object HighlyCompressedMapStatus { val skewSizeThreshold = Math.max( medianSize * accurateBlockSkewedFactor, - sortedSizes(totalNumBlocks - maxAccurateSkewedBlockNumber) + sortedSizes(totalNumBlocks - maxAccurateSkewedBlockNumber).toDouble ) - Math.min(shuffleAccurateBlockThreshold, skewSizeThreshold) + Math.min(shuffleAccurateBlockThreshold.toDouble, skewSizeThreshold) } else { // Disable skew detection if accurateBlockSkewedFactor <= 0 - shuffleAccurateBlockThreshold + shuffleAccurateBlockThreshold.toDouble } val hugeBlockSizes = mutable.Map.empty[Int, Byte] diff --git a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala index 6157a3e46c875..d17e6735c4ecf 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/TaskSetManager.scala @@ -809,7 +809,7 @@ private[spark] class TaskSetManager( info.markFinished(TaskState.FINISHED, clock.getTimeMillis()) if (speculationEnabled) { - successfulTaskDurations.insert(info.duration) + successfulTaskDurations.insert(info.duration.toDouble) taskProcessRateCalculator.foreach(_.updateAvgTaskProcessRate(tid, result)) } removeRunningTask(tid) @@ -1196,7 +1196,7 @@ private[spark] class TaskSetManager( val timeMs = clock.getTimeMillis() if (numSuccessfulTasks >= minFinishedForSpeculation) { val medianDuration = successfulTaskDurations.percentile() - val threshold = max(speculationMultiplier * medianDuration, minTimeToSpeculation) + val threshold = max(speculationMultiplier * medianDuration, minTimeToSpeculation.toDouble) // TODO: Threshold should also look at standard deviation of task durations and have a lower // bound based on that. logDebug("Task length threshold for speculation: " + threshold) @@ -1204,7 +1204,8 @@ private[spark] class TaskSetManager( } else if (isSpeculationThresholdSpecified && speculationTasksLessEqToSlots) { val threshold = speculationTaskDurationThresOpt.get logDebug(s"Tasks taking longer time than provided speculation threshold: $threshold") - foundTasks = checkAndSubmitSpeculatableTasks(timeMs, threshold, customizedThreshold = true) + foundTasks = checkAndSubmitSpeculatableTasks( + timeMs, threshold.toDouble, customizedThreshold = true) } // avoid more warning logs. 
if (foundTasks) { diff --git a/core/src/main/scala/org/apache/spark/util/Clock.scala b/core/src/main/scala/org/apache/spark/util/Clock.scala index 226f15d3d38c2..e0cb3f4188e6d 100644 --- a/core/src/main/scala/org/apache/spark/util/Clock.scala +++ b/core/src/main/scala/org/apache/spark/util/Clock.scala @@ -85,7 +85,7 @@ private[spark] class SystemClock extends Clock { return currentTime } - val pollTime = math.max(waitTime / 10.0, minPollTime).toLong + val pollTime = math.max(waitTime / 10.0, minPollTime.toDouble).toLong while (true) { currentTime = System.currentTimeMillis() diff --git a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala index f08cf44e4e12b..08e2ea01f623e 100644 --- a/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala +++ b/core/src/main/scala/org/apache/spark/util/random/StratifiedSamplingUtils.scala @@ -98,8 +98,8 @@ private[spark] object StratifiedSamplingUtils extends Logging { if (acceptResult.areBoundsEmpty) { val n = counts.get(key) val sampleSize = math.ceil(n * fraction).toLong - val lmbd1 = PoissonBounds.getLowerBound(sampleSize) - val lmbd2 = PoissonBounds.getUpperBound(sampleSize) + val lmbd1 = PoissonBounds.getLowerBound(sampleSize.toDouble) + val lmbd2 = PoissonBounds.getUpperBound(sampleSize.toDouble) acceptResult.acceptBound = lmbd1 / n acceptResult.waitListBound = (lmbd2 - lmbd1) / n } diff --git a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala index 0b33e2a9426ce..e7315d6119be0 100644 --- a/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala +++ b/core/src/test/scala/org/apache/spark/benchmark/Benchmark.scala @@ -163,7 +163,7 @@ private[spark] class Benchmark( // scalastyle:on assert(runTimes.nonEmpty) val best = runTimes.min - val avg = runTimes.sum / runTimes.size + val avg = runTimes.sum.toDouble / runTimes.size val stdev = if (runTimes.size > 1) { math.sqrt(runTimes.map(time => (time - avg) * (time - avg)).sum / (runTimes.size - 1)) } else 0 diff --git a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala index ac89f60955eed..0161917f8853d 100644 --- a/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala +++ b/core/src/test/scala/org/apache/spark/deploy/history/EventLogTestHelper.scala @@ -56,7 +56,7 @@ object EventLogTestHelper { eventStr: String, desiredSize: Long): Seq[String] = { val stringLen = eventStr.getBytes(StandardCharsets.UTF_8).length - val repeatCount = Math.floor(desiredSize / stringLen).toInt + val repeatCount = (desiredSize / stringLen).toInt (0 until repeatCount).map { _ => writer.writeEvent(eventStr, flushLogger = true) eventStr diff --git a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala index ccf6c9184cc96..f2b795764b7e8 100644 --- a/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala +++ b/core/src/test/scala/org/apache/spark/status/AppStatusStoreSuite.scala @@ -170,40 +170,44 @@ class AppStatusStoreSuite extends SparkFunSuite { assert(actualQuantiles === expectedQuantiles) } - assertQuantiles(_.executorDeserializeTime, summary.executorDeserializeTime) - assertQuantiles(_.executorDeserializeCpuTime, summary.executorDeserializeCpuTime) - 
assertQuantiles(_.executorRunTime, summary.executorRunTime) - assertQuantiles(_.executorRunTime, summary.executorRunTime) - assertQuantiles(_.executorCpuTime, summary.executorCpuTime) - assertQuantiles(_.resultSize, summary.resultSize) - assertQuantiles(_.jvmGCTime, summary.jvmGcTime) - assertQuantiles(_.resultSerializationTime, summary.resultSerializationTime) - assertQuantiles(_.memoryBytesSpilled, summary.memoryBytesSpilled) - assertQuantiles(_.diskBytesSpilled, summary.diskBytesSpilled) - assertQuantiles(_.peakExecutionMemory, summary.peakExecutionMemory) - assertQuantiles(_.inputMetrics.bytesRead, summary.inputMetrics.bytesRead) - assertQuantiles(_.inputMetrics.recordsRead, summary.inputMetrics.recordsRead) - assertQuantiles(_.outputMetrics.bytesWritten, summary.outputMetrics.bytesWritten) - assertQuantiles(_.outputMetrics.recordsWritten, summary.outputMetrics.recordsWritten) - assertQuantiles(_.shuffleReadMetrics.remoteBlocksFetched, + assertQuantiles(_.executorDeserializeTime.toDouble, summary.executorDeserializeTime) + assertQuantiles(_.executorDeserializeCpuTime.toDouble, summary.executorDeserializeCpuTime) + assertQuantiles(_.executorRunTime.toDouble, summary.executorRunTime) + assertQuantiles(_.executorRunTime.toDouble, summary.executorRunTime) + assertQuantiles(_.executorCpuTime.toDouble, summary.executorCpuTime) + assertQuantiles(_.resultSize.toDouble, summary.resultSize) + assertQuantiles(_.jvmGCTime.toDouble, summary.jvmGcTime) + assertQuantiles(_.resultSerializationTime.toDouble, summary.resultSerializationTime) + assertQuantiles(_.memoryBytesSpilled.toDouble, summary.memoryBytesSpilled) + assertQuantiles(_.diskBytesSpilled.toDouble, summary.diskBytesSpilled) + assertQuantiles(_.peakExecutionMemory.toDouble, summary.peakExecutionMemory) + assertQuantiles(_.inputMetrics.bytesRead.toDouble, summary.inputMetrics.bytesRead) + assertQuantiles(_.inputMetrics.recordsRead.toDouble, summary.inputMetrics.recordsRead) + assertQuantiles(_.outputMetrics.bytesWritten.toDouble, summary.outputMetrics.bytesWritten) + assertQuantiles(_.outputMetrics.recordsWritten.toDouble, + summary.outputMetrics.recordsWritten) + assertQuantiles(_.shuffleReadMetrics.remoteBlocksFetched.toDouble, summary.shuffleReadMetrics.remoteBlocksFetched) - assertQuantiles(_.shuffleReadMetrics.localBlocksFetched, + assertQuantiles(_.shuffleReadMetrics.localBlocksFetched.toDouble, summary.shuffleReadMetrics.localBlocksFetched) - assertQuantiles(_.shuffleReadMetrics.fetchWaitTime, + assertQuantiles(_.shuffleReadMetrics.fetchWaitTime.toDouble, summary.shuffleReadMetrics.fetchWaitTime) - assertQuantiles(_.shuffleReadMetrics.remoteBytesRead, + assertQuantiles(_.shuffleReadMetrics.remoteBytesRead.toDouble, summary.shuffleReadMetrics.remoteBytesRead) - assertQuantiles(_.shuffleReadMetrics.remoteBytesReadToDisk, + assertQuantiles(_.shuffleReadMetrics.remoteBytesReadToDisk.toDouble, summary.shuffleReadMetrics.remoteBytesReadToDisk) assertQuantiles( - t => t.shuffleReadMetrics.localBytesRead + t.shuffleReadMetrics.remoteBytesRead, + t => t.shuffleReadMetrics.localBytesRead + t.shuffleReadMetrics.remoteBytesRead.toDouble, summary.shuffleReadMetrics.readBytes) assertQuantiles( - t => t.shuffleReadMetrics.localBlocksFetched + t.shuffleReadMetrics.remoteBlocksFetched, + t => t.shuffleReadMetrics.localBlocksFetched + + t.shuffleReadMetrics.remoteBlocksFetched.toDouble, summary.shuffleReadMetrics.totalBlocksFetched) - assertQuantiles(_.shuffleWriteMetrics.bytesWritten, summary.shuffleWriteMetrics.writeBytes) - 
assertQuantiles(_.shuffleWriteMetrics.writeTime, summary.shuffleWriteMetrics.writeTime) - assertQuantiles(_.shuffleWriteMetrics.recordsWritten, + assertQuantiles(_.shuffleWriteMetrics.bytesWritten.toDouble, + summary.shuffleWriteMetrics.writeBytes) + assertQuantiles(_.shuffleWriteMetrics.writeTime.toDouble, + summary.shuffleWriteMetrics.writeTime) + assertQuantiles(_.shuffleWriteMetrics.recordsWritten.toDouble, summary.shuffleWriteMetrics.writeRecords) } finally { appStore.close() diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index d7099c5c953c1..bc6fab45810eb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -87,7 +87,7 @@ object SVDPlusPlus { val gJoinT0 = g.outerJoinVertices(t0) { (vid: VertexId, vd: (Array[Double], Array[Double], Double, Double), msg: Option[(Long, Double)]) => - (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1)) + (vd._1, vd._2, msg.get._2 / msg.get._1 - u, 1.0 / scala.math.sqrt(msg.get._1.toDouble)) }.cache() materialize(gJoinT0) g.unpersist() diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala index caa2fdcdf5d2b..666790958c353 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala @@ -321,7 +321,7 @@ class PageRankSuite extends SparkFunSuite with LocalSparkContext { val rank = if (vid < source) { 0.0 } else { - a * Math.pow(1 - resetProb, vid - source) + a * Math.pow(1 - resetProb, vid.toDouble - source) } vid -> rank } diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 6e26a78e9c7e6..aa39a3e177eeb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -1418,7 +1418,7 @@ class GeneralizedLinearRegressionSummary private[regression] ( case Row(label: Double, pred: Double, weight: Double) => (label, pred, weight) } - family.aic(t, deviance, numInstances, weightSum) + 2 * rank + family.aic(t, deviance, numInstances.toDouble, weightSum) + 2 * rank } } diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala index d7b13f1bf25f3..482bb7fdc2105 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/ANOVATest.scala @@ -224,7 +224,7 @@ private[ml] object ANOVATest { // mean square within val msw = sswn / dfwn val fValue = msb / msw - val pValue = 1 - new FDistribution(dfbn, dfwn).cumulativeProbability(fValue) + val pValue = 1 - new FDistribution(dfbn.toDouble, dfwn.toDouble).cumulativeProbability(fValue) val degreesOfFreedom = dfbn + dfwn (pValue, degreesOfFreedom, fValue) } diff --git a/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala b/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala index 89579dfcbb0c3..e2ce6cf7214f7 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/stat/FValueTest.scala @@ -135,7 +135,7 @@ 
private[ml] object FValueTest { } else Iterator.empty }.reduceByKey(_ + _ ).mapPartitions { iter => - val fd = new FDistribution(1, degreesOfFreedom) + val fd = new FDistribution(1.0, degreesOfFreedom.toDouble) iter.map { case (col, sumForCov) => // Cov(X,Y) = Sum(((Xi - Avg(X)) * ((Yi-Avg(Y))) / (N-1) val covariance = sumForCov / (numSamples - 1) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index dbcf9017f1748..234ecbc460638 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -525,7 +525,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer with Logging { updateLambda(batchResult, batchSize) logphatOption.foreach(_ /= nonEmptyDocsN.toDouble) - logphatOption.foreach(updateAlpha(_, nonEmptyDocsN)) + logphatOption.foreach(updateAlpha(_, nonEmptyDocsN.toDouble)) this } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala index ed6e3ea966b26..17b28ed3eba5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/StreamingKMeans.scala @@ -106,7 +106,7 @@ class StreamingKMeansModel @Since("1.2.0") ( val numNewPoints = pointStats.iterator.map { case (_, (_, n)) => n }.sum - math.pow(decayFactor, numNewPoints) + math.pow(decayFactor, numNewPoints.toDouble) } // apply discount to weights diff --git a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala index 06c7754691953..79f482347289a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/fpm/AssociationRules.scala @@ -91,8 +91,8 @@ class AssociationRules private[fpm] ( .map { case (antecedent, ((consequent, freqUnion), freqAntecedent)) => new Rule(antecedent.toArray, consequent.toArray, - freqUnion, - freqAntecedent, + freqUnion.toDouble, + freqAntecedent.toDouble, // the consequent contains always only one element itemSupport.get(consequent.head)) }.filter(_.confidence >= minConfidence) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 2bd4877ffc72e..37bf9d45f6646 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -633,7 +633,7 @@ class RowMatrix @Since("1.0.0") ( val gamma = if (threshold < 1e-6) { Double.PositiveInfinity } else { - 10 * math.log(numCols()) / threshold + 10 * math.log(numCols().toDouble) / threshold } val summary = Statistics.colStats(rows.map((_, 1.0)), Seq("normL2")) @@ -823,7 +823,8 @@ class RowMatrix @Since("1.0.0") ( + s"as it's bigger than maxResultSize ($maxDriverResultSizeInBytes Bytes)") val numerator = math.log(rows.getNumPartitions) - val denominator = math.log(maxDriverResultSizeInBytes) - math.log(aggregatedObjectSizeInBytes) + val denominator = math.log(maxDriverResultSizeInBytes.toDouble) - + math.log(aggregatedObjectSizeInBytes.toDouble) val desiredTreeDepth = math.ceil(numerator / denominator) if (desiredTreeDepth > 4) { diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala index aa0bf51ebcd25..28c2b5d5027ab 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/correlation/SpearmanCorrelation.scala @@ -70,7 +70,7 @@ private[stat] object SpearmanCorrelation extends Correlation with Logging { val output = flush() preCol = j preVal = v - startRank = rank + startRank = rank.toDouble cachedUids += uid output } else { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index ead9f887fe811..d42df3e2f0ddf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -201,7 +201,7 @@ private[spark] object ChiSqTest extends Logging { counts.foreach { case ((label, value), c) => val i = value2Index(value) val j = label2Index(label) - contingency.update(i, j, c) + contingency.update(i, j, c.toDouble) } ChiSqTest.chiSquaredMatrix(contingency, methodName) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala index 8f3d0f8b3214c..cf0fd388fa749 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/StreamingTestMethod.scala @@ -131,7 +131,7 @@ private[stat] object StudentTTest extends StreamingTestMethod with Logging { statsA: StatCounter, statsB: StatCounter): StreamingTestResult = { def studentDF(sample1: StatisticalSummaryValues, sample2: StatisticalSummaryValues): Double = - sample1.getN + sample2.getN - 2 + sample1.getN + sample2.getN - 2.0 new StreamingTestResult( tTester.get.homoscedasticTTest(statsA, statsB), diff --git a/pom.xml b/pom.xml index ac096a19804db..6ed16d88b0dc4 100644 --- a/pom.xml +++ b/pom.xml @@ -2978,7 +2978,7 @@ TODO(SPARK-33805): Undo the corresponding deprecated usage suppression rule after fixed. --> -Wconf:msg=^(?=.*?method|value|type|object|trait|inheritance)(?=.*?deprecated)(?=.*?since 2.13).+$:e - -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:s + -Wconf:msg=^(?=.*?Widening conversion from)(?=.*?is deprecated because it loses precision).+$:e -Wconf:cat=deprecation&msg=Auto-application to \`\(\)\` is deprecated:e 2.6 - 3.13.0 + 3.14.0 2.11.1 4.1.17 From 86971665ed4786a5d4269c9371d3e2d6751a49d2 Mon Sep 17 00:00:00 2001 From: jdesjean Date: Tue, 28 Nov 2023 08:31:53 +0900 Subject: [PATCH 25/40] [SPARK-45957][CONNECT] Avoid generating execution plan for non-executable commands ### What changes were proposed in this pull request? Remove the metric response for non executable commands (and the executedPlan generation) ### Why are the changes needed? SQL command can be of 2 types: 1) Executable (i.e. `show tables`). They are eagerly executed and return a response. The execution can generate metrics that should be returned to the user. 2) Non executable. They are lazy and are not executed. As such they should not generate metrics. We currently generate a executedPlan for both command & relations to attach the metrics. 
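As a rough illustration of the two cases (hypothetical user code assuming an existing `spark` session, not part of this patch):

```scala
// Executable command: eagerly run on the server, so execution metrics exist
// and can be attached to the SqlCommandResult.
val tables = spark.sql("SHOW TABLES")

// Non-executable relation: lazy, so nothing should be planned or run here and
// there are no metrics to report at SQL-command time.
val df = spark.sql("SELECT 1")
df.collect() // execution (and its metrics) only happens when an action runs
```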
This is a performance concern for relations as generating the optimized & physical can take some time. Furthermore, streaming SQL relations cannot generate a physical plan in the same way (i.e. they need to use read / write stream). ### Does this PR introduce _any_ user-facing change? Yes, SQL non-executable commands will no longer return a metric response. This is a backward compatible change. ### How was this patch tested? Unit ### Was this patch authored or co-authored using generative AI tooling? No Closes #43851 from jdesjean/SPARK-45957. Authored-by: jdesjean Signed-off-by: Hyukjin Kwon --- .../connect/planner/SparkConnectPlanner.scala | 21 ++++++++++++---- .../sql/connect/utils/MetricGenerator.scala | 6 +++++ .../sql/tests/streaming/test_streaming.py | 24 +++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 3ac093b5e0b42..abfc063139056 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2563,6 +2563,8 @@ class SparkConnectPlanner( // To avoid explicit handling of the result on the client, we build the expected input // of the relation on the server. The client has to simply forward the result. val result = SqlCommandResult.newBuilder() + // Only filled when isCommand + val metrics = ExecutePlanResponse.Metrics.newBuilder() if (isCommand) { // Convert the results to Arrow. val schema = df.schema @@ -2596,10 +2598,10 @@ class SparkConnectPlanner( proto.LocalRelation .newBuilder() .setData(ByteString.copyFrom(bytes)))) + metrics.addAllMetrics(MetricGenerator.transformPlan(df).asJava) } else { - // Trigger assertExecutedPlanPrepared to ensure post ReadyForExecution before finished - // executedPlan is currently called by createMetricsResponse below - df.queryExecution.assertExecutedPlanPrepared() + // No execution triggered for relations. Manually set ready + tracker.setReadyForExecution() result.setRelation( proto.Relation .newBuilder() @@ -2622,8 +2624,17 @@ class SparkConnectPlanner( .setSqlCommandResult(result) .build()) - // Send Metrics - responseObserver.onNext(MetricGenerator.createMetricsResponse(sessionHolder, df)) + // Send Metrics when isCommand (i.e. show tables) which is eagerly executed & has metrics + // Skip metrics when !isCommand (i.e. 
select 1) which is not executed & doesn't have metrics + if (isCommand) { + responseObserver.onNext( + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionHolder.sessionId) + .setServerSideSessionId(sessionHolder.serverSessionId) + .setMetrics(metrics.build) + .build) + } } private def handleRegisterUserDefinedFunction( diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala index c9bba653e8a8f..e2e4128311871 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/utils/MetricGenerator.scala @@ -51,6 +51,12 @@ private[connect] object MetricGenerator extends AdaptiveSparkPlanHelper { allChildren(p).flatMap(c => transformPlan(c, p.id)) } + private[connect] def transformPlan( + rows: DataFrame): Seq[ExecutePlanResponse.Metrics.MetricObject] = { + val executedPlan = rows.queryExecution.executedPlan + transformPlan(executedPlan, executedPlan.id) + } + private def transformPlan( p: SparkPlan, parentId: Int): Seq[ExecutePlanResponse.Metrics.MetricObject] = { diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index a905a87a3b4d6..2b9072c34befe 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -382,6 +382,30 @@ def test_streaming_write_to_table(self): result = self.spark.sql("SELECT value FROM output_table").collect() self.assertTrue(len(result) > 0) + def test_streaming_with_temporary_view(self): + """ + This verifies createOrReplaceTempView() works with a streaming dataframe. An SQL + SELECT query on such a table results in a streaming dataframe and the streaming query works + as expected. + """ + with self.table("input_table", "this_query"): + self.spark.sql("CREATE TABLE input_table (value string) USING parquet") + self.spark.sql("INSERT INTO input_table VALUES ('a'), ('b'), ('c')") + df = self.spark.readStream.table("input_table") + self.assertTrue(df.isStreaming) + # Create a temp view + df.createOrReplaceTempView("test_view") + # Create a select query + view_df = self.spark.sql("SELECT CONCAT('view_', value) as vv from test_view") + self.assertTrue(view_df.isStreaming) + q = view_df.writeStream.format("memory").queryName("this_query").start() + q.processAllAvailable() + q.stop() + result = self.spark.sql("SELECT * FROM this_query ORDER BY vv").collect() + self.assertEqual( + set([Row(value="view_a"), Row(value="view_b"), Row(value="view_c")]), set(result) + ) + class StreamingTests(StreamingTestsMixin, ReusedSQLTestCase): pass From 11ac856919815f7ef2e534e205d1ed83398de136 Mon Sep 17 00:00:00 2001 From: panbingkun Date: Tue, 28 Nov 2023 08:46:39 +0900 Subject: [PATCH 26/40] [SPARK-46103][FOLLOWUP] Keep Sphinx version consistency in spark-rm ### What changes were proposed in this pull request? The pr aims to keep Sphinx version consistency in `spark-rm`. ### Why are the changes needed? To avoid unexpected behavior in published documents. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually test. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44032 from panbingkun/SPARK-46103_FOLLOWUP. 
Authored-by: panbingkun Signed-off-by: Hyukjin Kwon --- dev/create-release/spark-rm/Dockerfile | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/dev/create-release/spark-rm/Dockerfile b/dev/create-release/spark-rm/Dockerfile index dbb851d74a565..9cfe78570421e 100644 --- a/dev/create-release/spark-rm/Dockerfile +++ b/dev/create-release/spark-rm/Dockerfile @@ -37,12 +37,7 @@ ENV DEBCONF_NONINTERACTIVE_SEEN true # These arguments are just for reuse and not really meant to be customized. ARG APT_INSTALL="apt-get install --no-install-recommends -y" -# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. -# See also https://github.com/sphinx-doc/sphinx/issues/7551. -# We should use the latest Sphinx version once this is fixed. -# TODO(SPARK-35375): Jinja2 3.0.0+ causes error when building with Sphinx. -# See also https://issues.apache.org/jira/browse/SPARK-35375. -ARG PIP_PKGS="sphinx==3.0.4 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.8.0 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==2.11.3 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==3.0.0 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.59.3 protobuf==4.21.6 grpcio-status==1.59.3 googleapis-common-protos==1.56.4" +ARG PIP_PKGS="sphinx==4.2.0 mkdocs==1.1.2 numpy==1.20.3 pydata_sphinx_theme==0.13.3 ipython==7.19.0 nbsphinx==0.8.0 numpydoc==1.1.0 jinja2==3.1.2 twine==3.4.1 sphinx-plotly-directive==0.1.3 sphinx-copybutton==0.5.2 pandas==1.5.3 pyarrow==3.0.0 plotly==5.4.0 markupsafe==2.0.1 docutils<0.17 grpcio==1.59.3 protobuf==4.21.6 grpcio-status==1.59.3 googleapis-common-protos==1.56.4" ARG GEM_PKGS="bundler:2.3.8" # Install extra needed repos and refresh. From 2430e87ac93952ae7e296faf49734f65af29f9ed Mon Sep 17 00:00:00 2001 From: Max Gekk Date: Tue, 28 Nov 2023 08:47:46 +0900 Subject: [PATCH 27/40] [SPARK-46115][SQL] Restrict charsets in `encode()` ### What changes were proposed in this pull request? In the PR, I propose to restrict the supported charsets in the `encode()` functions by the list from [the doc](https://spark.apache.org/docs/latest/api/sql/#encode): ``` 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16' ``` and introduce the SQL config `spark.sql.legacy.javaCharsets` for restoring the previous behaviour. ### Why are the changes needed? Currently the list of supported charsets in `encode()` is not stable and fully depends on the used JDK version. So, sometimes user code might not work because a devop changed Java version in Spark cluster. ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? By running new checks: ``` $ PYSPARK_PYTHON=python3 build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z string-functions.sql" ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44020 from MaxGekk/restrict-charsets-in-encode-2. 
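For illustration, a short hypothetical spark-shell session showing the resulting behaviour (assuming an existing `spark` session; not taken from the patch itself):

```scala
// One of the six documented charsets: accepted with the default settings.
spark.sql("SELECT encode('hello', 'UTF-8')").show()

// Any other JDK charset now raises an invalid-charset error by default.
spark.sql("SELECT encode('hello', 'WINDOWS-1252')").show()

// Setting the new legacy config restores the old JDK-charset behaviour.
spark.conf.set("spark.sql.legacy.javaCharsets", "true")
spark.sql("SELECT encode('hello', 'WINDOWS-1252')").show()
```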
Authored-by: Max Gekk Signed-off-by: Hyukjin Kwon --- .../explain-results/function_encode.explain | 2 +- .../function_to_binary_with_format.explain | 2 +- docs/sql-migration-guide.md | 1 + .../sql/tests/pandas/test_pandas_map.py | 2 +- .../expressions/stringExpressions.scala | 25 ++++++- .../apache/spark/sql/internal/SQLConf.scala | 11 ++++ .../ansi/string-functions.sql.out | 54 +++++++++++++-- .../analyzer-results/string-functions.sql.out | 54 +++++++++++++-- .../typeCoercion/native/concat.sql.out | 18 ++--- .../typeCoercion/native/elt.sql.out | 8 +-- .../sql-tests/inputs/string-functions.sql | 6 ++ .../results/ansi/string-functions.sql.out | 66 +++++++++++++++++++ .../results/string-functions.sql.out | 66 +++++++++++++++++++ .../org/apache/spark/sql/ExplainSuite.scala | 8 +-- 14 files changed, 288 insertions(+), 35 deletions(-) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain index 56da919abf4c5..2f65436059230 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8) AS encode(g, UTF-8)#0] +Project [encode(g#0, UTF-8, false) AS encode(g, UTF-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain index e9513f0103c81..b62ccccc0c15e 100644 --- a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_binary_with_format.explain @@ -1,2 +1,2 @@ -Project [encode(g#0, UTF-8) AS to_binary(g, utf-8)#0] +Project [encode(g#0, UTF-8, false) AS to_binary(g, utf-8)#0] +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 5c00ce6558513..664bccf26651b 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -29,6 +29,7 @@ license: | - Since Spark 4.0, `spark.sql.hive.metastore` drops the support of Hive prior to 2.0.0 as they require JDK 8 that Spark does not support anymore. Users should migrate to higher versions. - Since Spark 4.0, `spark.sql.parquet.compression.codec` drops the support of codec name `lz4raw`, please use `lz4_raw` instead. - Since Spark 4.0, when overflowing during casting timestamp to byte/short/int under non-ansi mode, Spark will return null instead a wrapping value. +- Since Spark 4.0, the `encode()` function supports only the following charsets 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'. To restore the previous behavior when the function accepts charsets of the current JDK used by Spark, set `spark.sql.legacy.javaCharsets` to `true`. 
## Upgrading from Spark SQL 3.4 to 3.5 diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py b/python/pyspark/sql/tests/pandas/test_pandas_map.py index 304b78049b20f..ec9f208d08f9b 100644 --- a/python/pyspark/sql/tests/pandas/test_pandas_map.py +++ b/python/pyspark/sql/tests/pandas/test_pandas_map.py @@ -110,7 +110,7 @@ def func(iterator): df = ( self.spark.range(10, numPartitions=3) .select(col("id").cast("string").alias("str")) - .withColumn("bin", encode(col("str"), "utf8")) + .withColumn("bin", encode(col("str"), "utf-8")) ) actual = df.mapInPandas(func, "str string, bin binary").collect() expected = df.collect() diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 0d3239423b22c..90cfd13875d0c 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2685,18 +2685,26 @@ case class StringDecode(bin: Expression, charset: Expression) since = "1.5.0", group = "string_funcs") // scalastyle:on line.size.limit -case class Encode(value: Expression, charset: Expression) +case class Encode(value: Expression, charset: Expression, legacyCharsets: Boolean) extends BinaryExpression with ImplicitCastInputTypes with NullIntolerant { + def this(value: Expression, charset: Expression) = + this(value, charset, SQLConf.get.legacyJavaCharsets) + override def left: Expression = value override def right: Expression = charset override def dataType: DataType = BinaryType override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + private val supportedCharsets = Set( + "US-ASCII", "ISO-8859-1", "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-16") + protected override def nullSafeEval(input1: Any, input2: Any): Any = { val toCharset = input2.asInstanceOf[UTF8String].toString try { - input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + if (legacyCharsets || supportedCharsets.contains(toCharset.toUpperCase(Locale.ROOT))) { + input1.asInstanceOf[UTF8String].toString.getBytes(toCharset) + } else throw new UnsupportedEncodingException } catch { case _: UnsupportedEncodingException => throw QueryExecutionErrors.invalidCharsetError(prettyName, toCharset) @@ -2706,10 +2714,17 @@ case class Encode(value: Expression, charset: Expression) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen(ctx, ev, (string, charset) => { val toCharset = ctx.freshName("toCharset") + val sc = JavaCode.global( + ctx.addReferenceObj("supportedCharsets", supportedCharsets), + supportedCharsets.getClass) s""" String $toCharset = $charset.toString(); try { - ${ev.value} = $string.toString().getBytes($toCharset); + if ($legacyCharsets || $sc.contains($toCharset.toUpperCase(java.util.Locale.ROOT))) { + ${ev.value} = $string.toString().getBytes($toCharset); + } else { + throw new java.io.UnsupportedEncodingException(); + } } catch (java.io.UnsupportedEncodingException e) { throw QueryExecutionErrors.invalidCharsetError("$prettyName", $toCharset); }""" @@ -2720,6 +2735,10 @@ case class Encode(value: Expression, charset: Expression) newLeft: Expression, newRight: Expression): Encode = copy(value = newLeft, charset = newRight) } +object Encode { + def apply(value: Expression, charset: Expression): Encode = new Encode(value, charset) +} + /** * Converts the input expression to a binary value 
based on the supplied format. */ diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 5133c40bc6faa..d4e5c6a3d1e04 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4584,6 +4584,15 @@ object SQLConf { .checkValue(_ > 0, "The number of stack traces in the DataFrame context must be positive.") .createWithDefault(1) + val LEGACY_JAVA_CHARSETS = buildConf("spark.sql.legacy.javaCharsets") + .internal() + .doc("When set to true, the functions like `encode()` can use charsets from JDK while " + + "encoding or decoding string values. If it is false, such functions support only one of " + + "the charsets: 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'.") + .version("4.0.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * @@ -5474,6 +5483,8 @@ class SQLConf extends Serializable with Logging with SqlApiConf { def stackTracesInDataFrameContext: Int = getConf(SQLConf.STACK_TRACES_IN_DATAFRAME_CONTEXT) + def legacyJavaCharsets: Boolean = getConf(SQLConf.LEGACY_JAVA_CHARSETS) + /** ********************** SQLConf functionality methods ************ */ /** Set Spark SQL configuration properties. */ diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out index 9c210a713de3d..9d8705e3e8620 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/ansi/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false 
+-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out index 9c210a713de3d..9d8705e3e8620 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out @@ -384,21 +384,21 @@ Project [btrim(xyxtrimyyx, xy) AS btrim(xyxtrimyyx, xy)#x] -- !query SELECT btrim(encode(" xyz ", 'utf-8')) -- !query analysis -Project [btrim(encode( xyz , utf-8)) AS btrim(encode( xyz , utf-8))#x] +Project [btrim(encode( xyz , utf-8, false)) AS btrim(encode( xyz , utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('yxTomxx', 'utf-8'), encode('xyz', 'utf-8')) -- !query analysis -Project [btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +Project [btrim(encode(yxTomxx, utf-8, false), encode(xyz, utf-8, false)) AS btrim(encode(yxTomxx, utf-8), encode(xyz, utf-8))#x] +- OneRowRelation -- !query SELECT btrim(encode('xxxbarxxx', 'utf-8'), encode('x', 'utf-8')) -- !query analysis -Project [btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +Project [btrim(encode(xxxbarxxx, utf-8, false), encode(x, utf-8, false)) AS btrim(encode(xxxbarxxx, utf-8), encode(x, utf-8))#x] +- OneRowRelation @@ -640,17 +640,59 @@ Project [rpad(cast(0x57 as string), 5, abc) AS rpad(X'57', 5, abc)#x] +- OneRowRelation +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(true)) + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, true) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, true) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query analysis +SetCommand (spark.sql.legacy.javaCharsets,Some(false)) + + +-- !query 
+select encode('hello', 'WINDOWS-1252') +-- !query analysis +Project [encode(hello, WINDOWS-1252, false) AS encode(hello, WINDOWS-1252)#x] ++- OneRowRelation + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query analysis +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] ++- SubqueryAlias t + +- LocalRelation [scol#x, ecol#x] + + -- !query select encode('hello', 'Windows-xxx') -- !query analysis -Project [encode(hello, Windows-xxx) AS encode(hello, Windows-xxx)#x] +Project [encode(hello, Windows-xxx, false) AS encode(hello, Windows-xxx)#x] +- OneRowRelation -- !query select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol) -- !query analysis -Project [encode(scol#x, ecol#x) AS encode(scol, ecol)#x] +Project [encode(scol#x, ecol#x, false) AS encode(scol, ecol)#x] +- SubqueryAlias t +- LocalRelation [scol#x, ecol#x] @@ -704,7 +746,7 @@ org.apache.spark.sql.AnalysisException -- !query select decode(encode('abc', 'utf-8'), 'utf-8') -- !query analysis -Project [decode(encode(abc, utf-8), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +Project [decode(encode(abc, utf-8, false), utf-8) AS decode(encode(abc, utf-8), utf-8)#x] +- OneRowRelation diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out index 676737a4fea8e..1b19753b1f6de 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/concat.sql.out @@ -11,7 +11,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#xL as string), col2#x), cast(col3#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x] + +- Project [id#xL AS col1#xL, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x] +- Range (0, 10, step=1, splits=None) @@ -29,7 +29,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, cast(col2#xL as string)), concat(col3#x, cast(col4#x as string))), cast(col5#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1, splits=None) @@ -46,7 +46,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), cast(concat(col3#x, col4#x) as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as 
string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -67,7 +67,7 @@ FROM ( -- !query analysis Project [concat(cast(col1#x as string), cast(col2#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -84,7 +84,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(cast(col1#x as string), cast(col2#x as string)), cast(col3#x as string)), cast(col4#x as string)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -101,7 +101,7 @@ FROM ( -- !query analysis Project [concat(concat(cast(col1#x as string), cast(col2#x as string)), concat(cast(col3#x as string), cast(col4#x as string))) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -122,7 +122,7 @@ FROM ( -- !query analysis Project [concat(col1#x, col2#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -139,7 +139,7 @@ FROM ( -- !query analysis Project [concat(concat(concat(col1#x, col2#x), col3#x), col4#x) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -156,7 +156,7 @@ FROM ( -- !query analysis Project [concat(concat(col1#x, col2#x), 
concat(col3#x, col4#x)) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out index 5a9b5ddbafa39..4d897a329cfe1 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/typeCoercion/native/elt.sql.out @@ -13,7 +13,7 @@ FROM ( -- !query analysis Project [elt(2, col1#x, cast(col2#xL as string), col3#x, cast(col4#x as string), cast(col5#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col4#x, cast(id#xL as double) AS col5#x] + +- Project [prefix_ AS col1#x, id#xL AS col2#xL, cast((id#xL + cast(1 as bigint)) as string) AS col3#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col4#x, cast(id#xL as double) AS col5#x] +- Range (0, 10, step=1, splits=None) @@ -30,7 +30,7 @@ FROM ( -- !query analysis Project [elt(3, col1#x, col2#x, cast(col3#x as string), cast(col4#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8) AS col4#x] + +- Project [cast(id#xL as string) AS col1#x, cast((id#xL + cast(1 as bigint)) as string) AS col2#x, encode(cast((id#xL + cast(2 as bigint)) as string), utf-8, false) AS col3#x, encode(cast((id#xL + cast(3 as bigint)) as string), utf-8, false) AS col4#x] +- Range (0, 10, step=1, splits=None) @@ -51,7 +51,7 @@ FROM ( -- !query analysis Project [elt(1, cast(col1#x as string), cast(col2#x as string), false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) @@ -72,5 +72,5 @@ FROM ( -- !query analysis Project [elt(2, col1#x, col2#x, false) AS col#x] +- SubqueryAlias __auto_generated_subquery_name - +- Project [encode(cast(id#xL as string), utf-8) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8) AS col2#x] + +- Project [encode(cast(id#xL as string), utf-8, false) AS col1#x, encode(cast((id#xL + cast(1 as bigint)) as string), utf-8, false) AS col2#x] +- Range (0, 10, step=1, splits=None) diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql index 0fbf211ec5c5e..645f6bcb8327c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql @@ -118,6 +118,12 @@ SELECT rpad('abc', 5, x'57'); SELECT rpad(x'57', 5, 'abc'); -- encode +set spark.sql.legacy.javaCharsets=true; +select encode('hello', 'WINDOWS-1252'); +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); +set spark.sql.legacy.javaCharsets=false; +select encode('hello', 'WINDOWS-1252'); +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol); select encode('hello', 'Windows-xxx'); select encode(scol, ecol) from values('hello', 'Windows-xxx') as t(scol, ecol); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out index 082ff03efacb3..89bb20fc1bff4 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/string-functions.sql.out @@ -803,6 +803,72 @@ struct Wabca +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct +-- !query output +hello + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select encode('hello', 'Windows-xxx') -- !query schema diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out index 7914092037887..6d90a50915788 100644 --- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out @@ -735,6 +735,72 @@ struct Wabca +-- !query +set spark.sql.legacy.javaCharsets=true +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets true + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct +-- !query output +hello + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct +-- !query output +hello + + +-- !query +set spark.sql.legacy.javaCharsets=false +-- !query schema +struct +-- !query output +spark.sql.legacy.javaCharsets false + + +-- !query +select encode('hello', 'WINDOWS-1252') +-- !query schema +struct<> +-- !query output 
+org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + +-- !query +select encode(scol, ecol) from values('hello', 'WINDOWS-1252') as t(scol, ecol) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkIllegalArgumentException +{ + "errorClass" : "INVALID_PARAMETER_VALUE.CHARSET", + "sqlState" : "22023", + "messageParameters" : { + "charset" : "WINDOWS-1252", + "functionName" : "`encode`", + "parameter" : "`charset`" + } +} + + -- !query select encode('hello', 'Windows-xxx') -- !query schema diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala index 8b5ffe560a1fa..da04674b99205 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala @@ -193,8 +193,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df2, "Project [concat(cast(id#xL as string), cast((id#xL + 1) as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") val df3 = sql( """ @@ -209,8 +209,8 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite """.stripMargin) checkKeywordsExistsInExplain(df3, "Project [concat(cast(id#xL as string), " + - "cast(encode(cast((id#xL + 2) as string), utf-8) as string), " + - "cast(encode(cast((id#xL + 3) as string), utf-8) as string)) AS col#x]") + "cast(encode(cast((id#xL + 2) as string), utf-8, false) as string), " + + "cast(encode(cast((id#xL + 3) as string), utf-8, false) as string)) AS col#x]") } } From 753b2f23206464006c45f8d9b2747e56a09808a0 Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Tue, 28 Nov 2023 08:48:41 +0900 Subject: [PATCH 28/40] [SPARK-46121][PYTHON][DOCS] Refine docstring of `concat/array_position/element_at/try_element_at` ### What changes were proposed in this pull request? This pr refine docstring of `concat/array_position/element_at/try_element_at` and add some new examples. ### Why are the changes needed? To improve PySpark documentation ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass Github Actions ### Was this patch authored or co-authored using generative AI tooling? No Closes #44039 from LuciferYang/SPARK-46121. Authored-by: yangjie01 Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/functions/builtin.py | 241 +++++++++++++++++++++--- 1 file changed, 210 insertions(+), 31 deletions(-) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 8723c5fc4b9d4..d985b9e6138f5 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -12368,7 +12368,7 @@ def array_join( @_try_remote_functions def concat(*cols: "ColumnOrName") -> Column: """ - Concatenates multiple input columns together into a single column. + Collection function: Concatenates multiple input columns together into a single column. The function works with strings, numeric, binary and compatible array columns. .. 
versionadded:: 1.5.0 @@ -12392,19 +12392,61 @@ def concat(*cols: "ColumnOrName") -> Column: Examples -------- + Example 1: Concatenating string columns + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd']) - >>> df = df.select(concat(df.s, df.d).alias('s')) - >>> df.collect() - [Row(s='abcd123')] - >>> df - DataFrame[s: string] + >>> df.select(sf.concat(df.s, df.d)).show() + +------------+ + |concat(s, d)| + +------------+ + | abcd123| + +------------+ + + Example 2: Concatenating array columns + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c']) - >>> df = df.select(concat(df.a, df.b, df.c).alias("arr")) - >>> df.collect() - [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)] - >>> df - DataFrame[arr: array] + >>> df.select(sf.concat(df.a, df.b, df.c)).show() + +---------------+ + |concat(a, b, c)| + +---------------+ + |[1, 2, 3, 4, 5]| + | NULL| + +---------------+ + + Example 3: Concatenating numeric columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c']) + >>> df.select(sf.concat(df.a, df.b, df.c)).show() + +---------------+ + |concat(a, b, c)| + +---------------+ + | 123| + +---------------+ + + Example 4: Concatenating binary columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(bytearray(b'abc'), bytearray(b'def'))], ['a', 'b']) + >>> df.select(sf.concat(df.a, df.b)).show() + +-------------------+ + | concat(a, b)| + +-------------------+ + |[61 62 63 64 65 66]| + +-------------------+ + + Example 5: Concatenating mixed types of columns + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(1,"abc",3,"def")], ['a','b','c','d']) + >>> df.select(sf.concat(df.a, df.b, df.c, df.d)).show() + +------------------+ + |concat(a, b, c, d)| + +------------------+ + | 1abc3def| + +------------------+ """ return _invoke_function_over_seq_of_columns("concat", cols) @@ -12412,7 +12454,7 @@ def concat(*cols: "ColumnOrName") -> Column: @_try_remote_functions def array_position(col: "ColumnOrName", value: Any) -> Column: """ - Collection function: Locates the position of the first occurrence of the given value + Array function: Locates the position of the first occurrence of the given value in the given array. Returns null if either of the arguments are null. .. 
versionadded:: 2.4.0 @@ -12439,9 +12481,62 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: Examples -------- - >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data']) - >>> df.select(array_position(df.data, "a")).collect() - [Row(array_position(data, a)=3), Row(array_position(data, a)=0)] + Example 1: Finding the position of a string in an array of strings + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 3| + +-----------------------+ + + Example 2: Finding the position of a string in an empty array + + >>> from pyspark.sql import functions as sf + >>> from pyspark.sql.types import ArrayType, StringType, StructField, StructType + >>> schema = StructType([StructField("data", ArrayType(StringType()), True)]) + >>> df = spark.createDataFrame([([],)], schema=schema) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 0| + +-----------------------+ + + Example 3: Finding the position of an integer in an array of integers + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([1, 2, 3],)], ['data']) + >>> df.select(sf.array_position(df.data, 2)).show() + +-----------------------+ + |array_position(data, 2)| + +-----------------------+ + | 2| + +-----------------------+ + + Example 4: Finding the position of a non-existing value in an array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["c", "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "d")).show() + +-----------------------+ + |array_position(data, d)| + +-----------------------+ + | 0| + +-----------------------+ + + Example 5: Finding the position of a value in an array with nulls + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([([None, "b", "a"],)], ['data']) + >>> df.select(sf.array_position(df.data, "a")).show() + +-----------------------+ + |array_position(data, a)| + +-----------------------+ + | 3| + +-----------------------+ """ return _invoke_function("array_position", _to_java_column(col), value) @@ -12449,10 +12544,14 @@ def array_position(col: "ColumnOrName", value: Any) -> Column: @_try_remote_functions def element_at(col: "ColumnOrName", extraction: Any) -> Column: """ - Collection function: Returns element of array at given index in `extraction` if col is array. - Returns value for the given key in `extraction` if col is map. If position is negative - then location of the element will start from end, if number is outside the - array boundaries then None will be returned. + Collection function: + (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will + throw an error. If index < 0, accesses elements from the last to the first. + If 'spark.sql.ansi.enabled' is set to true, an exception will be thrown if the index is out + of array boundaries instead of returning NULL. + + (map, key) - Returns value for given key in `extraction` if col is map. The function always + returns NULL if the key is not contained in the map. .. 
versionadded:: 2.4.0 @@ -12481,15 +12580,49 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: Examples -------- + Example 1: Getting the first element of an array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) - >>> df.select(element_at(df.data, 1)).collect() - [Row(element_at(data, 1)='a')] - >>> df.select(element_at(df.data, -1)).collect() - [Row(element_at(data, -1)='c')] + >>> df.select(sf.element_at(df.data, 1)).show() + +-------------------+ + |element_at(data, 1)| + +-------------------+ + | a| + +-------------------+ + + Example 2: Getting the last element of an array using negative index + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.element_at(df.data, -1)).show() + +--------------------+ + |element_at(data, -1)| + +--------------------+ + | c| + +--------------------+ + + Example 3: Getting a value from a map using a key + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) + >>> df.select(sf.element_at(df.data, sf.lit("a"))).show() + +-------------------+ + |element_at(data, a)| + +-------------------+ + | 1.0| + +-------------------+ + + Example 4: Getting a non-existing value from a map using a key + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) - >>> df.select(element_at(df.data, lit("a"))).collect() - [Row(element_at(data, a)=1.0)] + >>> df.select(sf.element_at(df.data, sf.lit("c"))).show() + +-------------------+ + |element_at(data, c)| + +-------------------+ + | NULL| + +-------------------+ """ return _invoke_function_over_columns("element_at", col, lit(extraction)) @@ -12497,6 +12630,7 @@ def element_at(col: "ColumnOrName", extraction: Any) -> Column: @_try_remote_functions def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: """ + Collection function: (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will throw an error. If index < 0, accesses elements from the last to the first. The function always returns NULL if the index exceeds the length of the array. 
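To make the behavior described in the docstrings above concrete (element_at raising on an out-of-range index when ANSI mode is on, try_element_at returning NULL instead), here is a minimal sketch, not part of the patch, that assumes an active SparkSession bound to `spark` and toggles `spark.sql.ansi.enabled` at runtime:

from pyspark.sql import functions as sf

df = spark.createDataFrame([(["a", "b", "c"],)], ["data"])

# try_element_at never fails on an out-of-range index; it returns NULL.
df.select(sf.try_element_at(df.data, sf.lit(4))).show()

# element_at also returns NULL while ANSI mode is off, but raises an error
# for the same index once spark.sql.ansi.enabled is set to true.
spark.conf.set("spark.sql.ansi.enabled", "true")
try:
    df.select(sf.element_at(df.data, 4)).show()
except Exception as e:
    print(type(e).__name__)
finally:
    spark.conf.set("spark.sql.ansi.enabled", "false")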
@@ -12515,15 +12649,60 @@ def try_element_at(col: "ColumnOrName", extraction: "ColumnOrName") -> Column: Examples -------- + Example 1: Getting the first element of an array + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) - >>> df.select(try_element_at(df.data, lit(1)).alias('r')).collect() - [Row(r='a')] - >>> df.select(try_element_at(df.data, lit(-1)).alias('r')).collect() - [Row(r='c')] + >>> df.select(sf.try_element_at(df.data, sf.lit(1))).show() + +-----------------------+ + |try_element_at(data, 1)| + +-----------------------+ + | a| + +-----------------------+ + + Example 2: Getting the last element of an array using negative index + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit(-1))).show() + +------------------------+ + |try_element_at(data, -1)| + +------------------------+ + | c| + +------------------------+ + + Example 3: Getting a value from a map using a key + + >>> from pyspark.sql import functions as sf >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) - >>> df.select(try_element_at(df.data, lit("a")).alias('r')).collect() - [Row(r=1.0)] + >>> df.select(sf.try_element_at(df.data, sf.lit("a"))).show() + +-----------------------+ + |try_element_at(data, a)| + +-----------------------+ + | 1.0| + +-----------------------+ + + Example 4: Getting a non-existing element from an array + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit(4))).show() + +-----------------------+ + |try_element_at(data, 4)| + +-----------------------+ + | NULL| + +-----------------------+ + + Example 5: Getting a non-existing value from a map using a key + + >>> from pyspark.sql import functions as sf + >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data']) + >>> df.select(sf.try_element_at(df.data, sf.lit("c"))).show() + +-----------------------+ + |try_element_at(data, c)| + +-----------------------+ + | NULL| + +-----------------------+ """ return _invoke_function_over_columns("try_element_at", col, extraction) From ec7d07c635c487fb19e04c48ebdffa7752015330 Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Tue, 28 Nov 2023 10:38:29 +0900 Subject: [PATCH 29/40] [SPARK-46111][DOCS][PYTHON] Add copyright to the PySpark official documentation ### What changes were proposed in this pull request? This PR proposes to add the Apache Spark Foundation copyright notice to the bottom of the PySpark official documentation. ### Why are the changes needed? Our current documentation is missing the copyright. The addition of the copyright notice is necessary to ensure compliance with the Apache Software Foundation's requirements for project documentation. ### Does this PR introduce _any_ user-facing change? No API changes, but users will now see the Apache Spark Foundation copyright notice at the bottom of each page of the PySpark documentation as below: ## Before Screenshot 2023-11-27 at 11 49 38 AM ## After Screenshot 2023-11-27 at 11 35 35 AM ### How was this patch tested? Manually build the docs and confirm. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44026 from itholic/add_copyright. 
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- python/docs/source/_templates/spark_footer.html | 3 +++ python/docs/source/conf.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 python/docs/source/_templates/spark_footer.html diff --git a/python/docs/source/_templates/spark_footer.html b/python/docs/source/_templates/spark_footer.html new file mode 100644 index 0000000000000..684482b0c2cdf --- /dev/null +++ b/python/docs/source/_templates/spark_footer.html @@ -0,0 +1,3 @@ +

diff --git a/python/docs/source/conf.py b/python/docs/source/conf.py index 81083c007b346..de7ab953c5386 100644 --- a/python/docs/source/conf.py +++ b/python/docs/source/conf.py @@ -12,6 +12,7 @@ # All configuration values have a default; values that are commented out # serve to show the default. +from datetime import datetime import sys import os import shutil @@ -124,7 +125,8 @@ # General information about the project. project = 'PySpark' -copyright = '' +# We have our custom "spark_footer.html" template, using copyright for the current year. +copyright = f"Copyright @ {datetime.now().year}" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -194,6 +196,7 @@ # further. For a list of options available for each theme, see the # documentation. html_theme_options = { + "footer_start": ["spark_footer", "sphinx-version"], "navbar_end": ["version-switcher", "theme-switcher"], "logo": { "image_light": "_static/spark-logo-light.png", From 4f59e1b663812a47ec1906b40dc59f6ed5342e50 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 10:46:28 +0900 Subject: [PATCH 30/40] [SPARK-46126][PYTHON][TESTS] Fix the doctest in pyspark.pandas.frame.DataFrame.to_dict (Python 3.12) ### What changes were proposed in this pull request? This PR proposes to fix doctest, `pyspark.pandas.frame.DataFrame.to_dict`, compatible with Python 3.12. ``` File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 2515, in pyspark.pandas.frame.DataFrame.to_dict Failed example: df.to_dict(into=OrderedDict) Expected: OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) Got: OrderedDict({'col1': OrderedDict({'row1': 1, 'row2': 2}), 'col2': OrderedDict({'row1': 0.5, 'row2': 0.75})}) ``` ### Why are the changes needed? For the proper test for Python 3.12. It is failing, see https://github.com/apache/spark/actions/runs/7006848931/job/19059702970 ### Does this PR introduce _any_ user-facing change? No. A bit of user-facing doc change but very trival. ### How was this patch tested? Fixed unittests. Manually tested via: ```bash python/run-tests --python-executable=python3 --testnames 'pyspark.pandas.frame' ... Tests passed in 721 seconds ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44042 from HyukjinKwon/SPARK-46126. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/frame.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py index 4ecc85ce8f795..b53f5adfbaa81 100644 --- a/python/pyspark/pandas/frame.py +++ b/python/pyspark/pandas/frame.py @@ -2512,9 +2512,8 @@ def to_dict(self, orient: str = "dict", into: Type = dict) -> Union[List, Mappin You can also specify the mapping type. >>> from collections import OrderedDict, defaultdict - >>> df.to_dict(into=OrderedDict) - OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), \ -('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + >>> df.to_dict(into=OrderedDict) # doctest: +ELLIPSIS + OrderedDict(...) 
If you want a `defaultdict`, you need to initialize it: From dbc8756bdac823be42ed10bc011415f405905497 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Tue, 28 Nov 2023 11:04:14 +0800 Subject: [PATCH 31/40] [SPARK-46006][YARN][FOLLOWUP] YarnAllocator set target executor number to 0 to cancel pending allocate request when driver stop ### What changes were proposed in this pull request? YarnAllocator set target executor number to 0 to cancel pending allocate request when driver stop Now for this issue we do: 1. AllocationFailure should not be treated as exitCausedByApp when driver is shutting down https://github.com/apache/spark/pull/38622 2. Avoid new allocation requests when sc.stop stuck https://github.com/apache/spark/pull/43906 3. Cancel pending allocation request, this pr https://github.com/apache/spark/pull/44036 ### Why are the changes needed? Avoid unnecessary allocate request ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? MT ### Was this patch authored or co-authored using generative AI tooling? No Closes #44036 from AngersZhuuuu/SPARK-46006-FOLLOWUP. Authored-by: Angerszhuuuu Signed-off-by: Kent Yao --- .../scala/org/apache/spark/deploy/yarn/YarnAllocator.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala index f8afbc81c1211..5d24870bbcda3 100644 --- a/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala +++ b/resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala @@ -385,7 +385,10 @@ private[yarn] class YarnAllocator( this.hostToLocalTaskCountPerResourceProfileId = hostToLocalTaskCountPerResourceProfileId if (resourceProfileToTotalExecs.isEmpty) { - targetNumExecutorsPerResourceProfileId.clear() + // Set target executor number to 0 to cancel pending allocate request. + targetNumExecutorsPerResourceProfileId.keys.foreach { rp => + targetNumExecutorsPerResourceProfileId(rp) = 0 + } allocatorNodeHealthTracker.setSchedulerExcludedNodes(excludedNodes) true } else { From 2800b5849309645657b9d308557009b31e14084e Mon Sep 17 00:00:00 2001 From: Haejoon Lee Date: Tue, 28 Nov 2023 11:07:14 +0800 Subject: [PATCH 32/40] [SPARK-46116][DOCS][PYTHON] Adding "Stack Overflow" and "Mailing Lists" link into PySpark doc homepage ### What changes were proposed in this pull request? This PR proposes to enhance the PySpark documentation by adding more items for a "Useful links"including "Stack Overflow", "Dev Mailing List" and the "User Mailing List". ### Why are the changes needed? It is aimed at improving user engagement and providing quick access to community support and discussions. This approach is inspired by the [Pandas documentation](https://pandas.pydata.org/docs/index.html), which effectively uses a similar section for community engagement. The "Stack Overflow" will lead users to a curated list of StackOverflow questions tagged with `pyspark`, while the mailing lists will offer platforms for deeper discussions and insights within the Spark community. ### Does this PR introduce _any_ user-facing change? 
No API change, but the main page of the PySpark documentation will be updated to include a new "Useful links"as below: Screenshot 2023-11-27 at 5 29 19 PM ## Linked pages for each items ### Stack Overflow Screenshot 2023-11-27 at 2 59 31 PM ### Dev Mailing List Screenshot 2023-11-27 at 3 00 23 PM ### User Mailing List Screenshot 2023-11-27 at 3 00 44 PM ### How was this patch tested? Manually build doc & verify each links. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44033 from itholic/improve_useful_links. Authored-by: Haejoon Lee Signed-off-by: Kent Yao --- python/docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index b3233744c5eb1..72a846290fe9e 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -24,7 +24,7 @@ PySpark Overview **Date**: |today| **Version**: |release| **Useful links**: -|binder|_ | `GitHub `_ | `Issues `_ | |examples|_ | `Community `_ +|binder|_ | `GitHub `_ | `Issues `_ | |examples|_ | `Community `_ | `Stack Overflow `_ | `Dev Mailing List `_ | `User Mailing List `_ PySpark is the Python API for Apache Spark. It enables you to perform real-time, large-scale data processing in a distributed environment using Python. It also provides a PySpark From 158f87621570b82206178d1847d84749538baa04 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 14:40:04 +0900 Subject: [PATCH 33/40] [SPARK-46127][PYTHON][TESTS] Disable pyspark.tests.test_worker.WorkerSegfaultNonDaemonTest.test_python_segfault with Python 3.12 ### What changes were proposed in this pull request? This PR disables `pyspark.tests.test_worker.WorkerSegfaultNonDaemonTest.test_python_segfault` with Python 3.12 for now. ### Why are the changes needed? This test is flaky, and stops the tests run till the end, e.g., see https://github.com/apache/spark/actions/runs/7006848931/job/19059701743 How `faulthandler` is used is correct, as documented in the standard Python documentation. So I do believe this is a bug from Python 3.12. I will track separately in Python side. ### Does this PR introduce _any_ user-facing change? No, test-only. ### How was this patch tested? Manually: ```bash python/run-tests --python-executable=python3 --testnames 'pyspark.tests.test_worker WorkerSegfaultNonDaemonTest.test_python_segfault' ``` ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44044 from HyukjinKwon/SPARK-46127. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/tests/test_worker.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index 2d675811fb9bc..bab853967b9e1 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -230,6 +230,7 @@ def conf(cls): _conf.set("spark.python.worker.faulthandler.enabled", "true") return _conf + @unittest.skipIf(sys.version_info < (3, 12), "SPARK-46130: Flaky with Python 3.12") def test_python_segfault(self): try: From 984e797e02ca245b684fb18614f3378b9a559ab5 Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 14:40:25 +0900 Subject: [PATCH 34/40] [SPARK-46131][PYTHON][INFRA] Install torchvision for Python 3.12 build ### What changes were proposed in this pull request? This PR adds `torchvision` into the testing image for Python 3.12. ### Why are the changes needed? To continue Python 3.12 build, and see what are failing. 
Currently it fails as below: https://github.com/apache/spark/actions/runs/7006848931/job/19059702169#step:12:4236 ``` ====================================================================== ERROR [0.001s]: test_end_to_end_run_distributedly (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorDistributedUnitTestsOnConnect.test_end_to_end_run_distributedly) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 495, in test_end_to_end_run_distributedly train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ====================================================================== ERROR [0.001s]: test_end_to_end_run_locally (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorLocalUnitTestsIIOnConnect.test_end_to_end_run_locally) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 402, in test_end_to_end_run_locally train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ====================================================================== ERROR [0.001s]: test_end_to_end_run_locally (pyspark.ml.tests.connect.test_parity_torch_distributor.TorchDistributorLocalUnitTestsOnConnect.test_end_to_end_run_locally) ---------------------------------------------------------------------- Traceback (most recent call last): File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 402, in test_end_to_end_run_locally train_fn = create_training_function(self.mnist_dir_path) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/__w/spark/spark/python/pyspark/ml/torch/tests/test_distributor.py", line 60, in create_training_function from torchvision import transforms, datasets ModuleNotFoundError: No module named 'torchvision' ---------------------------------------------------------------------- Ran 23 tests in 50.860s ``` and this pr fixes it ### Does this PR introduce _any_ user-facing change? No, dev-only. ### How was this patch tested? Manually tested. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44045 from HyukjinKwon/SPARK-46131. 
Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- dev/infra/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/infra/Dockerfile b/dev/infra/Dockerfile index 10ae49b71665f..7348c6af1e059 100644 --- a/dev/infra/Dockerfile +++ b/dev/infra/Dockerfile @@ -138,4 +138,5 @@ RUN python3.12 -m pip install numpy 'pyarrow>=14.0.0' 'six==1.16.0' 'pandas<=2.1 RUN python3.12 -m pip install 'grpcio==1.59.3' 'grpcio-status==1.59.3' 'protobuf==4.25.1' 'googleapis-common-protos==1.56.4' # TODO(SPARK-46078) Use official one instead of nightly build when it's ready RUN python3.12 -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu +RUN python3.12 -m pip install torchvision --index-url https://download.pytorch.org/whl/cpu RUN python3.12 -m pip install torcheval From bfb08823b490c17943d75b0fa24a6838ef1e2634 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Tue, 28 Nov 2023 15:13:07 +0900 Subject: [PATCH 35/40] [SPARK-32407][SPARK-35375][INFRA][DOCS] Delete comments on `Sphinx` and `Jinja2` ### What changes were proposed in this pull request? Delete comments on `Sphinx` and `Jinja2` ### Why are the changes needed? they had been upgraded in https://github.com/apache/spark/pull/44012 ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? ci ### Was this patch authored or co-authored using generative AI tooling? no Closes #44046 from zhengruifeng/infra_Sphinx_nit. Authored-by: Ruifeng Zheng Signed-off-by: Hyukjin Kwon --- .github/workflows/build_and_test.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ccc437269bfa7..01e5458340af2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -692,10 +692,6 @@ jobs: - name: Install Python linter dependencies if: inputs.branch != 'branch-3.3' && inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5' run: | - # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. - # See also https://github.com/sphinx-doc/sphinx/issues/7551. - # Jinja2 3.0.0+ causes error when building with Sphinx. - # See also https://issues.apache.org/jira/browse/SPARK-35375. python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1' python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.59.3' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - name: Python linter @@ -745,10 +741,6 @@ jobs: Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')" - name: Install dependencies for documentation generation run: | - # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. - # See also https://github.com/sphinx-doc/sphinx/issues/7551. - # Jinja2 3.0.0+ causes error when building with Sphinx. - # See also https://issues.apache.org/jira/browse/SPARK-35375. # Pin the MarkupSafe to 2.0.1 to resolve the CI error. # See also https://issues.apache.org/jira/browse/SPARK-38279. 
python3.9 -m pip install 'sphinx==4.2.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 'markupsafe==2.0.1' 'pyzmq<24.0.0' From 486439334702439807ad83fd4dc54884ede4f6eb Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 28 Nov 2023 16:34:37 +0900 Subject: [PATCH 36/40] [SPARK-46127][TESTS][PYTHON][FOLLOW-UP] Fix skip condition from " < (3, 12)" to " > (3, 11)" ### What changes were proposed in this pull request? This PR proposes to fix the test condition from " < (3, 12)" to " > (3, 11)". ### Why are the changes needed? Incorrect condition. The test has to be skipped with Python 3.12+ ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manually. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #44050 from HyukjinKwon/SPARK-46127-followup. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon --- python/pyspark/tests/test_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/tests/test_worker.py b/python/pyspark/tests/test_worker.py index bab853967b9e1..9e8a05d18347c 100644 --- a/python/pyspark/tests/test_worker.py +++ b/python/pyspark/tests/test_worker.py @@ -230,7 +230,7 @@ def conf(cls): _conf.set("spark.python.worker.faulthandler.enabled", "true") return _conf - @unittest.skipIf(sys.version_info < (3, 12), "SPARK-46130: Flaky with Python 3.12") + @unittest.skipIf(sys.version_info > (3, 11), "SPARK-46130: Flaky with Python 3.12") def test_python_segfault(self): try: From 04c9583cecdf929e8ac57eb07a8c53f488c47671 Mon Sep 17 00:00:00 2001 From: Jungtaek Lim Date: Tue, 28 Nov 2023 17:05:11 +0900 Subject: [PATCH 37/40] [SPARK-45833][SS][DOCS] Document the new introduction of state data source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR proposes to add a new doc page describing the new data source, `state data source`. Worth noting that we explicitly mention the data source as experimental, so that we do not close the opportunity to improve further if it's backward incompatible. ### Why are the changes needed? The data source is an user-facing one and would be evolved over time, hence we'll need to document and publicize it. ### Does this PR introduce _any_ user-facing change? Yes, doc change. ### How was this patch tested? Built the docs directory with jekyll. Here are relevant screenshots. 스크린샷 2023-11-28 오후 5 02 47 ![structured-streaming-state-data-source-1](https://github.com/apache/spark/assets/1317309/c65cef9f-750b-4c00-a289-c705386a538b) ### Was this patch authored or co-authored using generative AI tooling? No. Closes #43920 from HeartSaVioR/SPARK-45833. Authored-by: Jungtaek Lim Signed-off-by: Jungtaek Lim --- .../structured-streaming-programming-guide.md | 8 + .../structured-streaming-state-data-source.md | 248 ++++++++++++++++++ 2 files changed, 256 insertions(+) create mode 100644 docs/structured-streaming-state-data-source.md diff --git a/docs/structured-streaming-programming-guide.md b/docs/structured-streaming-programming-guide.md index 547834c7f9e3a..33b9453a18c37 100644 --- a/docs/structured-streaming-programming-guide.md +++ b/docs/structured-streaming-programming-guide.md @@ -2452,6 +2452,14 @@ Specifically for built-in HDFS state store provider, users can check the state s it is best if cache missing count is minimized that means Spark won't waste too much time on loading checkpointed state. 
User can increase Spark locality waiting configurations to avoid loading state store providers in different executors across batches. +#### State Data Source (Experimental) + +Apache Spark provides a streaming state related data source that provides the ability to manipulate state stores in the checkpoint. Users can run the batch query with State Data Source to get the visibility of the states for existing streaming query. + +As of Spark 4.0, the data source only supports read feature. See [State Data Source Integration Guide](structured-streaming-state-data-source.html) for more details. + +NOTE: this data source is currently marked as experimental - source options and the behavior (output) might be subject to change. + ## Starting Streaming Queries Once you have defined the final result DataFrame/Dataset, all that is left is for you to start the streaming computation. To do that, you have to use the `DataStreamWriter` ([Python](api/python/reference/pyspark.ss/api/pyspark.sql.streaming.DataStreamWriter.html#pyspark.sql.streaming.DataStreamWriter)/[Scala](api/scala/org/apache/spark/sql/streaming/DataStreamWriter.html)/[Java](api/java/org/apache/spark/sql/streaming/DataStreamWriter.html) docs) diff --git a/docs/structured-streaming-state-data-source.md b/docs/structured-streaming-state-data-source.md new file mode 100644 index 0000000000000..a9353861c532c --- /dev/null +++ b/docs/structured-streaming-state-data-source.md @@ -0,0 +1,248 @@ +--- +layout: global +displayTitle: State Data Source Integration Guide +title: State Data Source Integration Guide +license: | + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--- + +State data source Guide in Structured Streaming (Experimental) + +## Overview + +State data source provides functionality to manipulate the state from the checkpoint. + +As of Spark 4.0, state data source provides the read functionality with a batch query. Additional functionalities including write is on the future roadmap. + +NOTE: this data source is currently marked as experimental - source options and the behavior (output) might be subject to change. + +## Reading state key-values from the checkpoint + +State data source enables reading key-value pairs from the state store in the checkpoint, via running a separate batch query. +Users can leverage the functionality to cover two major use cases described below: + +* Construct a test checking both output and the state. It is non-trivial to deduce the key-value of the state from the output, and having visibility of the state would be a huge win on testing. +* Investigate an incident against stateful streaming query. If users observe the incorrect output and want to track how it came up, having visibility of the state would be required. 
+
+Users can read an instance of state store, which is matched to a single stateful operator in most cases. This means, users can expect that they can read the entire key-value pairs in the state for a single stateful operator.
+
+Note that there could be an exception, e.g. stream-stream join, which leverages multiple state store instances internally. The data source abstracts the internal representation away from users and
+provides a user-friendly approach to read the state. See the section for stream-stream join for more details.
+
+### Creating a State store for Batch Queries (all defaults)
+
+<div class="codetabs">
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+df = spark \
+.read \
+.format("statestore") \
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+val df = spark
+.read
+.format("statestore")
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+Dataset<Row> df = spark
+.read()
+.format("statestore")
+.load("<checkpointLocation>");
+
+{% endhighlight %}
+</div>
+
+</div>
+
+Each row in the source has the following schema:
+
+| Column | Type | Note |
+| ------ | ---- | ---- |
+| key | struct (depends on the type for state key) | |
+| value | struct (depends on the type for state value) | |
+| _partition_id | int | metadata column (hidden unless specified with SELECT) |
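+
+For illustration, a minimal sketch of inspecting and flattening the nested key/value structs; the checkpoint path below is an assumed placeholder, not part of the original examples:
+
+{% highlight scala %}
+// Hypothetical checkpoint location of a stateful streaming query.
+val stateDf = spark.read.format("statestore").load("/tmp/checkpoint")
+
+// Check the nested struct layout before querying it.
+stateDf.printSchema()
+
+// Flatten the nested key/value structs into top-level columns for easier inspection.
+stateDf.selectExpr("key.*", "value.*", "_partition_id").show(false)
+{% endhighlight %}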
+
+The nested columns for key and value heavily depend on the input schema of the stateful operator as well as the type of the operator.
+Users are encouraged to check the schema via df.schema() / df.printSchema() first to understand the type of output.
+
+The following option must be set for the source.
+
+| Option | Value | Meaning |
+| ------ | ----- | ------- |
+| path | string | Specify the root directory of the checkpoint location. You can either specify the path via option("path", `path`) or load(`path`). |
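+
+For example, a small sketch (the path is an assumed placeholder) showing that both forms point the source at the same checkpoint:
+
+{% highlight scala %}
+// Equivalent ways to specify the root directory of the checkpoint location.
+val viaLoad = spark.read.format("statestore").load("/tmp/checkpoint")
+val viaOption = spark.read.format("statestore").option("path", "/tmp/checkpoint").load()
+{% endhighlight %}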
+
+The following configurations are optional:
+
+| Option | Value | Default | Meaning |
+| ------ | ----- | ------- | ------- |
+| batchId | numeric value | latest committed batch | Represents the target batch to read from. This option is used when users want to perform time-travel. The batch should be committed but not yet cleaned up. |
+| operatorId | numeric value | 0 | Represents the target operator to read from. This option is used when the query is using multiple stateful operators. |
+| storeName | string | DEFAULT | Represents the target state store name to read from. This option is used when the stateful operator uses multiple state store instances. It is not required except for stream-stream join. |
+| joinSide | string ("left" or "right") | (none) | Represents the target side to read from. This option is used when users want to read the state from stream-stream join. |
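+
+As a hedged illustration of combining these options (the batch ID, operator ID, and checkpoint path below are assumed placeholders), a time-travel read against a specific stateful operator might look like:
+
+{% highlight scala %}
+// Read the state as of batch 5 for the stateful operator with ID 1.
+val df = spark.read
+  .format("statestore")
+  .option("batchId", 5L)     // must be a committed batch that has not been cleaned up yet
+  .option("operatorId", 1L)  // only needed when the query has multiple stateful operators
+  .load("/tmp/checkpoint")
+
+// For stream-stream join state, specify .option("joinSide", "left") or "right"
+// instead of pointing at an internal store via storeName.
+{% endhighlight %}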
+
+### Reading state for Stream-stream join
+
+Structured Streaming implements the stream-stream join feature by leveraging multiple state store instances internally.
+These instances logically compose buffers that store the input rows for the left and right sides.
+
+Since this is easier for users to reason about, the data source provides the option 'joinSide' to read the buffered input for a specific side of the join.
+To still enable reading an internal state store instance directly, we also allow specifying the option 'storeName', with the restriction that 'storeName' and 'joinSide' cannot be specified together.
+
+## State metadata source
+
+Before querying the state from an existing checkpoint via the state data source, users may want to understand the information for the checkpoint, especially about the stateful operators. This includes which operators and state store instances are available in the checkpoint, the available range of batch IDs, etc.
+
+Structured Streaming provides a data source named "State metadata source" that provides the state-related metadata information from the checkpoint.
+
+Note: The metadata is constructed when the streaming query is run with Spark 4.0+. An existing checkpoint which has only been run with a lower Spark version does not have the metadata and cannot be queried with this metadata source. Running the streaming query against the existing checkpoint with Spark 4.0+ is required to construct the metadata before querying it.
+
+### Creating a State metadata store for Batch Queries
+
+<div class="codetabs">
+
+<div data-lang="python" markdown="1">
+{% highlight python %}
+
+df = spark \
+.read \
+.format("state-metadata") \
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="scala" markdown="1">
+{% highlight scala %}
+
+val df = spark
+.read
+.format("state-metadata")
+.load("<checkpointLocation>")
+
+{% endhighlight %}
+</div>
+
+<div data-lang="java" markdown="1">
+{% highlight java %}
+
+Dataset<Row> df = spark
+.read()
+.format("state-metadata")
+.load("<checkpointLocation>");
+
+{% endhighlight %}
+</div>
+
+</div>
+
+Each row in the source has the following schema:
+
+| Column | Type | Note |
+| ------ | ---- | ---- |
+| operatorId | int | |
+| operatorName | string | |
+| stateStoreName | string | |
+| numPartitions | int | |
+| minBatchId | int | The minimum batch ID available for querying state. The value could be invalid if the streaming query taking the checkpoint is running, as cleanup would run. |
+| maxBatchId | int | The maximum batch ID available for querying state. The value could be invalid if the streaming query taking the checkpoint is running, as the query will commit further batches. |
+| _numColsPrefixKey | int | metadata column (hidden unless specified with SELECT) |
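+
+For illustration, a minimal sketch (the checkpoint path is an assumed placeholder) of using the metadata output to locate which operator and state store to query:
+
+{% highlight scala %}
+// List the stateful operators and their state stores recorded in the checkpoint.
+val metadataDf = spark.read.format("state-metadata").load("/tmp/checkpoint")
+metadataDf.select("operatorId", "operatorName", "stateStoreName", "minBatchId", "maxBatchId").show()
+{% endhighlight %}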
+
+One of the major use cases of this data source is to identify the operatorId to query when the query has multiple stateful operators, e.g. stream-stream join followed by deduplication.
+The column 'operatorName' helps users identify the operatorId for the given operator.
+
+Additionally, if users want to query an internal state store instance of a stateful operator (e.g. stream-stream join), the column 'stateStoreName' is useful for determining the target.

From b5c94f1c02c66d422956260af6eba9527588ecf8 Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Tue, 28 Nov 2023 16:54:52 +0800
Subject: [PATCH 38/40] [SPARK-46103][INFRA][FOLLOWUP] Unpin jinja2 in Python linter

### What changes were proposed in this pull request?

https://github.com/apache/spark/pull/44012 unpinned jinja2 in the doc build; this PR unpins it in the Python linter as well.

This PR is only for the master branch and won't affect the branch-3.x daily builds.

### Why are the changes needed?

To be consistent with dev/requirements.txt.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #44051 from zhengruifeng/infra_linter_jinja.

Authored-by: Ruifeng Zheng
Signed-off-by: Ruifeng Zheng
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 01e5458340af2..3bfd1abb48d9c 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -692,7 +692,7 @@ jobs:
     - name: Install Python linter dependencies
       if: inputs.branch != 'branch-3.3' && inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
       run: |
-        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==23.9.1'
+        python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc jinja2 'black==23.9.1'
         python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.59.3' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
     - name: Python linter
       run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python

From f2ea75f24690e14c1cccdd469159e16e443b3418 Mon Sep 17 00:00:00 2001
From: Haejoon Lee
Date: Tue, 28 Nov 2023 18:13:52 +0900
Subject: [PATCH 39/40] [SPARK-32407][SPARK-35375][FOLLOWUP] Delete remaining comments on `Sphinx` and `Jinja2`

### What changes were proposed in this pull request?

This is a followup for https://github.com/apache/spark/pull/44046 to remove the remaining comments.

### Why are the changes needed?

We don't need those comments anymore since the issue is fixed.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

The existing CI should pass.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #44052 from itholic/SPARK-35375.
Authored-by: Haejoon Lee Signed-off-by: Hyukjin Kwon --- docs/README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/docs/README.md b/docs/README.md index 87d68c2f86499..99ccf69dbaee5 100644 --- a/docs/README.md +++ b/docs/README.md @@ -52,13 +52,6 @@ Note: If you are on a system with both Ruby 1.9 and Ruby 2.0 you may need to rep To generate SQL and Python API docs, you'll need to install these libraries: - Run the following command from $SPARK_HOME: ```sh $ pip install --upgrade -r dev/requirements.txt From a6cda2302c2962072af104c5d012329b06cbf166 Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Tue, 28 Nov 2023 12:53:13 +0100 Subject: [PATCH 40/40] [SPARK-45760][SQL][FOLLOWUP] Inline With inside conditional branches ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/43623 to fix a regression. For `With` inside conditional branches, they may not be evaluated at all and we should not pull out the common expressions into a `Project`, but just inline. ### Why are the changes needed? avoid perf regression ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? new test ### Was this patch authored or co-authored using generative AI tooling? No Closes #43978 from cloud-fan/with. Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan --- .../sql/catalyst/expressions/Expression.scala | 5 + .../expressions/conditionalExpressions.scala | 19 ++- .../expressions/nullExpressions.scala | 8 ++ .../optimizer/RewriteWithExpression.scala | 119 ++++++++++++------ .../RewriteWithExpressionSuite.scala | 79 +++++++++++- 5 files changed, 185 insertions(+), 45 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala index 0dc70c6c3947c..2cc813bd30556 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Expression.scala @@ -513,6 +513,11 @@ trait ConditionalExpression extends Expression { */ def alwaysEvaluatedInputs: Seq[Expression] + /** + * Return a copy of itself with a new `alwaysEvaluatedInputs`. + */ + def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): ConditionalExpression + /** * Return groups of branches. For each group, at least one branch will be hit at runtime, * so that we can eagerly evaluate the common expressions of a group. 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala index 28a7db51621fd..9ee2f2bb41417 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/conditionalExpressions.scala @@ -56,6 +56,10 @@ case class If(predicate: Expression, trueValue: Expression, falseValue: Expressi */ override def alwaysEvaluatedInputs: Seq[Expression] = predicate :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): If = { + copy(predicate = alwaysEvaluatedInputs.head) + } + override def branchGroups: Seq[Seq[Expression]] = Seq(Seq(trueValue, falseValue)) final override val nodePatterns : Seq[TreePattern] = Seq(IF) @@ -165,8 +169,15 @@ case class CaseWhen( final override val nodePatterns : Seq[TreePattern] = Seq(CASE_WHEN) - override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): Expression = - super.legacyWithNewChildren(newChildren) + override protected def withNewChildrenInternal(newChildren: IndexedSeq[Expression]): CaseWhen = { + if (newChildren.length % 2 == 0) { + copy(branches = newChildren.grouped(2).map { case Seq(a, b) => (a, b) }.toSeq) + } else { + copy( + branches = newChildren.dropRight(1).grouped(2).map { case Seq(a, b) => (a, b) }.toSeq, + elseValue = newChildren.lastOption) + } + } // both then and else expressions should be considered. @transient @@ -213,6 +224,10 @@ case class CaseWhen( */ override def alwaysEvaluatedInputs: Seq[Expression] = children.head :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): CaseWhen = { + withNewChildrenInternal(alwaysEvaluatedInputs.toIndexedSeq ++ children.drop(1)) + } + override def branchGroups: Seq[Seq[Expression]] = { // We look at subexpressions in conditions and values of `CaseWhen` separately. It is // because a subexpression in conditions will be run no matter which condition is matched diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala index 0e9e375b8acf8..4ccb369f5e2b2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala @@ -70,6 +70,10 @@ case class Coalesce(children: Seq[Expression]) */ override def alwaysEvaluatedInputs: Seq[Expression] = children.head :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): Coalesce = { + withNewChildrenInternal(alwaysEvaluatedInputs.toIndexedSeq ++ children.drop(1)) + } + override def branchGroups: Seq[Seq[Expression]] = if (children.length > 1) { // If there is only one child, the first child is already covered by // `alwaysEvaluatedInputs` and we should exclude it here. 
@@ -290,6 +294,10 @@ case class NaNvl(left: Expression, right: Expression) */ override def alwaysEvaluatedInputs: Seq[Expression] = left :: Nil + override def withNewAlwaysEvaluatedInputs(alwaysEvaluatedInputs: Seq[Expression]): NaNvl = { + copy(left = alwaysEvaluatedInputs.head) + } + override def branchGroups: Seq[Seq[Expression]] = Seq(children) override def eval(input: InternalRow): Any = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala index c5bd71b4a7d1f..cf2c77069a195 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpression.scala @@ -19,7 +19,8 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable -import org.apache.spark.sql.catalyst.expressions.{Alias, CommonExpressionDef, CommonExpressionRef, Expression, With} +import org.apache.spark.SparkException +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.TreePattern.{COMMON_EXPR_REF, WITH_EXPRESSION} @@ -35,56 +36,92 @@ object RewriteWithExpression extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = { plan.transformWithPruning(_.containsPattern(WITH_EXPRESSION)) { case p if p.expressions.exists(_.containsPattern(WITH_EXPRESSION)) => - var newChildren = p.children - var newPlan: LogicalPlan = p.transformExpressionsUp { - case With(child, defs) => - val refToExpr = mutable.HashMap.empty[Long, Expression] - val childProjections = Array.fill(newChildren.size)(mutable.ArrayBuffer.empty[Alias]) + val inputPlans = p.children.toArray + var newPlan: LogicalPlan = p.mapExpressions { expr => + rewriteWithExprAndInputPlans(expr, inputPlans) + } + newPlan = newPlan.withNewChildren(inputPlans.toIndexedSeq) + if (p.output == newPlan.output) { + newPlan + } else { + Project(p.output, newPlan) + } + } + } + + private def rewriteWithExprAndInputPlans( + e: Expression, + inputPlans: Array[LogicalPlan]): Expression = { + if (!e.containsPattern(WITH_EXPRESSION)) return e + e match { + case w: With => + // Rewrite nested With expressions first + val child = rewriteWithExprAndInputPlans(w.child, inputPlans) + val defs = w.defs.map(rewriteWithExprAndInputPlans(_, inputPlans)) + val refToExpr = mutable.HashMap.empty[Long, Expression] + val childProjections = Array.fill(inputPlans.length)(mutable.ArrayBuffer.empty[Alias]) + + defs.zipWithIndex.foreach { case (CommonExpressionDef(child, id), index) => + if (child.containsPattern(COMMON_EXPR_REF)) { + throw SparkException.internalError( + "Common expression definition cannot reference other Common expression definitions") + } - defs.zipWithIndex.foreach { case (CommonExpressionDef(child, id), index) => - if (CollapseProject.isCheap(child)) { - refToExpr(id) = child - } else { - val childProjectionIndex = newChildren.indexWhere( - c => child.references.subsetOf(c.outputSet) - ) - if (childProjectionIndex == -1) { - // When we cannot rewrite the common expressions, force to inline them so that the - // query can still run. This can happen if the join condition contains `With` and - // the common expression references columns from both join sides. 
- // TODO: things can go wrong if the common expression is nondeterministic. We - // don't fix it for now to match the old buggy behavior when certain - // `RuntimeReplaceable` did not use the `With` expression. - // TODO: we should calculate the ref count and also inline the common expression - // if it's ref count is 1. - refToExpr(id) = child - } else { - val alias = Alias(child, s"_common_expr_$index")() - childProjections(childProjectionIndex) += alias - refToExpr(id) = alias.toAttribute - } - } + if (CollapseProject.isCheap(child)) { + refToExpr(id) = child + } else { + val childProjectionIndex = inputPlans.indexWhere( + c => child.references.subsetOf(c.outputSet) + ) + if (childProjectionIndex == -1) { + // When we cannot rewrite the common expressions, force to inline them so that the + // query can still run. This can happen if the join condition contains `With` and + // the common expression references columns from both join sides. + // TODO: things can go wrong if the common expression is nondeterministic. We + // don't fix it for now to match the old buggy behavior when certain + // `RuntimeReplaceable` did not use the `With` expression. + // TODO: we should calculate the ref count and also inline the common expression + // if it's ref count is 1. + refToExpr(id) = child + } else { + val alias = Alias(child, s"_common_expr_$index")() + childProjections(childProjectionIndex) += alias + refToExpr(id) = alias.toAttribute } + } + } + + for (i <- inputPlans.indices) { + val projectList = childProjections(i) + if (projectList.nonEmpty) { + inputPlans(i) = Project(inputPlans(i).output ++ projectList, inputPlans(i)) + } + } - newChildren = newChildren.zip(childProjections).map { case (child, projections) => - if (projections.nonEmpty) { - Project(child.output ++ projections, child) - } else { - child - } + child.transformWithPruning(_.containsPattern(COMMON_EXPR_REF)) { + case ref: CommonExpressionRef => + if (!refToExpr.contains(ref.id)) { + throw SparkException.internalError("Undefined common expression id " + ref.id) } + refToExpr(ref.id) + } + case c: ConditionalExpression => + val newAlwaysEvaluatedInputs = c.alwaysEvaluatedInputs.map( + rewriteWithExprAndInputPlans(_, inputPlans)) + val newExpr = c.withNewAlwaysEvaluatedInputs(newAlwaysEvaluatedInputs) + // Use transformUp to handle nested With. + newExpr.transformUpWithPruning(_.containsPattern(WITH_EXPRESSION)) { + case With(child, defs) => + // For With in the conditional branches, they may not be evaluated at all and we can't + // pull the common expressions into a project which will always be evaluated. Inline it. 
+ val refToExpr = defs.map(d => d.id -> d.child).toMap child.transformWithPruning(_.containsPattern(COMMON_EXPR_REF)) { case ref: CommonExpressionRef => refToExpr(ref.id) } } - newPlan = newPlan.withNewChildren(newChildren) - if (p.output == newPlan.output) { - newPlan - } else { - Project(p.output, newPlan) - } + case other => other.mapChildren(rewriteWithExprAndInputPlans(_, inputPlans)) } } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala index c625379eb5ffd..a386e9bf4efe6 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/RewriteWithExpressionSuite.scala @@ -17,9 +17,10 @@ package org.apache.spark.sql.catalyst.optimizer +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, CommonExpressionDef, CommonExpressionRef, With} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Coalesce, CommonExpressionDef, CommonExpressionRef, With} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor @@ -57,7 +58,7 @@ class RewriteWithExpressionSuite extends PlanTest { ) } - test("nested WITH expression") { + test("nested WITH expression in the definition expression") { val a = testRelation.output.head val commonExprDef = CommonExpressionDef(a + a) val ref = new CommonExpressionRef(commonExprDef) @@ -85,6 +86,57 @@ class RewriteWithExpressionSuite extends PlanTest { ) } + test("nested WITH expression in the main expression") { + val a = testRelation.output.head + val commonExprDef = CommonExpressionDef(a + a) + val ref = new CommonExpressionRef(commonExprDef) + val innerExpr = With(ref + ref, Seq(commonExprDef)) + val innerCommonExprName = "_common_expr_0" + + val b = testRelation.output.last + val outerCommonExprDef = CommonExpressionDef(b + b) + val outerRef = new CommonExpressionRef(outerCommonExprDef) + val outerExpr = With(outerRef * outerRef + innerExpr, Seq(outerCommonExprDef)) + val outerCommonExprName = "_common_expr_0" + + val plan = testRelation.select(outerExpr.as("col")) + val rewrittenInnerExpr = (a + a).as(innerCommonExprName) + val rewrittenOuterExpr = (b + b).as(outerCommonExprName) + val finalExpr = rewrittenOuterExpr.toAttribute * rewrittenOuterExpr.toAttribute + + (rewrittenInnerExpr.toAttribute + rewrittenInnerExpr.toAttribute) + comparePlans( + Optimizer.execute(plan), + testRelation + .select((testRelation.output :+ rewrittenInnerExpr): _*) + .select((testRelation.output :+ rewrittenInnerExpr.toAttribute :+ rewrittenOuterExpr): _*) + .select(finalExpr.as("col")) + .analyze + ) + } + + test("correlated nested WITH expression is not supported") { + val b = testRelation.output.last + val outerCommonExprDef = CommonExpressionDef(b + b) + val outerRef = new CommonExpressionRef(outerCommonExprDef) + + val a = testRelation.output.head + // The inner expression definition references the outer expression + val commonExprDef1 = CommonExpressionDef(a + a + outerRef) + val ref1 = new CommonExpressionRef(commonExprDef1) + val innerExpr1 = With(ref1 + ref1, Seq(commonExprDef1)) + + val 
outerExpr1 = With(outerRef + innerExpr1, Seq(outerCommonExprDef)) + intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr1.as("col")))) + + val commonExprDef2 = CommonExpressionDef(a + a) + val ref2 = new CommonExpressionRef(commonExprDef2) + // The inner main expression references the outer expression + val innerExpr2 = With(ref2 + outerRef, Seq(commonExprDef1)) + + val outerExpr2 = With(outerRef + innerExpr2, Seq(outerCommonExprDef)) + intercept[SparkException](Optimizer.execute(testRelation.select(outerExpr2.as("col")))) + } + test("WITH expression in filter") { val a = testRelation.output.head val commonExprDef = CommonExpressionDef(a + a) @@ -154,4 +206,27 @@ class RewriteWithExpressionSuite extends PlanTest { ) ) } + + test("WITH expression inside conditional expression") { + val a = testRelation.output.head + val commonExprDef = CommonExpressionDef(a + a) + val ref = new CommonExpressionRef(commonExprDef) + val expr = Coalesce(Seq(a, With(ref * ref, Seq(commonExprDef)))) + val inlinedExpr = Coalesce(Seq(a, (a + a) * (a + a))) + val plan = testRelation.select(expr.as("col")) + // With in the conditional branches is always inlined. + comparePlans(Optimizer.execute(plan), testRelation.select(inlinedExpr.as("col"))) + + val expr2 = Coalesce(Seq(With(ref * ref, Seq(commonExprDef)), a)) + val plan2 = testRelation.select(expr2.as("col")) + val commonExprName = "_common_expr_0" + // With in the always-evaluated branches can still be optimized. + comparePlans( + Optimizer.execute(plan2), + testRelation + .select((testRelation.output :+ (a + a).as(commonExprName)): _*) + .select(Coalesce(Seq(($"$commonExprName" * $"$commonExprName"), a)).as("col")) + .analyze + ) + } }