From 7be00b6082f287817853c5b16e0dd12baded7763 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 10 Nov 2023 18:38:28 +0000 Subject: [PATCH] Add DataFrame.persist, and notes on execution model (#307) * wip: add notes on execution model * reword * remove column mentions for now * remove to_array * use persist instead * remove note on propagation * update purpose and scope * reduce execution_model * Update spec/API_specification/dataframe_api/dataframe_object.py Co-authored-by: Ralf Gommers * Update spec/purpose_and_scope.md --------- Co-authored-by: Ralf Gommers --- .../dataframe_api/dataframe_object.py | 35 +++++++++++++ spec/design_topics/execution_model.md | 49 +++++++++++++++++++ spec/design_topics/index.rst | 1 + spec/purpose_and_scope.md | 3 +- 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 spec/design_topics/execution_model.md diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 359171bb..142090e8 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -929,3 +929,38 @@ def join( present in both `self` and `other`. """ ... + + def persist(self) -> Self: + """Hint that computation prior to this point should not be repeated. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This method may trigger execution. If necessary, it should be called + at most once per dataframe, and as late as possible in the pipeline. + + For example, do this + + .. code-block:: python + + df: DataFrame + df = df.persist() + features = [] + for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) + + instead of this: + + .. 
code-block:: python + + df: DataFrame + features = [] + for column_name in df.column_names: + # Do NOT do this! + if df.persist().col(column_name).std() > 0: + features.append(column_name) + """ + ... diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md new file mode 100644 index 00000000..c81c7767 --- /dev/null +++ b/spec/design_topics/execution_model.md @@ -0,0 +1,49 @@ +# Execution model + +## Scope + +The vast majority of the Dataframe API is designed to be agnostic of the +underlying execution model. + +However, there are some methods which, depending on the implementation, may +not be supported in some cases. + +For example, let's consider the following: +```python +df: DataFrame +features = [] +for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) +return features +``` +If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns +a (ducktyped) Python boolean scalar. No issues so far. Problem is, +what happens when `if df.col(column_name).std() > 0` is called? + +Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in +order to extract a Python boolean. This is a problem for "lazy" implementations, +as the laziness needs breaking in order to evaluate the above. + +Dask and Polars both require that `.compute` (resp. `.collect`) be called beforehand +for such an operation to be executed: + ```python + In [1]: import dask.dataframe as dd + + In [2]: pandas_df = pd.DataFrame({"x": [1, 2, 3], "y": 1}) + + In [3]: df = dd.from_pandas(pandas_df, npartitions=2) + + In [4]: scalar = df.x.std() > 0 + + In [5]: if scalar: + ...: print('scalar is positive') + ...: + --------------------------------------------------------------------------- + [...] + + TypeError: Trying to convert dd.Scalar to a boolean value. 
Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` + +Whether such computation succeeds or raises is currently not defined by the Standard and may vary across +implementations. diff --git a/spec/design_topics/index.rst b/spec/design_topics/index.rst index 023da1a9..9f11ca89 100644 --- a/spec/design_topics/index.rst +++ b/spec/design_topics/index.rst @@ -8,3 +8,4 @@ Design topics & constraints backwards_compatibility data_interchange python_builtin_types + execution_model diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index dfc6d138..f09a2e25 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -125,9 +125,10 @@ See the [use cases](use_cases.md) section for details on the exact use cases con Implementation details of the dataframes and execution of operations. This includes: - How data is represented and stored (whether the data is in memory, disk, distributed) -- Expectations on when the execution is happening (in an eager or lazy way) +- Expectations on when the execution is happening (in an eager or lazy way) (see the `execution model` design topic for some caveats) - Other execution details + **Rationale:** The API defined in this document needs to be used by libraries as diverse as Ibis, Dask, Vaex or cuDF. The data can live in databases, distributed systems, disk or GPU memory. Any decision that involves assumptions on where the data is stored, or where execution happens