From 7be00b6082f287817853c5b16e0dd12baded7763 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Fri, 10 Nov 2023 18:38:28 +0000 Subject: [PATCH] Add DataFrame.persist, and notes on execution model (#307) * wip: add notes on execution model * reword * remove column mentions for now * remove to_array * use persist instead * remove note on propagation * update purpose and scope * reduce execution_model * Update spec/API_specification/dataframe_api/dataframe_object.py Co-authored-by: Ralf Gommers * Update spec/purpose_and_scope.md --------- Co-authored-by: Ralf Gommers --- .../dataframe_api/dataframe_object.py | 35 +++++++++++++ spec/design_topics/execution_model.md | 49 +++++++++++++++++++ spec/design_topics/index.rst | 1 + spec/purpose_and_scope.md | 3 +- 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 spec/design_topics/execution_model.md diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 359171bb..142090e8 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -929,3 +929,38 @@ def join( present in both `self` and `other`. """ ... + + def persist(self) -> Self: + """Hint that computation prior to this point should not be repeated. + + This is intended as a hint, rather than as a directive. Implementations + which do not separate lazy vs eager execution may ignore this method and + treat it as a no-op. + + .. note:: + This method may trigger execution. If necessary, it should be called + at most once per dataframe, and as late as possible in the pipeline. + + For example, do this + + .. code-block:: python + + df: DataFrame + df = df.persist() + features = [] + for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) + + instead of this: + + .. 
code-block:: python + + df: DataFrame + features = [] + for column_name in df.column_names: + # Do NOT do this! + if df.persist().col(column_name).std() > 0: + features.append(column_name) + """ + ... diff --git a/spec/design_topics/execution_model.md b/spec/design_topics/execution_model.md new file mode 100644 index 00000000..c81c7767 --- /dev/null +++ b/spec/design_topics/execution_model.md @@ -0,0 +1,49 @@ +# Execution model + +## Scope + +The vast majority of the Dataframe API is designed to be agnostic of the +underlying execution model. + +However, there are some methods which, depending on the implementation, may +not be supported in some cases. + +For example, let's consider the following: +```python +df: DataFrame +features = [] +for column_name in df.column_names: + if df.col(column_name).std() > 0: + features.append(column_name) +return features +``` +If `df` is a lazy dataframe, then the call `df.col(column_name).std() > 0` returns +a (ducktyped) Python boolean scalar. No issues so far. Problem is, +what happens when `if df.col(column_name).std() > 0` is called? + +Under the hood, Python will call `(df.col(column_name).std() > 0).__bool__()` in +order to extract a Python boolean. This is a problem for "lazy" implementations, +as the laziness needs breaking in order to evaluate the above. + +Dask and Polars both require that `.compute` (resp. `.collect`) be called beforehand +for such an operation to be executed: + ```python + In [1]: import dask.dataframe as dd + + In [2]: pandas_df = pd.DataFrame({"x": [1, 2, 3], "y": 1}) + + In [3]: df = dd.from_pandas(pandas_df, npartitions=2) + + In [4]: scalar = df.x.std() > 0 + + In [5]: if scalar: + ...: print('scalar is positive') + ...: + --------------------------------------------------------------------------- + [...] + + TypeError: Trying to convert dd.Scalar to a boolean value. 
Because Dask objects are lazily evaluated, they cannot be converted to a boolean value or used in boolean conditions like if statements. Try calling .compute() to force computation prior to converting to a boolean value or using in a conditional statement. ``` + +Whether such computation succeeds or raises is currently not defined by the Standard and may vary across +implementations. diff --git a/spec/design_topics/index.rst b/spec/design_topics/index.rst index 023da1a9..9f11ca89 100644 --- a/spec/design_topics/index.rst +++ b/spec/design_topics/index.rst @@ -8,3 +8,4 @@ Design topics & constraints backwards_compatibility data_interchange python_builtin_types + execution_model diff --git a/spec/purpose_and_scope.md b/spec/purpose_and_scope.md index dfc6d138..f09a2e25 100644 --- a/spec/purpose_and_scope.md +++ b/spec/purpose_and_scope.md @@ -125,9 +125,10 @@ See the [use cases](use_cases.md) section for details on the exact use cases con Implementation details of the dataframes and execution of operations. This includes: - How data is represented and stored (whether the data is in memory, disk, distributed) -- Expectations on when the execution is happening (in an eager or lazy way) +- Expectations on when the execution is happening (in an eager or lazy way) (see the `execution model` design topic for some caveats) - Other execution details + **Rationale:** The API defined in this document needs to be used by libraries as diverse as Ibis, Dask, Vaex or cuDF. The data can live in databases, distributed systems, disk or GPU memory. Any decision that involves assumptions on where the data is stored, or where execution happens