Skip to content

Commit

Permalink
Merge pull request #712 from DHI/from_pandas
Browse files Browse the repository at this point in the history
Create dataset from {pandas,polars} dataframe
  • Loading branch information
ecomodeller committed Aug 7, 2024
2 parents 5c512c0 + f6413bf commit 822fd18
Show file tree
Hide file tree
Showing 11 changed files with 429 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@ docs/api/
.venv/

.testmondata
objects.json
objects.json
.jupyter_cache/
2 changes: 2 additions & 0 deletions docs/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ quartodoc:
- open
- read
- read_pfs
- from_pandas
- from_polars
- title: Dataset
desc: ""
contents:
Expand Down
23 changes: 23 additions & 0 deletions docs/user-guide/dfs0.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,29 @@ df = pd.read_csv(
df.to_dfs0("mauna_loa_co2.dfs0")
```

```{python}
import polars as pl
import mikeio
from datetime import datetime
df = pl.DataFrame(
{
"time": [datetime(2021, 1, 1), datetime(2021, 1, 2)],
"A": [1.0, 2.0],
"B": [4.0, 5.0],
}
)
ds = mikeio.from_polars(
df,
items={
"A": mikeio.ItemInfo(mikeio.EUMType.Water_Level),
"B": mikeio.ItemInfo(mikeio.EUMType.Discharge),
},
)
ds
```

## Dfs0 example notebooks

* [Dfs0](https://nbviewer.jupyter.org/github/DHI/mikeio/blob/main/notebooks/Dfs0%20-%20Timeseries.ipynb) - read, write, to_dataframe, non-equidistant, accumulated timestep, extrapolation
Expand Down
4 changes: 3 additions & 1 deletion mikeio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
if "64" not in architecture()[0]:
raise Exception("This library has not been tested for a 32 bit system.")

from .dataset import DataArray, Dataset
from .dataset import DataArray, Dataset, from_pandas, from_polars
from .dfs import Dfs0, Dfs1, Dfs2, Dfs3
from .dfsu import Dfsu, Mesh
from .eum import EUMType, EUMUnit, ItemInfo
Expand Down Expand Up @@ -209,4 +209,6 @@ def open(filename: str | Path, **kwargs: Any) -> Any:
"read_xyz",
"read",
"open",
"from_pandas",
"from_polars",
]
4 changes: 2 additions & 2 deletions mikeio/dataset/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ._dataarray import DataArray
from ._dataset import Dataset
from ._dataset import Dataset, from_pandas, from_polars

__all__ = ["DataArray", "Dataset"]
__all__ = ["DataArray", "Dataset", "from_pandas", "from_polars"]
11 changes: 6 additions & 5 deletions mikeio/dataset/_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,12 @@ def _guess_dims(
ndim_no_time = ndim if (len(dims) == 0) else ndim - 1

if isinstance(geometry, GeometryUndefined):
DIMS_MAPPING = {
0: tuple(),
1: ("x",),
2: ("y", "x"),
3: ("z", "y", "x"),

DIMS_MAPPING: Mapping[int, Sequence[Any]] = {
0: [],
1: ["x"],
2: ["y", "x"],
3: ["z", "y", "x"],
}
spdims = DIMS_MAPPING[ndim_no_time]
else:
Expand Down
165 changes: 165 additions & 0 deletions mikeio/dataset/_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

if TYPE_CHECKING:
import xarray
import polars as pl

from ._dataarray import DataArray
from ._data_utils import _to_safe_name, _get_time_idx_list, _n_selected_timesteps
Expand Down Expand Up @@ -1897,3 +1898,167 @@ def __repr__(self) -> str:
out.append(f" {i}: {item}")

return str.join("\n", out)


def from_pandas(
    df: pd.DataFrame,
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None = None,
) -> "Dataset":
    """Create a Dataset from a pandas DataFrame.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame with time index
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None, optional
        Mapping of item names to ItemInfo objects, or a sequence of ItemInfo objects, or a single ItemInfo object.

    Returns
    -------
    Dataset
        time series dataset

    Raises
    ------
    ValueError
        If the DataFrame index is not a DatetimeIndex and no datetime
        column can be found.

    Examples
    --------
    ```{python}
    import pandas as pd
    import mikeio
    df = pd.DataFrame(
        {
            "A": [1, 2, 3],
            "B": [4, 5, 6],
        },
        index=pd.date_range("20210101", periods=3, freq="D"),
    )
    ds = mikeio.from_pandas(df, items={"A": mikeio.ItemInfo(mikeio.EUMType.Water_Level),
                                       "B": mikeio.ItemInfo(mikeio.EUMType.Discharge)})
    ds
    ```
    """

    if not isinstance(df.index, pd.DatetimeIndex):
        # look for a datetime column to use as the index
        datetime_cols = [
            col
            for col in df.columns
            # dtype check is robust for empty frames; the iloc[0] fallback
            # keeps object-dtype columns holding Timestamps working as before
            if pd.api.types.is_datetime64_any_dtype(df[col])
            or (len(df) > 0 and isinstance(df[col].iloc[0], pd.Timestamp))
        ]
        if not datetime_cols:
            raise ValueError(
                "Dataframe index must be a DatetimeIndex or contain a datetime column."
            )
        # set_index returns a new frame; assigning df.index in place would
        # mutate the caller's DataFrame as a side effect
        df = df.set_index(datetime_cols[0])

    # extract per-column to preserve each column's dtype
    # (df.values upcasts mixed-dtype frames to a common dtype)
    data = [df[col].to_numpy() for col in df.columns]

    item_list = _parse_items(df.columns, items)

    das = {
        item.name: DataArray(data=d, item=item, time=df.index)
        for d, item in zip(data, item_list)
    }
    return Dataset(das)


def from_polars(
    df: "pl.DataFrame",
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None = None,
    datetime_col: str | None = None,
) -> "Dataset":
    """Create a Dataset from a polars DataFrame.

    Parameters
    ----------
    df: pl.DataFrame
        DataFrame
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None, optional
        Mapping of item names to ItemInfo objects, or a sequence of ItemInfo objects, or a single ItemInfo object.
    datetime_col: str, optional
        Name of the column containing datetime information, default is to use the first datetime column found.

    Returns
    -------
    Dataset
        time series dataset

    Examples
    --------
    ```{python}
    import polars as pl
    import mikeio
    from datetime import datetime
    df = pl.DataFrame(
        {
            "time": [datetime(2021, 1, 1), datetime(2021, 1, 2)],
            "A": [1.0, 2.0],
            "B": [4.0, 5.0],
        }
    )
    ds = mikeio.from_polars(
        df,
        items={
            "A": mikeio.ItemInfo(mikeio.EUMType.Water_Level),
            "B": mikeio.ItemInfo(mikeio.EUMType.Discharge),
        },
    )
    ds
    ```
    """

    # local import: polars is an optional dependency of mikeio
    import polars as pl

    if datetime_col is None:
        # default to the first Datetime-typed column, if any
        datetime_col = next(
            (
                name
                for name, dtype in zip(df.columns, df.dtypes)
                if isinstance(dtype, pl.Datetime)
            ),
            None,
        )

    if datetime_col is None:
        raise ValueError("Datetime column not found. Please specify datetime_col.")

    time = pd.DatetimeIndex(df[datetime_col])
    values = df.drop(datetime_col)

    # one numpy array per remaining column (transpose yields column views)
    columns = list(values.to_numpy().T)

    item_list = _parse_items(values.columns, items)

    das = {}
    for item, series in zip(item_list, columns):
        das[item.name] = DataArray(data=series, item=item, time=time)
    return Dataset(das)


def _parse_items(
    column_names: Sequence[str],
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None = None,
) -> List[ItemInfo]:
    """Build one ItemInfo per column from the user-supplied `items` argument.

    Parameters
    ----------
    column_names: Sequence[str]
        Names of the data columns (datetime column already removed).
    items: Mapping[str, ItemInfo] | Sequence[ItemInfo] | ItemInfo | None, optional
        None -> default ItemInfo per column;
        single ItemInfo -> same type/unit applied to every column;
        mapping -> per-column lookup by name;
        sequence -> positional, must match the number of columns.

    Returns
    -------
    List[ItemInfo]
        One ItemInfo per column, named after the column.

    Raises
    ------
    KeyError
        If a mapping is missing an entry for one or more columns.
    ValueError
        If a sequence has a different length than the number of columns.
    TypeError
        If items is not one of the accepted types.
    """
    if items is None:
        return [ItemInfo(name) for name in column_names]

    if isinstance(items, ItemInfo):
        # broadcast a single type/unit to all columns
        return [ItemInfo(name, items.type, items.unit) for name in column_names]

    if isinstance(items, Mapping):
        # fail with an informative message instead of a bare KeyError
        missing = [name for name in column_names if name not in items]
        if missing:
            raise KeyError(f"Missing ItemInfo for column(s): {missing}")
        return [
            ItemInfo(name, items[name].type, items[name].unit) for name in column_names
        ]

    if isinstance(items, Sequence):
        # zip would silently drop columns on a length mismatch — reject instead
        if len(items) != len(column_names):
            raise ValueError(
                f"Number of items ({len(items)}) does not match "
                f"number of columns ({len(column_names)})"
            )
        return [
            ItemInfo(col, item.type, item.unit)
            for col, item in zip(column_names, items)
        ]

    raise TypeError("items must be a mapping, sequence or ItemInfo")
2 changes: 1 addition & 1 deletion mikeio/dfs/_dfs0.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ def dataframe_to_dfs0(
unit: EUMUnit, optional
Same unit for all items
items: list[ItemInfo]
Different types, units for each items, similar to `create`
Different types, units for each item
title: str, optional
Title of dfs0 file
dtype : np.dtype, optional
Expand Down
120 changes: 81 additions & 39 deletions notebooks/Dfs0 - Timeseries.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dev = ["pytest",
"mypy==1.11.1",
]

test = ["pytest", "pytest-cov", "xarray","mypy==1.6.1","shapely","pyproj"]
test = ["pytest", "pytest-cov", "xarray","mypy==1.6.1","shapely","pyproj", "polars"]

notebooks= [
"nbformat",
Expand Down
Loading

0 comments on commit 822fd18

Please sign in to comment.