Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Optimize read_excel nrows #46894

Merged
merged 15 commits into from
Jun 5, 2022
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,15 @@ def time_read_excel(self, engine):
read_excel(fname, engine=engine)


class ReadExcelNRows(ReadExcel):
    """Benchmark ``read_excel`` when only the first 10 rows are requested."""

    def time_read_excel(self, engine):
        # "xlrd" and "odf" each read from their own fixture attribute;
        # every other engine falls back to ``fname_excel``.
        fixture_attr = {
            "xlrd": "fname_excel_xls",
            "odf": "fname_odf",
        }.get(engine, "fname_excel")
        read_excel(getattr(self, fixture_attr), engine=engine, nrows=10)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ Performance improvements
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :func:`read_excel` when the ``nrows`` argument is provided (:issue:`32727`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.bug_fixes:
Expand Down
30 changes: 28 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
Generic,
Literal,
Mapping,
Sequence,
TypeVar,
cast,
overload,
Expand Down Expand Up @@ -54,7 +55,12 @@
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import is_file_like
from pandas.core.dtypes.common import (
is_bool,
is_file_like,
is_integer,
is_list_like,
)

from pandas.core.shared_docs import _shared_docs

Expand Down Expand Up @@ -175,12 +181,32 @@ def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:


def validate_header_arg(header: object) -> None:
    """
    Validate the ``header`` argument, raising if it is not acceptable.

    Accepted values are ``None``, a non-negative integer, or a list-like
    (sets excluded) whose elements are all non-negative integers.

    Parameters
    ----------
    header : object
        The value to validate.

    Raises
    ------
    ValueError
        If ``header`` is a negative integer, a list-like containing a
        non-integer or a negative integer, or any other unsupported value
        (including a set).
    TypeError
        If ``header`` is a bool.
    """
    if header is None:
        return
    if is_integer(header):
        header = cast(int, header)
        if header < 0:
            # GH 27779
            raise ValueError(
                "Passing negative integer to header is invalid. "
                "For no header, use header=None instead"
            )
        return
    # allow_sets=False: a set has no order and no positional indexing, so it
    # is not a usable list of header rows; it falls through to the final
    # ValueError below.
    if is_list_like(header, allow_sets=False):
        header = cast(Sequence, header)
        if not all(map(is_integer, header)):
            raise ValueError("header must be integer or list of integers")
        if any(i < 0 for i in header):
            raise ValueError("cannot specify multi-index header with negative integers")
        return
    if is_bool(header):
        raise TypeError(
            "Passing a bool to header is invalid. Use header=None for no header or "
            "header=int or list-like of ints to specify "
            "the row(s) making up the column names"
        )
    # GH 16338
    raise ValueError("header must be integer or list of integers")


@overload
Expand Down
101 changes: 99 additions & 2 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import abc
import datetime
from functools import partial
from io import BytesIO
import os
from textwrap import fill
Expand Down Expand Up @@ -70,6 +71,7 @@
pop_header_name,
)
from pandas.io.parsers import TextParser
from pandas.io.parsers.readers import validate_integer

_read_excel_doc = (
"""
Expand Down Expand Up @@ -563,7 +565,7 @@ def get_sheet_by_index(self, index: int):
pass

@abc.abstractmethod
def get_sheet_data(
    self, sheet, convert_float: bool, file_rows_needed: int | None = None
):
    """
    Return the cell data of ``sheet``; implemented by each engine reader.

    Parameters
    ----------
    sheet : object
        Engine-specific sheet object.
    convert_float : bool
        Forwarded to the engine's cell conversion.
    file_rows_needed : int or None, default None
        Upper bound on the number of sheet rows to read. ``None`` means
        read every row. Named to match the concrete reader overrides.
    """
    pass

def raise_if_bad_sheet_by_index(self, index: int) -> None:
Expand All @@ -577,6 +579,99 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
if name not in self.sheet_names:
raise ValueError(f"Worksheet named '{name}' not found")

def _check_skiprows_func(
self,
skiprows: Callable,
rows_to_use: int,
) -> int:
"""
Determine how many file rows are required to obtain `nrows` data
rows when `skiprows` is a function.

Parameters
----------
skiprows : function
The function passed to read_excel by the user.
rows_to_use : int
The number of rows that will be needed for the header and
the data.

Returns
-------
int
"""
i = 0
rows_used_so_far = 0
while rows_used_so_far < rows_to_use:
if not skiprows(i):
rows_used_so_far += 1
i += 1
return i

def _calc_rows(
self,
header: int | Sequence[int] | None,
index_col: int | Sequence[int] | None,
skiprows: Sequence[int] | int | Callable[[int], object] | None,
nrows: int | None,
) -> int | None:
"""
If nrows specified, find the number of rows needed from the
file, otherwise return None.


Parameters
----------
header : int, list of int, or None
See read_excel docstring.
index_col : int, list of int, or None
See read_excel docstring.
skiprows : list-like, int, callable, or None
See read_excel docstring.
nrows : int or None
See read_excel docstring.

Returns
-------
int or None
"""
if nrows is None:
return None
if header is None:
header_rows = 1
elif is_integer(header):
header = cast(int, header)
header_rows = 1 + header
else:
header = cast(Sequence, header)
header_rows = 1 + header[-1]
rhshadrach marked this conversation as resolved.
Show resolved Hide resolved
# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
if is_list_like(header) and index_col is not None:
header = cast(Sequence, header)
if len(header) > 1:
header_rows += 1
if skiprows is None:
return header_rows + nrows
if is_integer(skiprows):
skiprows = cast(int, skiprows)
return header_rows + nrows + skiprows
if is_list_like(skiprows):

def f(skiprows: Sequence, x: int) -> bool:
return x in skiprows

skiprows = cast(Sequence, skiprows)
return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
if callable(skiprows):
return self._check_skiprows_func(
skiprows,
header_rows + nrows,
)
ahawryluk marked this conversation as resolved.
Show resolved Hide resolved
# else unexpected skiprows type: read_excel will not optimize
# the number of rows read from file
return None

def parse(
self,
sheet_name: str | int | list[int] | list[str] | None = 0,
Expand Down Expand Up @@ -613,6 +708,7 @@ def parse(
)

validate_header_arg(header)
validate_integer("nrows", nrows)

ret_dict = False

Expand Down Expand Up @@ -643,7 +739,8 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

data = self.get_sheet_data(sheet, convert_float)
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self, sheet, convert_float: bool
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
Expand Down Expand Up @@ -148,6 +148,8 @@ def get_sheet_data(
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
if file_rows_needed is not None and len(table) >= file_rows_needed:
break

# Make our table square
for row in table:
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:

if self.book.read_only:
sheet.reset_dimensions()
Expand All @@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]
Expand Down
9 changes: 8 additions & 1 deletion pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
prevous_row_number = -1
# When sparse=True the rows can have different lengths and empty rows are
Expand All @@ -94,6 +99,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
data.extend([[]] * (row_number - prevous_row_number - 1))
data.append(converted_row)
prevous_row_number = row_number
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
if data:
# extend rows to max_width
max_width = max(len(data_row) for data_row in data)
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from __future__ import annotations

from datetime import time

import numpy as np

from pandas._typing import StorageOptions
from pandas._typing import (
Scalar,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc

Expand Down Expand Up @@ -56,7 +61,9 @@ def get_sheet_by_index(self, index):
self.raise_if_bad_sheet_by_index(index)
return self.book.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
XL_CELL_DATE,
Expand Down Expand Up @@ -107,7 +114,10 @@ def _parse_cell(cell_contents, cell_typ):

data = []

for i in range(sheet.nrows):
nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down
29 changes: 6 additions & 23 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,13 +120,7 @@ def __init__(self, kwds) -> None:

# validate header options for mi
self.header = kwds.get("header")
if isinstance(self.header, (list, tuple, np.ndarray)):
if not all(map(is_integer, self.header)):
raise ValueError("header must be integer or list of integers")
if any(i < 0 for i in self.header):
raise ValueError(
"cannot specify multi-index header with negative integers"
)
Comment on lines -123 to -129
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be missing it, is validate_header_arg called somewhere instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but it took me a while to find it. It's called earlier in TextFileReader:

  pandas/io/parsers/readers.py(1847)TextParser()
-> return TextFileReader(*args, **kwds)

  pandas/io/parsers/readers.py(1412)__init__()
-> self.options, self.engine = self._clean_options(options, engine)

  pandas/io/parsers/readers.py(1607)_clean_options()
-> validate_header_arg(options["header"])

if is_list_like(self.header):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allow_sets=False

if kwds.get("usecols"):
raise ValueError(
"cannot specify usecols when specifying a multi-index header"
Expand All @@ -138,31 +132,20 @@ def __init__(self, kwds) -> None:

# validate index_col that only contains integers
if self.index_col is not None:
is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
if not (
is_sequence
is_list_like(self.index_col)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allow_sets=False

and all(map(is_integer, self.index_col))
or is_integer(self.index_col)
):
raise ValueError(
"index_col must only contain row numbers "
"when specifying a multi-index header"
)
elif self.header is not None:
elif self.header is not None and self.prefix is not None:
# GH 27394
if self.prefix is not None:
raise ValueError(
"Argument prefix must be None if argument header is not None"
)
# GH 16338
elif not is_integer(self.header):
raise ValueError("header must be integer or list of integers")
# GH 27779
elif self.header < 0:
raise ValueError(
"Passing negative integer to header is invalid. "
"For no header, use header=None instead"
)
raise ValueError(
"Argument prefix must be None if argument header is not None"
)

self._name_processed = False

Expand Down
Loading