Skip to content

Commit

Permalink
PERF: Optimize read_excel nrows (#46894)
Browse files Browse the repository at this point in the history
  • Loading branch information
ahawryluk committed Jun 5, 2022
1 parent 9b77039 commit 9e10206
Show file tree
Hide file tree
Showing 10 changed files with 210 additions and 33 deletions.
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,15 @@ def time_read_excel(self, engine):
read_excel(fname, engine=engine)


class ReadExcelNRows(ReadExcel):
    """Benchmark ``read_excel`` when only the first ten rows are requested."""

    def time_read_excel(self, engine):
        """Read ten rows with ``engine`` from the file attribute matching it."""
        # "xlrd" and "odf" each read from their own file-name attribute;
        # every other engine falls back to ``fname_excel``.
        if engine not in ("xlrd", "odf"):
            path = self.fname_excel
        elif engine == "xlrd":
            path = self.fname_excel_xls
        else:
            path = self.fname_odf
        read_excel(path, engine=engine, nrows=10)


from ..pandas_vb_common import setup # noqa: F401 isort:skip
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,7 @@ Performance improvements
- Performance improvement when setting values in a pyarrow backed string array (:issue:`46400`)
- Performance improvement in :func:`factorize` (:issue:`46109`)
- Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`)
- Performance improvement in :func:`read_excel` when the ``nrows`` argument is provided (:issue:`32727`)

.. ---------------------------------------------------------------------------
.. _whatsnew_150.bug_fixes:
Expand Down
30 changes: 28 additions & 2 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
Generic,
Literal,
Mapping,
Sequence,
TypeVar,
cast,
overload,
Expand Down Expand Up @@ -58,7 +59,12 @@
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import is_file_like
from pandas.core.dtypes.common import (
is_bool,
is_file_like,
is_integer,
is_list_like,
)

from pandas.core.shared_docs import _shared_docs

Expand Down Expand Up @@ -175,12 +181,32 @@ def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:


def validate_header_arg(header: object) -> None:
    """
    Check that ``header`` is an acceptable header specification.

    ``None``, a non-negative integer, and a list-like (sets excluded) whose
    entries are all non-negative integers are accepted; anything else raises.

    Parameters
    ----------
    header : object
        The value to validate.

    Raises
    ------
    TypeError
        If ``header`` is a bool.
    ValueError
        If ``header`` is a negative integer, a list-like containing a
        non-integer or a negative integer, or any other unsupported value.
    """
    if header is None:
        return
    if is_integer(header):
        header = cast(int, header)
        if header < 0:
            # GH 27779
            raise ValueError(
                "Passing negative integer to header is invalid. "
                "For no header, use header=None instead"
            )
        return
    if is_list_like(header, allow_sets=False):
        header = cast(Sequence, header)
        if not all(map(is_integer, header)):
            raise ValueError("header must be integer or list of integers")
        if any(i < 0 for i in header):
            raise ValueError(
                "cannot specify multi-index header with negative integers"
            )
        return
    # Bools are reported separately so the message can point the caller at
    # the supported alternatives.
    if is_bool(header):
        raise TypeError(
            "Passing a bool to header is invalid. Use header=None for no header or "
            "header=int or list-like of ints to specify "
            "the row(s) making up the column names"
        )
    # GH 16338
    raise ValueError("header must be integer or list of integers")


@overload
Expand Down
101 changes: 99 additions & 2 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import abc
import datetime
from functools import partial
from io import BytesIO
import os
from textwrap import fill
Expand Down Expand Up @@ -70,6 +71,7 @@
pop_header_name,
)
from pandas.io.parsers import TextParser
from pandas.io.parsers.readers import validate_integer

_read_excel_doc = (
"""
Expand Down Expand Up @@ -563,7 +565,7 @@ def get_sheet_by_index(self, index: int):
pass

@abc.abstractmethod
def get_sheet_data(
    self, sheet, convert_float: bool, file_rows_needed: int | None = None
):
    """
    Return the contents of ``sheet``; each engine-specific reader implements this.

    Parameters
    ----------
    sheet
        Sheet object as returned by ``get_sheet_by_name`` or
        ``get_sheet_by_index``.
    convert_float : bool
        Forwarded unchanged from ``parse``.
    file_rows_needed : int or None, default None
        Number of leading file rows the caller needs, as computed by
        ``_calc_rows``. ``None`` means no limit was requested; otherwise an
        implementation may stop reading once it has this many rows.
    """
    pass

def raise_if_bad_sheet_by_index(self, index: int) -> None:
Expand All @@ -577,6 +579,99 @@ def raise_if_bad_sheet_by_name(self, name: str) -> None:
if name not in self.sheet_names:
raise ValueError(f"Worksheet named '{name}' not found")

def _check_skiprows_func(
self,
skiprows: Callable,
rows_to_use: int,
) -> int:
"""
Determine how many file rows are required to obtain `nrows` data
rows when `skiprows` is a function.
Parameters
----------
skiprows : function
The function passed to read_excel by the user.
rows_to_use : int
The number of rows that will be needed for the header and
the data.
Returns
-------
int
"""
i = 0
rows_used_so_far = 0
while rows_used_so_far < rows_to_use:
if not skiprows(i):
rows_used_so_far += 1
i += 1
return i

def _calc_rows(
self,
header: int | Sequence[int] | None,
index_col: int | Sequence[int] | None,
skiprows: Sequence[int] | int | Callable[[int], object] | None,
nrows: int | None,
) -> int | None:
"""
If nrows specified, find the number of rows needed from the
file, otherwise return None.
Parameters
----------
header : int, list of int, or None
See read_excel docstring.
index_col : int, list of int, or None
See read_excel docstring.
skiprows : list-like, int, callable, or None
See read_excel docstring.
nrows : int or None
See read_excel docstring.
Returns
-------
int or None
"""
if nrows is None:
return None
if header is None:
header_rows = 1
elif is_integer(header):
header = cast(int, header)
header_rows = 1 + header
else:
header = cast(Sequence, header)
header_rows = 1 + header[-1]
# If there is a MultiIndex header and an index then there is also
# a row containing just the index name(s)
if is_list_like(header) and index_col is not None:
header = cast(Sequence, header)
if len(header) > 1:
header_rows += 1
if skiprows is None:
return header_rows + nrows
if is_integer(skiprows):
skiprows = cast(int, skiprows)
return header_rows + nrows + skiprows
if is_list_like(skiprows):

def f(skiprows: Sequence, x: int) -> bool:
return x in skiprows

skiprows = cast(Sequence, skiprows)
return self._check_skiprows_func(partial(f, skiprows), header_rows + nrows)
if callable(skiprows):
return self._check_skiprows_func(
skiprows,
header_rows + nrows,
)
# else unexpected skiprows type: read_excel will not optimize
# the number of rows read from file
return None

def parse(
self,
sheet_name: str | int | list[int] | list[str] | None = 0,
Expand Down Expand Up @@ -613,6 +708,7 @@ def parse(
)

validate_header_arg(header)
validate_integer("nrows", nrows)

ret_dict = False

Expand Down Expand Up @@ -643,7 +739,8 @@ def parse(
else: # assume an integer if not a string
sheet = self.get_sheet_by_index(asheetname)

data = self.get_sheet_data(sheet, convert_float)
file_rows_needed = self._calc_rows(header, index_col, skiprows, nrows)
data = self.get_sheet_data(sheet, convert_float, file_rows_needed)
if hasattr(sheet, "close"):
# pyxlsb opens two TemporaryFiles
sheet.close()
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_sheet_by_name(self, name: str):
raise ValueError(f"sheet {name} not found")

def get_sheet_data(
self, sheet, convert_float: bool
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar | NaTType]]:
"""
Parse an ODF Table into a list of lists
Expand Down Expand Up @@ -148,6 +148,8 @@ def get_sheet_data(
empty_rows = 0
for _ in range(row_repeat):
table.append(table_row)
if file_rows_needed is not None and len(table) >= file_rows_needed:
break

# Make our table square
for row in table:
Expand Down
6 changes: 5 additions & 1 deletion pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,9 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.value

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:

if self.book.read_only:
sheet.reset_dimensions()
Expand All @@ -603,6 +605,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
if converted_row:
last_row_with_data = row_number
data.append(converted_row)
if file_rows_needed is not None and len(data) >= file_rows_needed:
break

# Trim trailing empty rows
data = data[: last_row_with_data + 1]
Expand Down
9 changes: 8 additions & 1 deletion pandas/io/excel/_pyxlsb.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,12 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:

return cell.v

def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
def get_sheet_data(
self,
sheet,
convert_float: bool,
file_rows_needed: int | None = None,
) -> list[list[Scalar]]:
data: list[list[Scalar]] = []
prevous_row_number = -1
# When sparse=True the rows can have different lengths and empty rows are
Expand All @@ -94,6 +99,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> list[list[Scalar]]:
data.extend([[]] * (row_number - prevous_row_number - 1))
data.append(converted_row)
prevous_row_number = row_number
if file_rows_needed is not None and len(data) >= file_rows_needed:
break
if data:
# extend rows to max_width
max_width = max(len(data_row) for data_row in data)
Expand Down
16 changes: 13 additions & 3 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from __future__ import annotations

from datetime import time

import numpy as np

from pandas._typing import StorageOptions
from pandas._typing import (
Scalar,
StorageOptions,
)
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc

Expand Down Expand Up @@ -56,7 +61,9 @@ def get_sheet_by_index(self, index):
self.raise_if_bad_sheet_by_index(index)
return self.book.sheet_by_index(index)

def get_sheet_data(self, sheet, convert_float):
def get_sheet_data(
self, sheet, convert_float: bool, file_rows_needed: int | None = None
) -> list[list[Scalar]]:
from xlrd import (
XL_CELL_BOOLEAN,
XL_CELL_DATE,
Expand Down Expand Up @@ -107,7 +114,10 @@ def _parse_cell(cell_contents, cell_typ):

data = []

for i in range(sheet.nrows):
nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
Expand Down
29 changes: 6 additions & 23 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,7 @@ def __init__(self, kwds) -> None:

# validate header options for mi
self.header = kwds.get("header")
if isinstance(self.header, (list, tuple, np.ndarray)):
if not all(map(is_integer, self.header)):
raise ValueError("header must be integer or list of integers")
if any(i < 0 for i in self.header):
raise ValueError(
"cannot specify multi-index header with negative integers"
)
if is_list_like(self.header, allow_sets=False):
if kwds.get("usecols"):
raise ValueError(
"cannot specify usecols when specifying a multi-index header"
Expand All @@ -139,31 +133,20 @@ def __init__(self, kwds) -> None:

# validate index_col that only contains integers
if self.index_col is not None:
is_sequence = isinstance(self.index_col, (list, tuple, np.ndarray))
if not (
is_sequence
is_list_like(self.index_col, allow_sets=False)
and all(map(is_integer, self.index_col))
or is_integer(self.index_col)
):
raise ValueError(
"index_col must only contain row numbers "
"when specifying a multi-index header"
)
elif self.header is not None:
elif self.header is not None and self.prefix is not None:
# GH 27394
if self.prefix is not None:
raise ValueError(
"Argument prefix must be None if argument header is not None"
)
# GH 16338
elif not is_integer(self.header):
raise ValueError("header must be integer or list of integers")
# GH 27779
elif self.header < 0:
raise ValueError(
"Passing negative integer to header is invalid. "
"For no header, use header=None instead"
)
raise ValueError(
"Argument prefix must be None if argument header is not None"
)

self._name_processed = False

Expand Down
Loading

0 comments on commit 9e10206

Please sign in to comment.