Skip to content

Commit

Permalink
Fix an issue with reading raw string in cudf.read_json (NVIDIA#10924)
Browse files Browse the repository at this point in the history
Fixes issue described here: rapidsai/cudf#10275 (comment)

This PR removes a false error that claims a path couldn't be resolved. That claim isn't true in the case of a JSON reader, where the input can itself be a JSON string. To resolve this issue, the new `allow_raw_text_input` flag indicates that an IO reader (like `read_json`) is calling the utility function and that the path need not be a valid one. In the case of a genuinely invalid path, either fsspec or libcudf throws a file-not-found error.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai/cudf#10924
  • Loading branch information
galipremsagar authored May 23, 2022
1 parent d1a0191 commit 5067cc7
Show file tree
Hide file tree
Showing 7 changed files with 29 additions and 17 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/avro.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def read_avro(
"`read_avro` does not yet support reading multiple files"
)

filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer, compression=None, **kwargs
)
if compression is not None:
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def read_csv(
"`read_csv` does not yet support reading multiple files"
)

filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer,
compression=compression,
iotypes=(BytesIO, StringIO, NativeFile),
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,11 @@ def read_json(
source = ioutils.stringify_pathlike(source)
source = fs.sep.join([source, "*.json"])

tmp_source, compression = ioutils.get_filepath_or_buffer(
tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=source,
compression=compression,
iotypes=(BytesIO, StringIO),
allow_raw_text_input=True,
**kwargs,
)
if isinstance(tmp_source, list):
Expand Down Expand Up @@ -73,10 +74,11 @@ def read_json(
"multiple files via pandas"
)

path_or_buf, compression = ioutils.get_filepath_or_buffer(
path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=path_or_buf,
compression=compression,
iotypes=(BytesIO, StringIO),
allow_raw_text_input=True,
**kwargs,
)

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def read_orc_statistics(
files_statistics = []
stripes_statistics = []
for source in filepaths_or_buffers:
filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=source, compression=None, **kwargs
)
if compression is not None:
Expand All @@ -182,7 +182,7 @@ def read_orc_statistics(
column_names,
raw_file_statistics,
raw_stripes_statistics,
) = liborc.read_raw_orc_statistics(filepath_or_buffer)
) = liborc.read_raw_orc_statistics(path_or_buf)

# Parse column names
column_names = [
Expand Down Expand Up @@ -323,7 +323,7 @@ def read_orc(
source = stringify_path(source)
source = fs.sep.join([source, "*.orc"])

tmp_source, compression = ioutils.get_filepath_or_buffer(
tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=source,
compression=None,
use_python_file_object=use_python_file_object,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def read_parquet(
fs=fs,
)
for i, source in enumerate(filepath_or_buffer):
tmp_source, compression = ioutils.get_filepath_or_buffer(
tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=source,
compression=None,
fs=fs,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def read_text(
):
"""{docstring}"""

filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer,
compression=None,
iotypes=(BytesIO, StringIO),
Expand Down
26 changes: 18 additions & 8 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,7 @@ def _open_remote_files(
]


def get_filepath_or_buffer(
def get_reader_filepath_or_buffer(
path_or_data,
compression,
mode="rb",
Expand All @@ -1328,6 +1328,7 @@ def get_filepath_or_buffer(
byte_ranges=None,
use_python_file_object=False,
open_file_options=None,
allow_raw_text_input=False,
**kwargs,
):
"""Return either a filepath string to data, or a memory buffer of data.
Expand All @@ -1352,6 +1353,11 @@ def get_filepath_or_buffer(
open_file_options : dict, optional
Optional dictionary of key-word arguments to pass to
`_open_remote_files` (used for remote storage only).
allow_raw_text_input : boolean, default False
If True, this indicates the input `path_or_data` could be a raw text
input and will not check for its existence in the filesystem. If False,
the input must be a path and an error will be raised if it does not
exist.
Returns
-------
Expand All @@ -1372,18 +1378,22 @@ def get_filepath_or_buffer(
if fs is None:
return path_or_data, compression

if len(paths) == 0:
raise FileNotFoundError(
f"{path_or_data} could not be resolved to any files"
)

if _is_local_filesystem(fs):
# Doing this as `read_json` accepts a json string
# path_or_data need not be a filepath like string
if os.path.exists(paths[0]):
path_or_data = paths if len(paths) > 1 else paths[0]
if len(paths):
if fs.exists(paths[0]):
path_or_data = paths if len(paths) > 1 else paths[0]
elif not allow_raw_text_input:
raise FileNotFoundError(
f"{path_or_data} could not be resolved to any files"
)

else:
if len(paths) == 0:
raise FileNotFoundError(
f"{path_or_data} could not be resolved to any files"
)
if use_python_file_object:
path_or_data = _open_remote_files(
paths,
Expand Down

0 comments on commit 5067cc7

Please sign in to comment.