From 5067cc7da324fe7e08454c25bc254a55cc71de75 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 23 May 2022 13:27:55 -0500
Subject: [PATCH] Fix an issue with reading raw string in `cudf.read_json` (#10924)

Fixes the issue described here:
https://github.com/rapidsai/cudf/pull/10275#issuecomment-1133750320

This PR removes a false error stating that the path could not be resolved.
That error is incorrect for the JSON reader, where the input can be a raw
JSON string rather than a path. To resolve this, a new `allow_raw_text_input`
flag indicates that the calling IO reader (like `read_json`) accepts raw text
input, so the path need not be a valid one. For a genuinely invalid path,
either fsspec or libcudf still raises a file-not-found error.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10924
---
 python/cudf/cudf/io/avro.py       |  2 +-
 python/cudf/cudf/io/csv.py        |  2 +-
 python/cudf/cudf/io/json.py       |  6 ++++--
 python/cudf/cudf/io/orc.py        |  6 +++---
 python/cudf/cudf/io/parquet.py    |  2 +-
 python/cudf/cudf/io/text.py       |  2 +-
 python/cudf/cudf/utils/ioutils.py | 26 ++++++++++++++++++--------
 7 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index e4824c2ccbe..66c5c1c5a56 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -24,7 +24,7 @@ def read_avro(
             "`read_avro` does not yet support reading multiple files"
         )

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer, compression=None, **kwargs
     )
     if compression is not None:
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index a81563884d9..2288f896a9d 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -60,7 +60,7 @@ def read_csv(
             "`read_csv` does not yet support reading multiple files"
         )

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=compression,
         iotypes=(BytesIO, StringIO, NativeFile),
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 142b9c26f96..869e055decf 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -42,10 +42,11 @@ def read_json(
                 source = ioutils.stringify_pathlike(source)
                 source = fs.sep.join([source, "*.json"])

-            tmp_source, compression = ioutils.get_filepath_or_buffer(
+            tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
                 path_or_data=source,
                 compression=compression,
                 iotypes=(BytesIO, StringIO),
+                allow_raw_text_input=True,
                 **kwargs,
             )
             if isinstance(tmp_source, list):
@@ -73,10 +74,11 @@ def read_json(
                 "multiple files via pandas"
             )

-        path_or_buf, compression = ioutils.get_filepath_or_buffer(
+        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=path_or_buf,
             compression=compression,
             iotypes=(BytesIO, StringIO),
+            allow_raw_text_input=True,
             **kwargs,
         )

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 6a2ffef52db..cd72a60b182 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -171,7 +171,7 @@ def read_orc_statistics(
     files_statistics = []
     stripes_statistics = []
     for source in filepaths_or_buffers:
-        filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source, compression=None, **kwargs
         )
         if compression is not None:
@@ -182,7 +182,7 @@ def read_orc_statistics(
             column_names,
             raw_file_statistics,
             raw_stripes_statistics,
-        ) = liborc.read_raw_orc_statistics(filepath_or_buffer)
+        ) = liborc.read_raw_orc_statistics(path_or_buf)

         # Parse column names
         column_names = [
@@ -323,7 +323,7 @@ def read_orc(
             source = stringify_path(source)
             source = fs.sep.join([source, "*.orc"])

-        tmp_source, compression = ioutils.get_filepath_or_buffer(
+        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
             use_python_file_object=use_python_file_object,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 94e9b7a6292..51c2ac8b828 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -435,7 +435,7 @@ def read_parquet(
             fs=fs,
         )
     for i, source in enumerate(filepath_or_buffer):
-        tmp_source, compression = ioutils.get_filepath_or_buffer(
+        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
             fs=fs,
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 86f99b319f0..12aa0f6ef8b 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -18,7 +18,7 @@ def read_text(
 ):
     """{docstring}"""

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=None,
         iotypes=(BytesIO, StringIO),
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6ef44d9b1d6..6d6bdabf70d 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1319,7 +1319,7 @@ def _open_remote_files(
     ]


-def get_filepath_or_buffer(
+def get_reader_filepath_or_buffer(
     path_or_data,
     compression,
     mode="rb",
@@ -1328,6 +1328,7 @@ def get_filepath_or_buffer(
     byte_ranges=None,
     use_python_file_object=False,
     open_file_options=None,
+    allow_raw_text_input=False,
     **kwargs,
 ):
     """Return either a filepath string to data, or a memory buffer of data.
@@ -1352,6 +1353,11 @@ def get_filepath_or_buffer(
     open_file_options : dict, optional
         Optional dictionary of key-word arguments to pass to
         `_open_remote_files` (used for remote storage only).
+    allow_raw_text_input : boolean, default False
+        If True, this indicates the input `path_or_data` could be a raw text
+        input and will not check for its existence in the filesystem. If False,
+        the input must be a path and an error will be raised if it does not
+        exist.

     Returns
     -------
@@ -1372,18 +1378,22 @@ def get_filepath_or_buffer(
     if fs is None:
         return path_or_data, compression

-    if len(paths) == 0:
-        raise FileNotFoundError(
-            f"{path_or_data} could not be resolved to any files"
-        )
-
     if _is_local_filesystem(fs):
         # Doing this as `read_json` accepts a json string
         # path_or_data need not be a filepath like string
-        if os.path.exists(paths[0]):
-            path_or_data = paths if len(paths) > 1 else paths[0]
+        if len(paths):
+            if fs.exists(paths[0]):
+                path_or_data = paths if len(paths) > 1 else paths[0]
+            elif not allow_raw_text_input:
+                raise FileNotFoundError(
+                    f"{path_or_data} could not be resolved to any files"
+                )

     else:
+        if len(paths) == 0:
+            raise FileNotFoundError(
+                f"{path_or_data} could not be resolved to any files"
+            )
         if use_python_file_object:
             path_or_data = _open_remote_files(
                 paths,
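
For reference, a minimal usage sketch (not part of the patch) of the behavior this change restores: passing a raw JSON string to `cudf.read_json` should parse the text instead of raising FileNotFoundError. This assumes a local cudf installation; which engine cudf selects for string input may vary by version, and "missing.json" below is a hypothetical path.

import cudf

# A raw JSON string, not a filesystem path. Before this fix, the shared IO
# utility raised "FileNotFoundError: ... could not be resolved to any files"
# because the string does not exist as a path on disk.
json_str = '{"a": [1, 2, 3], "b": [10.0, 11.0, 12.0]}'

df = cudf.read_json(json_str)
print(df)

# Genuinely invalid paths still fail as before: fsspec or libcudf raises a
# file-not-found error, e.g. cudf.read_json("missing.json") on a file that
# does not exist.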