From 5067cc7da324fe7e08454c25bc254a55cc71de75 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Mon, 23 May 2022 13:27:55 -0500
Subject: [PATCH] Fix an issue with reading raw string in `cudf.read_json` (#10924)

Fixes the issue described here:
https://github.com/rapidsai/cudf/pull/10275#issuecomment-1133750320

This PR removes a false error stating that the path could not be resolved.
That error is incorrect for the JSON reader, where the input can be a raw
JSON string rather than a path. To resolve this, a new `allow_raw_text_input`
flag indicates that the calling IO reader (like `read_json`) accepts raw text
input, so the path need not be a valid one. For a genuinely invalid path,
either fsspec or libcudf still raises a file-not-found error.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10924
---
 python/cudf/cudf/io/avro.py       |  2 +-
 python/cudf/cudf/io/csv.py        |  2 +-
 python/cudf/cudf/io/json.py       |  6 ++++--
 python/cudf/cudf/io/orc.py        |  6 +++---
 python/cudf/cudf/io/parquet.py    |  2 +-
 python/cudf/cudf/io/text.py       |  2 +-
 python/cudf/cudf/utils/ioutils.py | 26 ++++++++++++++++++--------
 7 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index e4824c2ccbe..66c5c1c5a56 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -24,7 +24,7 @@ def read_avro(
             "`read_avro` does not yet support reading multiple files"
         )

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer, compression=None, **kwargs
     )
     if compression is not None:
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index a81563884d9..2288f896a9d 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -60,7 +60,7 @@ def read_csv(
             "`read_csv` does not yet support reading multiple files"
         )

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=compression,
         iotypes=(BytesIO, StringIO, NativeFile),
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 142b9c26f96..869e055decf 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -42,10 +42,11 @@ def read_json(
                 source = ioutils.stringify_pathlike(source)
                 source = fs.sep.join([source, "*.json"])

-            tmp_source, compression = ioutils.get_filepath_or_buffer(
+            tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
                 path_or_data=source,
                 compression=compression,
                 iotypes=(BytesIO, StringIO),
+                allow_raw_text_input=True,
                 **kwargs,
             )
             if isinstance(tmp_source, list):
@@ -73,10 +74,11 @@ def read_json(
                 "multiple files via pandas"
             )

-        path_or_buf, compression = ioutils.get_filepath_or_buffer(
+        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=path_or_buf,
             compression=compression,
             iotypes=(BytesIO, StringIO),
+            allow_raw_text_input=True,
             **kwargs,
         )

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 6a2ffef52db..cd72a60b182 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -171,7 +171,7 @@ def read_orc_statistics(
     files_statistics = []
     stripes_statistics = []
     for source in filepaths_or_buffers:
-        filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source, compression=None, **kwargs
         )
         if compression is not None:
@@ -182,7 +182,7 @@ def read_orc_statistics(
             column_names,
             raw_file_statistics,
             raw_stripes_statistics,
-        ) = liborc.read_raw_orc_statistics(filepath_or_buffer)
+        ) = liborc.read_raw_orc_statistics(path_or_buf)

         # Parse column names
         column_names = [
@@ -323,7 +323,7 @@ def read_orc(
             source = stringify_path(source)
             source = fs.sep.join([source, "*.orc"])

-        tmp_source, compression = ioutils.get_filepath_or_buffer(
+        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
             use_python_file_object=use_python_file_object,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 94e9b7a6292..51c2ac8b828 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -435,7 +435,7 @@ def read_parquet(
             fs=fs,
         )
     for i, source in enumerate(filepath_or_buffer):
-        tmp_source, compression = ioutils.get_filepath_or_buffer(
+        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
             fs=fs,
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 86f99b319f0..12aa0f6ef8b 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -18,7 +18,7 @@ def read_text(
 ):
     """{docstring}"""

-    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
+    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=None,
         iotypes=(BytesIO, StringIO),
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6ef44d9b1d6..6d6bdabf70d 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1319,7 +1319,7 @@ def _open_remote_files(
     ]


-def get_filepath_or_buffer(
+def get_reader_filepath_or_buffer(
     path_or_data,
     compression,
     mode="rb",
@@ -1328,6 +1328,7 @@ def get_filepath_or_buffer(
     byte_ranges=None,
     use_python_file_object=False,
     open_file_options=None,
+    allow_raw_text_input=False,
     **kwargs,
 ):
     """Return either a filepath string to data, or a memory buffer of data.
@@ -1352,6 +1353,11 @@ def get_filepath_or_buffer(
     open_file_options : dict, optional
         Optional dictionary of key-word arguments to pass to
         `_open_remote_files` (used for remote storage only).
+    allow_raw_text_input : boolean, default False
+        If True, this indicates the input `path_or_data` could be a raw text
+        input and will not check for its existence in the filesystem. If False,
+        the input must be a path and an error will be raised if it does not
+        exist.

     Returns
     -------
@@ -1372,18 +1378,22 @@ def get_filepath_or_buffer(
     if fs is None:
         return path_or_data, compression

-    if len(paths) == 0:
-        raise FileNotFoundError(
-            f"{path_or_data} could not be resolved to any files"
-        )
-
     if _is_local_filesystem(fs):
         # Doing this as `read_json` accepts a json string
         # path_or_data need not be a filepath like string
-        if os.path.exists(paths[0]):
-            path_or_data = paths if len(paths) > 1 else paths[0]
+        if len(paths):
+            if fs.exists(paths[0]):
+                path_or_data = paths if len(paths) > 1 else paths[0]
+            elif not allow_raw_text_input:
+                raise FileNotFoundError(
+                    f"{path_or_data} could not be resolved to any files"
+                )

     else:
+        if len(paths) == 0:
+            raise FileNotFoundError(
+                f"{path_or_data} could not be resolved to any files"
+            )
         if use_python_file_object:
             path_or_data = _open_remote_files(
                 paths,
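
For reference, a minimal usage sketch (not part of the patch) of the behavior this change restores: passing a raw JSON string to `cudf.read_json` should parse the text instead of raising FileNotFoundError. This assumes a local cudf installation; which engine cudf selects for string input may vary by version, and "missing.json" below is a hypothetical path.

import cudf

# A raw JSON string, not a filesystem path. Before this fix, the shared IO
# utility raised "FileNotFoundError: ... could not be resolved to any files"
# because the string does not exist as a path on disk.
json_str = '{"a": [1, 2, 3], "b": [10.0, 11.0, 12.0]}'

df = cudf.read_json(json_str)
print(df)

# Genuinely invalid paths still fail as before: fsspec or libcudf raises a
# file-not-found error, e.g. cudf.read_json("missing.json") on a file that
# does not exist.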