Consistent method names; docstrings for from_csv_arrow

Co-authored-by: Maarten Breddels <maartenbreddels@gmail.com>
vaexio · Sep 20, 2022 · d89c6a2 · d89c6a2
1 parent 49ab2c8
commit d89c6a2
Showing 1 changed file with 24 additions and 16 deletions.
diff --git a/packages/vaex-core/vaex/__init__.py b/packages/vaex-core/vaex/__init__.py
@@ -219,7 +219,7 @@ def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=No
             # # naked_path, _ = vaex.file.split_options(path, fs_options)
             _, ext, _ = vaex.file.split_ext(path)
             if ext == '.csv':  # special case for csv
-                return vaex.read_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
+                return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
             if convert:
                 path_output = convert if isinstance(convert, str) else filename_hdf5
                 vaex.convert.convert(
@@ -410,7 +410,7 @@ def from_pandas(df, name="pandas", copy_index=False, index_name="index"):
     :param: name: unique for the DataFrame
 
     >>> import vaex, pandas as pd
-    >>> df_pandas = pd.read_csv('test.csv')
+    >>> df_pandas = pd.read_csv('test.csv')
     >>> df = vaex.from_pandas(df_pandas)
 
     :rtype: DataFrame
@@ -531,23 +531,31 @@ def from_records(records : List[Dict], array_type="arrow", defaults={}) -> vaex.
     return vaex.from_dict(arrays)
 
 
-def read_csv_arrow_lazy(file, chunk_size="10MiB", newline_readahead="64kiB", read_options=None, parse_options=None, convert_options=None):
-    '''Experimental Lazy CSV reader using Apache Arrow'''
-    import vaex.csv
-    ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, newline_readahead=newline_readahead, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+def from_csv_arrow(file, read_options=None, parse_options=None, convert_options=None, lazy=False, chunk_size="10MiB", newline_readahead="64kiB"):
+    """ Fast CSV reader using Apache Arrow. Support for lazy reading of CSV files (experimental).
 
-def read_csv_arrow(file, read_options=None, parse_options=None, convert_options=None):
-    '''CSV reader using Apache Arrow'''
+    :param file: file path or file-like object
+    :param read_options: PyArrow CSV read options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+    :param parse_options: PyArrow CSV parse options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+    :param convert_options: PyArrow CSV convert options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html
+    :param lazy: If True, the CSV file is lazily read, and the DataFrame is not stored in memory.
+    :param chunk_size: The CSV is read in chunks of the specified size. Relevant only if lazy=True.
+    :param newline_readahead: The size of the readahead buffer for newline detection. Relevant only if lazy=True.
+    :return: DataFrame
+    """
     import vaex.csv
-    ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+    if lazy is True:
+        ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, read_options=read_options, parse_options=parse_options, convert_options=convert_options, newline_readahead=newline_readahead)
+        return vaex.from_dataset(ds)
+    else:
+        ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+        return vaex.from_dataset(ds)
 
 
 @docsubst
-def read_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
+def from_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
     """
-    Read a CSV file as a DataFrame, and optionally convert to an hdf5 file.
+    Load a CSV file as a DataFrame, and optionally convert to an HDF5 file.
 
     :param str or file filename_or_buffer: CSV file path or file-like
     :param bool copy_index: copy index when source is read via Pandas
@@ -603,9 +611,9 @@ def iterator():
         return iterator()
 
 
-def from_csv(filepath_or_buffer, **kwargs):
-    '''Alias to read_csv.'''
-    return read_csv(filepath_or_buffer, **kwargs)
+def read_csv(filepath_or_buffer, **kwargs):
+    '''Alias to from_csv.'''
+    return from_csv(filepath_or_buffer, **kwargs)
 
 aliases = vaex.settings.aliases