Consistent method names; docstrings for from_csv_arrow
Co-authored-by: Maarten Breddels <maartenbreddels@gmail.com>
JovanVeljanoski and Maarten Breddels committed Sep 20, 2022
1 parent 49ab2c8 commit c9cb709
Showing 3 changed files with 28 additions and 23 deletions.
40 changes: 24 additions & 16 deletions packages/vaex-core/vaex/__init__.py
@@ -219,7 +219,7 @@ def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=No
     # # naked_path, _ = vaex.file.split_options(path, fs_options)
     _, ext, _ = vaex.file.split_ext(path)
     if ext == '.csv': # special case for csv
-        return vaex.read_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
+        return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
     if convert:
         path_output = convert if isinstance(convert, str) else filename_hdf5
         vaex.convert.convert(
@@ -410,7 +410,7 @@ def from_pandas(df, name="pandas", copy_index=False, index_name="index"):
     :param: name: unique for the DataFrame

     >>> import vaex, pandas as pd
-    >>> df_pandas = pd.read_csv('test.csv')
+    >>> df_pandas = pd.from_csv('test.csv')
     >>> df = vaex.from_pandas(df_pandas)

     :rtype: DataFrame
@@ -531,23 +531,31 @@ def from_records(records : List[Dict], array_type="arrow", defaults={}) -> vaex.
     return vaex.from_dict(arrays)


-def read_csv_arrow_lazy(file, chunk_size="10MiB", newline_readahead="64kiB", read_options=None, parse_options=None, convert_options=None):
-    '''Experimental Lazy CSV reader using Apache Arrow'''
-    import vaex.csv
-    ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, newline_readahead=newline_readahead, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+def from_csv_arrow(file, read_options=None, parse_options=None, convert_options=None, lazy=False, chunk_size="10MiB", newline_readahead="64kiB"):
+    """Fast CSV reader using Apache Arrow. Support for lazy reading of CSV files (experimental).
-def read_csv_arrow(file, read_options=None, parse_options=None, convert_options=None):
-    '''CSV reader using Apache Arrow'''
+
+    :param file: file path or file-like object
+    :param read_options: PyArrow CSV read options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+    :param parse_options: PyArrow CSV parse options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+    :param convert_options: PyArrow CSV convert options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html
+    :param lazy: If True, the CSV file is lazily read, and the DataFrame is not stored in memory.
+    :param chunk_size: The CSV is read in chunks of the specified size. Relevant only if lazy=True.
+    :param newline_readahead: The size of the readahead buffer for newline detection. Relevant only if lazy=True.
+    :return: DataFrame
+    """
     import vaex.csv
-    ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+    if lazy is True:
+        ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, read_options=read_options, parse_options=parse_options, convert_options=convert_options, newline_readahead=newline_readahead)
+        return vaex.from_dataset(ds)
+    else:
+        ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+        return vaex.from_dataset(ds)


 @docsubst
-def read_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
+def from_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
     """
-    Read a CSV file as a DataFrame, and optionally convert to an hdf5 file.
+    Load a CSV file as a DataFrame, and optionally convert to an HDF5 file.

     :param str or file filename_or_buffer: CSV file path or file-like
     :param bool copy_index: copy index when source is read via Pandas
@@ -603,9 +611,9 @@ def iterator():
         return iterator()


-def from_csv(filepath_or_buffer, **kwargs):
-    '''Alias to read_csv.'''
-    return read_csv(filepath_or_buffer, **kwargs)
+def read_csv(filepath_or_buffer, **kwargs):
+    '''Alias to from_csv.'''
+    return from_csv(filepath_or_buffer, **kwargs)

 aliases = vaex.settings.aliases

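For reference, a minimal usage sketch of the renamed API introduced above (not part of the commit; 'example.csv' is a hypothetical file name):

import vaex

# Eager read (default): the whole CSV is parsed into memory via Apache Arrow.
df = vaex.from_csv_arrow('example.csv')

# Lazy read (experimental): chunks are parsed on demand, so the full file
# is not held in memory.
df_lazy = vaex.from_csv_arrow('example.csv', lazy=True, chunk_size='10MiB')

# The pandas-backed reader is now from_csv; read_csv stays as an alias.
df_pandas_backed = vaex.from_csv('example.csv')
df_alias = vaex.read_csv('example.csv')  # equivalent to vaex.from_csv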
9 changes: 3 additions & 6 deletions tests/arrow/dataset_test.py
@@ -24,10 +24,7 @@ def test_csv(tmpdir, df_filtered, rebuild_dataframe, lazy):
     df_filtered.drop('datetime', inplace=True)
     df_filtered.drop('timedelta', inplace=True)
     df_filtered.export_csv(path)
-    if lazy:
-        df = vaex.read_csv_arrow_lazy(path)
-    else:
-        df = vaex.read_csv_arrow(path)
+    df = vaex.from_csv_arrow(path, lazy=lazy)

     df2 = rebuild_dataframe(df)
     assert df2.dataset == df.dataset
@@ -108,8 +105,8 @@ def test_parquet(l1, l2, rebuild_dataset):
     for i1, i2, chunks in ds.chunk_iterator(['x']):
         values.extend(chunks['x'].to_pylist())
     assert x[i:j].tolist() == values



     assert df.x.tolist() == x.tolist()
     assert df.g.tolist() == g.tolist()
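The test_csv change above routes both code paths through the single from_csv_arrow entry point. A minimal sketch of the equivalence it exercises (hypothetical path; assumes the file is small enough that both reads can be fully compared):

import vaex

path = 'example.csv'  # hypothetical test file
df_eager = vaex.from_csv_arrow(path)
df_lazy = vaex.from_csv_arrow(path, lazy=True)

# Eager and lazy reads should yield identical values, column by column.
for name in df_eager.get_column_names():
    assert df_eager[name].tolist() == df_lazy[name].tolist()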
2 changes: 1 addition & 1 deletion tests/csv_test.py
@@ -57,7 +57,7 @@ def test_chunk_iterator(l1, l2):
     path.parent.mkdir(exist_ok=True)
     df_original.export(str(path))

-    df = vaex.read_csv_arrow_lazy(str(path), chunk_size=20, newline_readahead=10)
+    df = vaex.from_csv_arrow(str(path), chunk_size=20, newline_readahead=10, lazy=True)
     ds_full = ds = df.dataset

     # very similar to the dataset_test::test_concat_chunk_iterator and arrow/datase_test.py parquet test
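The tiny chunk_size and newline_readahead values in this test force many chunk boundaries, stressing the lazy reader. A sketch of streaming those chunks, following the chunk_iterator pattern from dataset_test.py above (hypothetical file; column name 'x' is assumed):

import vaex

df = vaex.from_csv_arrow('example.csv', chunk_size=20, newline_readahead=10, lazy=True)
ds = df.dataset

# Consume column 'x' chunk by chunk as the lazy dataset parses the file.
values = []
for i1, i2, chunks in ds.chunk_iterator(['x']):
    values.extend(chunks['x'].to_pylist())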
