Consistent method names; docstrings for from_csv_arrow
Co-authored-by: Maarten Breddels <maartenbreddels@gmail.com>
JovanVeljanoski and Maarten Breddels committed Sep 20, 2022
1 parent 49ab2c8 commit c9cb709
Showing 3 changed files with 28 additions and 23 deletions.
40 changes: 24 additions & 16 deletions packages/vaex-core/vaex/__init__.py
@@ -219,7 +219,7 @@ def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=No
     # # naked_path, _ = vaex.file.split_options(path, fs_options)
     _, ext, _ = vaex.file.split_ext(path)
     if ext == '.csv': # special case for csv
-        return vaex.read_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
+        return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
     if convert:
         path_output = convert if isinstance(convert, str) else filename_hdf5
         vaex.convert.convert(
@@ -410,7 +410,7 @@ def from_pandas(df, name="pandas", copy_index=False, index_name="index"):
     :param: name: unique for the DataFrame

     >>> import vaex, pandas as pd
-    >>> df_pandas = pd.read_csv('test.csv')
+    >>> df_pandas = pd.from_csv('test.csv')
     >>> df = vaex.from_pandas(df_pandas)

     :rtype: DataFrame
@@ -531,23 +531,31 @@ def from_records(records : List[Dict], array_type="arrow", defaults={}) -> vaex.
     return vaex.from_dict(arrays)


-def read_csv_arrow_lazy(file, chunk_size="10MiB", newline_readahead="64kiB", read_options=None, parse_options=None, convert_options=None):
-    '''Experimental Lazy CSV reader using Apache Arrow'''
-    import vaex.csv
-    ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, newline_readahead=newline_readahead, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+def from_csv_arrow(file, read_options=None, parse_options=None, convert_options=None, lazy=False, chunk_size="10MiB", newline_readahead="64kiB"):
+    """Fast CSV reader using Apache Arrow. Support for lazy reading of CSV files (experimental).
-def read_csv_arrow(file, read_options=None, parse_options=None, convert_options=None):
-    '''CSV reader using Apache Arrow'''
+
+    :param file: file path or file-like object
+    :param read_options: PyArrow CSV read options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
+    :param parse_options: PyArrow CSV parse options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+    :param convert_options: PyArrow CSV convert options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html
+    :param lazy: If True, the CSV file is lazily read, and the DataFrame is not stored in memory.
+    :param chunk_size: The CSV is read in chunks of the specified size. Relevant only if lazy=True.
+    :param newline_readahead: The size of the readahead buffer for newline detection. Relevant only if lazy=True.
+    :return: DataFrame
+    """
     import vaex.csv
-    ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
-    return vaex.from_dataset(ds)
+    if lazy is True:
+        ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, read_options=read_options, parse_options=parse_options, convert_options=convert_options, newline_readahead=newline_readahead)
+        return vaex.from_dataset(ds)
+    else:
+        ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
+        return vaex.from_dataset(ds)


 @docsubst
-def read_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
+def from_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
     """
-    Read a CSV file as a DataFrame, and optionally convert to an hdf5 file.
+    Load a CSV file as a DataFrame, and optionally convert to an HDF5 file.

     :param str or file filename_or_buffer: CSV file path or file-like
     :param bool copy_index: copy index when source is read via Pandas
@@ -603,9 +611,9 @@ def iterator():
         return iterator()


-def from_csv(filepath_or_buffer, **kwargs):
-    '''Alias to read_csv.'''
-    return read_csv(filepath_or_buffer, **kwargs)
+def read_csv(filepath_or_buffer, **kwargs):
+    '''Alias to from_csv.'''
+    return from_csv(filepath_or_buffer, **kwargs)

 aliases = vaex.settings.aliases

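For reference, a minimal usage sketch of the renamed API introduced above (not part of the commit; 'example.csv' is a hypothetical file name):

import vaex

# Eager read (default): the whole CSV is parsed into memory via Apache Arrow.
df = vaex.from_csv_arrow('example.csv')

# Lazy read (experimental): chunks are parsed on demand, so the full file
# is not held in memory.
df_lazy = vaex.from_csv_arrow('example.csv', lazy=True, chunk_size='10MiB')

# The pandas-backed reader is now from_csv; read_csv stays as an alias.
df_pandas_backed = vaex.from_csv('example.csv')
df_alias = vaex.read_csv('example.csv')  # equivalent to vaex.from_csv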
9 changes: 3 additions & 6 deletions tests/arrow/dataset_test.py
@@ -24,10 +24,7 @@ def test_csv(tmpdir, df_filtered, rebuild_dataframe, lazy):
     df_filtered.drop('datetime', inplace=True)
     df_filtered.drop('timedelta', inplace=True)
     df_filtered.export_csv(path)
-    if lazy:
-        df = vaex.read_csv_arrow_lazy(path)
-    else:
-        df = vaex.read_csv_arrow(path)
+    df = vaex.from_csv_arrow(path, lazy=lazy)

     df2 = rebuild_dataframe(df)
     assert df2.dataset == df.dataset
@@ -108,8 +105,8 @@ def test_parquet(l1, l2, rebuild_dataset):
     for i1, i2, chunks in ds.chunk_iterator(['x']):
         values.extend(chunks['x'].to_pylist())
     assert x[i:j].tolist() == values



     assert df.x.tolist() == x.tolist()
     assert df.g.tolist() == g.tolist()
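The test_csv change above routes both code paths through the single from_csv_arrow entry point. A minimal sketch of the equivalence it exercises (hypothetical path; assumes the file is small enough that both reads can be fully compared):

import vaex

path = 'example.csv'  # hypothetical test file
df_eager = vaex.from_csv_arrow(path)
df_lazy = vaex.from_csv_arrow(path, lazy=True)

# Eager and lazy reads should yield identical values, column by column.
for name in df_eager.get_column_names():
    assert df_eager[name].tolist() == df_lazy[name].tolist()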
2 changes: 1 addition & 1 deletion tests/csv_test.py
@@ -57,7 +57,7 @@ def test_chunk_iterator(l1, l2):
     path.parent.mkdir(exist_ok=True)
     df_original.export(str(path))

-    df = vaex.read_csv_arrow_lazy(str(path), chunk_size=20, newline_readahead=10)
+    df = vaex.from_csv_arrow(str(path), chunk_size=20, newline_readahead=10, lazy=True)
     ds_full = ds = df.dataset

     # very similar to the dataset_test::test_concat_chunk_iterator and arrow/datase_test.py parquet test
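The tiny chunk_size and newline_readahead values in this test force many chunk boundaries, stressing the lazy reader. A sketch of streaming those chunks, following the chunk_iterator pattern from dataset_test.py above (hypothetical file; column name 'x' is assumed):

import vaex

df = vaex.from_csv_arrow('example.csv', chunk_size=20, newline_readahead=10, lazy=True)
ds = df.dataset

# Consume column 'x' chunk by chunk as the lazy dataset parses the file.
values = []
for i1, i2, chunks in ds.chunk_iterator(['x']):
    values.extend(chunks['x'].to_pylist())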
