Skip to content

Commit

Permalink
Consistent method names; docstrings for from_csv_arrow
Browse files Browse the repository at this point in the history
Co-authored-by: Maarten Breddels <maartenbreddels@gmail.com>
  • Loading branch information
JovanVeljanoski and Maarten Breddels committed Sep 20, 2022
1 parent 49ab2c8 commit d89c6a2
Showing 1 changed file with 24 additions and 16 deletions.
40 changes: 24 additions & 16 deletions packages/vaex-core/vaex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def open(path, convert=False, progress=None, shuffle=False, fs_options={}, fs=No
# # naked_path, _ = vaex.file.split_options(path, fs_options)
_, ext, _ = vaex.file.split_ext(path)
if ext == '.csv': # special case for csv
return vaex.read_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
return vaex.from_csv(path, fs_options=fs_options, fs=fs, convert=convert, progress=progress, **kwargs)
if convert:
path_output = convert if isinstance(convert, str) else filename_hdf5
vaex.convert.convert(
Expand Down Expand Up @@ -410,7 +410,7 @@ def from_pandas(df, name="pandas", copy_index=False, index_name="index"):
:param: name: unique for the DataFrame
>>> import vaex, pandas as pd
>>> df_pandas = pd.read_csv('test.csv')
>>> df_pandas = pd.read_csv('test.csv')
>>> df = vaex.from_pandas(df_pandas)
:rtype: DataFrame
Expand Down Expand Up @@ -531,23 +531,31 @@ def from_records(records : List[Dict], array_type="arrow", defaults={}) -> vaex.
return vaex.from_dict(arrays)


def read_csv_arrow_lazy(file, chunk_size="10MiB", newline_readahead="64kiB", read_options=None, parse_options=None, convert_options=None):
'''Experimental Lazy CSV reader using Apache Arrow'''
import vaex.csv
ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, newline_readahead=newline_readahead, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
return vaex.from_dataset(ds)
def from_csv_arrow(file, read_options=None, parse_options=None, convert_options=None, lazy=False, chunk_size="10MiB", newline_readahead="64kiB"):
""" Fast CSV reader using Apache Arrow. Support for lazy reading of CSV files (experimental).
def read_csv_arrow(file, read_options=None, parse_options=None, convert_options=None):
'''CSV reader using Apache Arrow'''
:param file: file path or file-like object
:param read_options: PyArrow CSV read options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html
:param parse_options: PyArrow CSV parse options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
:param convert_options: PyArrow CSV convert options, see https://arrow.apache.org/docs/python/generated/pyarrow.csv.ConvertOptions.html
:param lazy: If True, the CSV file is lazily read, and the DataFrame is not stored in memory.
:param chunk_size: The CSV is read in chunks of the specified size. Relevant only if lazy=True.
:param newline_readahead: The size of the readahead buffer for newline detection. Relevant only if lazy=True.
:return: DataFrame
"""
import vaex.csv
ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
return vaex.from_dataset(ds)
if lazy is True:
ds = vaex.csv.DatasetCsvLazy(file, chunk_size=chunk_size, read_options=read_options, parse_options=parse_options, convert_options=convert_options, newline_readahead=newline_readahead)
return vaex.from_dataset(ds)
else:
ds = vaex.csv.DatasetCsv(file, read_options=read_options, parse_options=parse_options, convert_options=convert_options)
return vaex.from_dataset(ds)


@docsubst
def read_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
def from_csv(filename_or_buffer, copy_index=False, chunk_size=None, convert=False, fs_options={}, progress=None, fs=None, **kwargs):
"""
Read a CSV file as a DataFrame, and optionally convert to an hdf5 file.
Load a CSV file as a DataFrame, and optionally convert to an HDF5 file.
:param str or file filename_or_buffer: CSV file path or file-like
:param bool copy_index: copy index when source is read via Pandas
Expand Down Expand Up @@ -603,9 +611,9 @@ def iterator():
return iterator()


def from_csv(filepath_or_buffer, **kwargs):
'''Alias to read_csv.'''
return read_csv(filepath_or_buffer, **kwargs)
def read_csv(filepath_or_buffer, **kwargs):
'''Alias to from_csv.'''
return from_csv(filepath_or_buffer, **kwargs)

aliases = vaex.settings.aliases

Expand Down

0 comments on commit d89c6a2

Please sign in to comment.