Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More CSV updates #328

Merged
merged 5 commits into from
Jun 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# TileDB-Py 0.6.3 Release Notes

* Fix unnecessary implicit ordering requirement for multi-attribute assignment. [#328](https://github.com/TileDB-Inc/TileDB-Py/pull/328)

# TileDB-Py 0.6.2 Release Notes

## Bug fixes
Expand Down
45 changes: 37 additions & 8 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import io
from collections import OrderedDict
import warnings

if sys.version_info >= (3,3):
unicode_type = str
Expand All @@ -22,7 +23,9 @@
'tile_order': 'row-major',
'allows_duplicates': False,
'sparse': False,
'mode': 'ingest'
'mode': 'ingest',
'attrs_filters': None,
'coords_filters': None
}

def parse_tiledb_kwargs(kwargs):
Expand All @@ -38,6 +41,10 @@ def parse_tiledb_kwargs(kwargs):
args['allows_duplicates'] = kwargs.pop('allows_duplicates')
if 'mode' in kwargs:
args['mode'] = kwargs.pop('mode')
if 'attrs_filters' in kwargs:
args['attrs_filters'] = kwargs.pop('attrs_filters')
if 'coords_filters' in kwargs:
args['coords_filters'] = kwargs.pop('coords_filters')

return args

Expand Down Expand Up @@ -96,7 +103,7 @@ def dtype_from_column(col):
)

# TODO make this a staticmethod on Attr?
def attrs_from_df(df, index_dims=None, ctx=None):
def attrs_from_df(df, index_dims=None, filters=None, ctx=None):
attr_reprs = dict()

if ctx is None:
Expand All @@ -108,7 +115,7 @@ def attrs_from_df(df, index_dims=None, ctx=None):
if index_dims and name in index_dims:
continue
attr_info = dtype_from_column(col)
attrs.append(tiledb.Attr(name=name, dtype=attr_info.dtype))
attrs.append(tiledb.Attr(name=name, dtype=attr_info.dtype, filters=filters))

if attr_info.repr is not None:
attr_reprs[name] = attr_info.repr
Expand Down Expand Up @@ -201,16 +208,22 @@ def create_dims(ctx, dataframe, index_dims):

return dims


def from_dataframe(uri, dataframe, **kwargs):
    """Deprecated alias for ``from_pandas``.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged
    to ``from_pandas``.

    :param uri: URI for new TileDB array
    :param dataframe: pandas DataFrame
    :param kwargs: optional keyword arguments, passed through as-is
    :return: the result of ``from_pandas``
    """
    # deprecated in 0.6.3
    # stacklevel=2 attributes the warning to the caller's line instead of
    # this wrapper, so users can locate the call site they need to migrate.
    warnings.warn("tiledb.from_dataframe is deprecated; please use .from_pandas",
                  DeprecationWarning, stacklevel=2)

    return from_pandas(uri, dataframe, **kwargs)

def from_pandas(uri, dataframe, **kwargs):
"""Create TileDB array at given URI from pandas dataframe

:param uri: URI for new TileDB array
:param dataframe: pandas DataFrame
:param mode: Creation mode, one of 'ingest' (default), 'schema_only'
:param kwargs: optional keyword arguments for Pandas and TileDB.
TileDB context and configuration arguments
may be passed in a dictionary as `tiledb_args={...}`
TileDB arguments: tile_order, cell_order, allows_duplicates, sparse,
mode, attrs_filters, coords_filters
:return:
"""
args = parse_tiledb_kwargs(kwargs)
Expand All @@ -222,6 +235,8 @@ def from_dataframe(uri, dataframe, **kwargs):
sparse = args['sparse']
index_dims = args.get('index_dims', None)
mode = args.get('mode', 'ingest')
attrs_filters = args.get('attrs_filters', None)
coords_filters = args.get('coords_filters', None)

write = True
if mode is not None and mode == 'schema_only':
Expand All @@ -230,6 +245,14 @@ def from_dataframe(uri, dataframe, **kwargs):
if ctx is None:
ctx = tiledb.default_ctx()

if attrs_filters is None:
attrs_filters = tiledb.FilterList(
[tiledb.ZstdFilter(1, ctx=ctx)])

if coords_filters is None:
coords_filters = tiledb.FilterList(
[tiledb.ZstdFilter(1, ctx=ctx)])

nrows = len(dataframe)
tiling = np.min((nrows % 200, nrows))

Expand All @@ -247,14 +270,17 @@ def from_dataframe(uri, dataframe, **kwargs):
*dims,
ctx = ctx
)
attrs, attr_metadata = attrs_from_df(dataframe, index_dims=index_dims)

attrs, attr_metadata = attrs_from_df(dataframe, index_dims=index_dims,
filters=attrs_filters)

# now create the ArraySchema
schema = tiledb.ArraySchema(
domain=domain,
attrs=attrs,
cell_order=cell_order,
tile_order=tile_order,
coords_filters=coords_filters,
allows_duplicates=allows_duplicates,
sparse=sparse
)
Expand Down Expand Up @@ -302,6 +328,9 @@ def open_dataframe(uri):
>>> tiledb.object_type("iris.tldb")
'array'
"""
warnings.warn("open_dataframe is deprecated and will be removed in the next release",
DeprecationWarning)

import pandas as pd

# TODO support `distributed=True` option?
Expand Down Expand Up @@ -383,4 +412,4 @@ def from_csv(uri, csv_file, **kwargs):
df = pandas.read_csv(csv_file, **kwargs)

kwargs.update(tiledb_args)
from_dataframe(uri, df, **kwargs)
from_pandas(uri, df, **kwargs)
5 changes: 4 additions & 1 deletion tiledb/libtiledb.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4205,7 +4205,10 @@ cdef class DenseArrayImpl(Array):
cdef list values = list()

if isinstance(val, dict):
for (k, v) in val.items():
for attr_idx in range(self.schema.nattr):
attr = self.schema.attr(attr_idx)
k = attr.name
v = val[k]
attr = self.schema.attr(k)
attributes.append(attr._internal_name)
# object arrays are var-len and handled later
Expand Down
6 changes: 6 additions & 0 deletions tiledb/tests/test_libtiledb.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,6 +1125,12 @@ def test_multiple_attributes(self):
with tiledb.DenseArray(self.path("foo"), mode='w', ctx=ctx) as T:
T[:] = V

# check setting attribute in different order from Attr definition
# https://github.com/TileDB-Inc/TileDB-Py/issues/299
V2 = {"floats": V_floats, "ints": V_ints}
with tiledb.DenseArray(self.path("foo"), mode='w', ctx=ctx) as T:
T[:] = V

with tiledb.DenseArray(self.path("foo"), mode='r', ctx=ctx) as T:
R = T[:]
assert_array_equal(V["ints"], R["ints"])
Expand Down
23 changes: 19 additions & 4 deletions tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def test_csv_col_to_sparse_dims(self):
tmp_array2b = os.path.join(tmp_dir, "array2b")

# create a duplicate value
df.int_vals[0] = df.int_vals[1]
df.loc[0, 'int_vals'] = df.int_vals[1]
df.sort_values('int_vals', inplace=True)

df.to_csv(tmp_csv2, index=False)
Expand Down Expand Up @@ -345,11 +345,17 @@ def test_csv_schema_only(self):
df.sort_values('time', inplace=True)
df.to_csv(tmp_csv, index=False)

attrs_filters = tiledb.FilterList([tiledb.ZstdFilter(1)])
# from_dataframe default is 1, so use 7 here to check
# the arg is correctly parsed/passed
coords_filters = tiledb.FilterList([tiledb.ZstdFilter(7)])

tmp_array = os.path.join(tmp_dir, "array")
tiledb.from_csv(tmp_array, tmp_csv,
index_col=['time', 'double_range'],
parse_dates=['time'],
mode='schema_only')
mode='schema_only',
coords_filters=coords_filters)

t0, t1 = df.time.min(), df.time.max()

Expand All @@ -360,8 +366,9 @@ def test_csv_schema_only(self):
tiledb.Dim(name='double_range', domain=(-1000.0, 1000.0), tile=1.0, dtype='float64'),
]),
attrs=[
tiledb.Attr(name='int_vals', dtype='int64'),
tiledb.Attr(name='int_vals', dtype='int64', filters=attrs_filters),
],
coords_filters=coords_filters,
cell_order='row-major',
tile_order='row-major', sparse=True,
allows_duplicates=False)
Expand All @@ -371,4 +378,12 @@ def test_csv_schema_only(self):
self.assertEqual(array_nfiles, 3)

with tiledb.open(tmp_array) as A:
self.assertEqual(A.schema, ref_schema)
self.assertEqual(A.schema, ref_schema)

# TODO currently no equality check for filters
self.assertEqual(
A.schema.coords_filters[0].level, coords_filters[0].level
)
self.assertEqual(
A.schema.attr(0).filters[0].level, attrs_filters[0].level
)