Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #567 - Implement ods import #568

Merged
merged 2 commits into from
Oct 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
### Improvements

- The html format now supports importing from HTML content (#243)
- The ODS format now supports importing from .ods files (#567). The support is
still a bit experimental.

### Changes

Expand Down
11 changes: 9 additions & 2 deletions docs/formats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,19 @@ If a title has been set, it will be exported as the table caption.
ods
===

Export data in OpenDocument Spreadsheet format. The ``ods`` format is currently
export-only.
Import/export data in OpenDocument Spreadsheet format.

.. versionadded:: 3.6.0

Import functionality was added.

This format is optional, install Tablib with ``pip install "tablib[ods]"`` to
make the format available.

The ``import_set()`` method also supports a ``skip_lines`` parameter that you
can set to a number of lines that should be skipped before starting to read
data.

.. admonition:: Binary Warning

:class:`Dataset.ods` contains binary data, so make sure to write in binary mode::
Expand Down
98 changes: 98 additions & 0 deletions src/tablib/formats/_ods.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
""" Tablib - ODF Support.
"""

import datetime as dt
import numbers
from io import BytesIO

from odf import opendocument, style, table, text

import tablib

bold = style.Style(name="bold", family="paragraph")
bold.addElement(style.TextProperties(
fontweight="bold",
Expand Down Expand Up @@ -49,6 +52,93 @@ def export_book(cls, databook):
wb.save(stream)
return stream.getvalue()

@classmethod
def import_sheet(cls, dset, sheet, headers=True, skip_lines=0):
    """Populate dataset `dset` with sheet data.

    :param dset: dataset that receives the sheet title, headers and rows.
    :param sheet: sheet node; its ``name`` attribute becomes the dataset
        title and its ``table:table-row`` children supply the data.
    :param headers: when true, the first row after the skipped ones is
        stored as ``dset.headers`` instead of being appended as data.
    :param skip_lines: number of leading table rows to ignore.
    """

    dset.title = sheet.getAttribute('name')

    def is_real_cell(cell):
        # Ignore childless cells that carry a column-repeat count
        # (presumably filler written by spreadsheet editors -- confirm).
        return cell.hasChildNodes() or not cell.getAttribute('numbercolumnsrepeated')

    # Index table rows only. A sheet may also hold non-row children (for
    # example column definitions); counting those would shift `skip_lines`
    # and prevent the header row from ever being recognised.
    rows = [node for node in sheet.childNodes if node.tagName == 'table:table-row']

    for i, row in enumerate(rows):
        if i < skip_lines:
            continue
        row_vals = [cls.read_cell(cell) for cell in row.childNodes if is_real_cell(cell)]
        if not row_vals:
            # Rows without any real cell carry no data.
            continue
        if i == skip_lines and headers:
            dset.headers = row_vals
        else:
            # Pad ragged rows so every appended row matches the dataset width.
            if i > skip_lines and len(row_vals) < dset.width:
                row_vals += [''] * (dset.width - len(row_vals))
            dset.append(row_vals)

@classmethod
def read_cell(cls, cell, value_type=None):
    """Return the Python value held by an ODS cell node.

    :param cell: cell node, or one of its descendants during recursion.
    :param value_type: value type inherited from the enclosing cell; when
        ``None`` it is read from the node's ``valuetype`` attribute.
    :returns: a ``date``/``datetime``, ``time``, ``bool``, ``float`` or
        string depending on the value type; ``''`` for a childless node
        without any value; ``None`` when no child yields a value.
    :raises ValueError: if a date or time value has an unsupported format.
    """
    def convert_date(val):
        if 'T' in val:
            return dt.datetime.strptime(val, "%Y-%m-%dT%H:%M:%S")
        return dt.datetime.strptime(val, "%Y-%m-%d").date()

    def convert_time(val):
        # ODF stores times as ISO 8601 durations (e.g. PT14H30M00S); the
        # plain HH:MM:SS form is still accepted for files written that way.
        try:
            return dt.datetime.strptime(val, "PT%HH%MM%SS").time()
        except ValueError:
            return dt.datetime.strptime(val, "%H:%M:%S").time()

    if value_type is None:
        value_type = cell.getAttribute('valuetype')
    if value_type == 'date':
        date_value = cell.getAttribute('datevalue')
        if date_value:
            return convert_date(date_value)
    if value_type == 'time':
        time_value = cell.getAttribute('timevalue')
        return convert_time(time_value)
    if value_type == 'boolean':
        bool_value = cell.getAttribute('booleanvalue')
        return bool_value == 'true'
    if not cell.childNodes:
        # Leaf node: prefer its text data, then fall back to a `value` attribute.
        value = getattr(cell, 'data', None)
        if value is None:
            value = cell.getAttribute('value')
            if value is None:
                return ''
        if value_type == 'float':
            return float(value)
        if value_type == 'date':
            return convert_date(value)
        return value  # Any other type default to 'string'

    for subnode in cell.childNodes:
        value = cls.read_cell(subnode, value_type)
        # Only skip children that yield nothing; a numeric zero is a
        # legitimate value and must not be discarded.
        if value is not None and value != '':
            return value
    return None

@classmethod
def import_set(cls, dset, in_stream, headers=True, skip_lines=0):
    """Populate dataset `dset` from ODS stream.

    The dataset is wiped first, then every table found in the document's
    spreadsheet is fed to ``import_sheet`` with the given ``headers`` and
    ``skip_lines`` settings.
    """
    dset.wipe()

    document = opendocument.load(in_stream)
    for node in document.spreadsheet.childNodes:
        # Only `table` children are sheets; skip any other node.
        if node.qname[1] != 'table':
            continue
        cls.import_sheet(dset, node, headers, skip_lines)

@classmethod
def import_book(cls, dbook, in_stream, headers=True):
    """Populate databook `dbook` from ODS stream.

    The databook is wiped first; each table in the document's spreadsheet
    is then imported into a new dataset that is added to the book.
    """
    dbook.wipe()

    document = opendocument.load(in_stream)
    for node in document.spreadsheet.childNodes:
        # Only `table` children are sheets; skip any other node.
        if node.qname[1] == 'table':
            sheet_data = tablib.Dataset()
            cls.import_sheet(sheet_data, node, headers)
            dbook.add_sheet(sheet_data)

@classmethod
def dset_sheet(cls, dataset, ws):
"""Completes given worksheet from given Dataset."""
Expand All @@ -66,6 +156,14 @@ def dset_sheet(cls, dataset, ws):
for j, col in enumerate(row):
if isinstance(col, numbers.Number):
cell = table.TableCell(valuetype="float", value=col)
elif isinstance(col, dt.datetime):
cell = table.TableCell(
valuetype="date", value=col.strftime('%Y-%m-%dT%H:%M:%S')
)
elif isinstance(col, dt.date):
cell = table.TableCell(valuetype="date", datevalue=col.strftime('%Y-%m-%d'))
elif isinstance(col, dt.time):
cell = table.TableCell(valuetype="time", timevalue=col.strftime('%H:%M:%S'))
else:
cell = table.TableCell(valuetype="string")
cell.addElement(text.P(text=str(col), stylename=style))
Expand Down
Binary file added tests/files/book.ods
Binary file not shown.
Binary file added tests/files/ragged.ods
Binary file not shown.
Binary file added tests/files/unknown_value_type.ods
Binary file not shown.
65 changes: 54 additions & 11 deletions tests/test_tablib.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ def test_unknown_format(self):
with self.assertRaises(UnsupportedFormat):
data.export('??')
# A known format but uninstalled
del registry._formats['ods']
msg = (r"The 'ods' format is not available. You may want to install the "
"odfpy package \\(or `pip install \"tablib\\[ods\\]\"`\\).")
with self.assertRaisesRegex(UnsupportedFormat, msg):
data.export('ods')
saved_registry = registry._formats.copy()
try:
del registry._formats['ods']
msg = (r"The 'ods' format is not available. You may want to install the "
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
msg = (r"The 'ods' format is not available. You may want to install the "
msg = (r"The ODS format is not available. You may want to install the "

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we change here, then we must also change the generation of error message in Registry.get_format (replacing '{key}' by {key.upper()}). Would you suggest that? In this PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice but not essential! This or another is fine :)

"odfpy package \\(or `pip install \"tablib\\[ods\\]\"`\\).")
with self.assertRaisesRegex(UnsupportedFormat, msg):
data.export('ods')
finally:
registry._formats = saved_registry

def test_empty_append(self):
"""Verify append() correctly adds tuple with no headers."""
Expand Down Expand Up @@ -1103,13 +1107,52 @@ def test_tsv_export(self):


class ODSTests(BaseTestCase):
def test_ods_export_datatypes(self):
def test_ods_export_import_set(self):
date = datetime.date(2019, 10, 4)
date_time = datetime.datetime(2019, 10, 4, 12, 30, 8)
data.append(('string', '004', 42, 21.55, Decimal('34.5'), date_time))
data.headers = ('string', 'start0', 'integer', 'float', 'decimal', 'date/time')
# ODS is currently write-only, just test that output doesn't crash.
assert data.ods is not None
assert len(data.ods)
time = datetime.time(14, 30)
data.append(('string', '004', 42, 21.55, Decimal('34.5'), date, time, date_time))
data.headers = (
'string', 'start0', 'integer', 'float', 'decimal', 'date', 'time', 'date/time'
)
_ods = data.ods
data.ods = _ods
self.assertEqual(data.dict[0]['string'], 'string')
self.assertEqual(data.dict[0]['start0'], '004')
self.assertEqual(data.dict[0]['integer'], 42)
self.assertEqual(data.dict[0]['float'], 21.55)
self.assertEqual(data.dict[0]['decimal'], 34.5)
self.assertEqual(data.dict[0]['date'], date)
self.assertEqual(data.dict[0]['time'], time)
self.assertEqual(data.dict[0]['date/time'], date_time)

def test_ods_import_book(self):
ods_source = Path(__file__).parent / 'files' / 'book.ods'
with ods_source.open('rb') as fh:
dbook = tablib.Databook().load(fh, 'ods')
self.assertEqual(len(dbook.sheets()), 2)

def test_ods_import_set_skip_lines(self):
data.append(('garbage', 'line', ''))
data.append(('', '', ''))
data.append(('id', 'name', 'description'))
_ods = data.ods
new_data = tablib.Dataset().load(_ods, skip_lines=2)
self.assertEqual(new_data.headers, ['id', 'name', 'description'])

def test_ods_import_set_ragged(self):
ods_source = Path(__file__).parent / 'files' / 'ragged.ods'
with ods_source.open('rb') as fh:
dataset = tablib.Dataset().load(fh, 'ods')
self.assertEqual(dataset.pop(), (1, '', True, ''))

def test_ods_unknown_value_type(self):
# The ods file was manually edited to contain:
# <table:table-cell office:value-type="unknown" calcext:value-type="string">
ods_source = Path(__file__).parent / 'files' / 'unknown_value_type.ods'
with ods_source.open('rb') as fh:
dataset = tablib.Dataset().load(fh, 'ods')
self.assertEqual(dataset.pop(), ('abcd',))


class XLSTests(BaseTestCase):
Expand Down