From b52999ccb0967bf2cae6437a3b036c4240114302 Mon Sep 17 00:00:00 2001 From: Sandro Loch Date: Fri, 15 Sep 2023 13:47:35 -0300 Subject: [PATCH] Remove Unnecessary Function from readdbc Module (#18) * chore(release): Add --build parameter to enable package publication with semantic-release * refactor: Remove 'dbf_to_csvgz' function * feat(module): Remove unnecessary functions from the readdbc module * test: Enhance and refactor test suite * feat(deps): Move pandas to dev group in pyproject.toml * chore(docs): Update README * Minor changes * chore(docs): Fix requested changes in reviews * Minor changes * Update README.md --- .releaserc.json | 16 ++++--- README.md | 92 +++++++++++++++++++++++++++++++++++++- poetry.lock | 98 +++++++++++++++++++++++++---------------- pyproject.toml | 5 ++- pyreaddbc/readdbc.py | 64 --------------------------- tests/test_pyreaddbc.py | 94 ++++++++++++++++++++++----------------- 6 files changed, 218 insertions(+), 151 deletions(-) diff --git a/.releaserc.json b/.releaserc.json index c054dbc..2926c31 100644 --- a/.releaserc.json +++ b/.releaserc.json @@ -3,9 +3,11 @@ "tagFormat": "${version}", "plugins": [ [ - "@semantic-release/commit-analyzer", { - "preset": "conventionalcommits" - }], + "@semantic-release/commit-analyzer", + { + "preset": "conventionalcommits" + } + ], [ "semantic-release-replace-plugin", { @@ -42,9 +44,11 @@ } ], [ - "@semantic-release/release-notes-generator", { + "@semantic-release/release-notes-generator", + { "preset": "conventionalcommits" - }], + } + ], [ "@semantic-release/changelog", { @@ -56,7 +60,7 @@ "@semantic-release/exec", { "prepareCmd": "poetry build", - "publishCmd": "poetry publish" + "publishCmd": "poetry publish --build" } ], [ diff --git a/README.md b/README.md index 76f1287..cda6153 100644 --- a/README.md +++ b/README.md @@ -1 +1,91 @@ -# py-blast-dbf +# pyreaddbc + +**pyreaddbc** is a Python library for working with [DBase database file](https://docs.fileformat.com/database/dbf/). 
Legacy systems from the Brazilian Ministry of Health still use DBF and DBC formats to publicize data. This package was developed to help [PySUS](https://github.com/AlertaDengue/pysus) extract data from these formats into more modern ones. Pyreaddbc can also be used to convert DBC files from any other source.
and free to use, modify, and distribute while requiring that any changes or enhancements made to the codebase are also made available to the community under the same terms. + + +Acknowledge
+============ +
+This program decompresses .dbc files to .dbf. This code is based on the work +of Mark Adler (zlib/blast) and Pablo Fonseca +(https://github.com/eaglebh/blast-dbf).
"sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"}, - {file = "pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"}, - {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"}, - {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"}, - {file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"}, - {file = "pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"}, - {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"}, - {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"}, + {file = "pandas-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:40dd20439ff94f1b2ed55b393ecee9cb6f3b08104c2c40b0cb7186a2f0046242"}, + {file = "pandas-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d4f38e4fedeba580285eaac7ede4f686c6701a9e618d8a857b138a126d067f2f"}, + {file = 
"pandas-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e6a0fe052cf27ceb29be9429428b4918f3740e37ff185658f40d8702f0b3e09"}, + {file = "pandas-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d81e1813191070440d4c7a413cb673052b3b4a984ffd86b8dd468c45742d3cc"}, + {file = "pandas-2.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:eb20252720b1cc1b7d0b2879ffc7e0542dd568f24d7c4b2347cb035206936421"}, + {file = "pandas-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:38f74ef7ebc0ffb43b3d633e23d74882bce7e27bfa09607f3c5d3e03ffd9a4a5"}, + {file = "pandas-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cda72cc8c4761c8f1d97b169661f23a86b16fdb240bdc341173aee17e4d6cedd"}, + {file = "pandas-2.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d97daeac0db8c993420b10da4f5f5b39b01fc9ca689a17844e07c0a35ac96b4b"}, + {file = "pandas-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8c58b1113892e0c8078f006a167cc210a92bdae23322bb4614f2f0b7a4b510f"}, + {file = "pandas-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:629124923bcf798965b054a540f9ccdfd60f71361255c81fa1ecd94a904b9dd3"}, + {file = "pandas-2.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:70cf866af3ab346a10debba8ea78077cf3a8cd14bd5e4bed3d41555a3280041c"}, + {file = "pandas-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:d53c8c1001f6a192ff1de1efe03b31a423d0eee2e9e855e69d004308e046e694"}, + {file = "pandas-2.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:86f100b3876b8c6d1a2c66207288ead435dc71041ee4aea789e55ef0e06408cb"}, + {file = "pandas-2.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28f330845ad21c11db51e02d8d69acc9035edfd1116926ff7245c7215db57957"}, + {file = "pandas-2.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9a6ccf0963db88f9b12df6720e55f337447aea217f426a22d71f4213a3099a6"}, + {file = 
"pandas-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d99e678180bc59b0c9443314297bddce4ad35727a1a2656dbe585fd78710b3b9"}, + {file = "pandas-2.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b31da36d376d50a1a492efb18097b9101bdbd8b3fbb3f49006e02d4495d4c644"}, + {file = "pandas-2.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0164b85937707ec7f70b34a6c3a578dbf0f50787f910f21ca3b26a7fd3363437"}, + {file = "pandas-2.1.0.tar.gz", hash = "sha256:62c24c7fc59e42b775ce0679cfa7b14a5f9bfb7643cfbe708c960699e05fb918"}, ] [package.dependencies] numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] -python-dateutil = ">=2.8.1" +python-dateutil = ">=2.8.2" pytz = ">=2020.1" +tzdata = ">=2022.1" [package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +all = ["PyQt5 (>=5.15.6)", "SQLAlchemy (>=1.4.36)", "beautifulsoup4 (>=4.11.1)", "bottleneck (>=1.3.4)", "dataframe-api-compat (>=0.1.7)", "fastparquet (>=0.8.1)", "fsspec (>=2022.05.0)", "gcsfs (>=2022.05.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.8.0)", "matplotlib (>=3.6.1)", "numba (>=0.55.2)", "numexpr (>=2.8.0)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pandas-gbq (>=0.17.5)", "psycopg2 (>=2.9.3)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.5)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "pyxlsb (>=1.0.9)", "qtpy (>=2.2.0)", "s3fs (>=2022.05.0)", "scipy (>=1.8.1)", "tables (>=3.7.0)", "tabulate (>=0.8.10)", "xarray (>=2022.03.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)", "zstandard (>=0.17.0)"] +aws = ["s3fs (>=2022.05.0)"] +clipboard = ["PyQt5 (>=5.15.6)", "qtpy (>=2.2.0)"] +compression = ["zstandard (>=0.17.0)"] 
+computation = ["scipy (>=1.8.1)", "xarray (>=2022.03.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.10)", "pyxlsb (>=1.0.9)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2022.05.0)"] +gcp = ["gcsfs (>=2022.05.0)", "pandas-gbq (>=0.17.5)"] +hdf5 = ["tables (>=3.7.0)"] +html = ["beautifulsoup4 (>=4.11.1)", "html5lib (>=1.1)", "lxml (>=4.8.0)"] +mysql = ["SQLAlchemy (>=1.4.36)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.8.10)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.4)", "numba (>=0.55.2)", "numexpr (>=2.8.0)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.36)", "psycopg2 (>=2.9.3)"] +spss = ["pyreadstat (>=1.1.5)"] +sql-other = ["SQLAlchemy (>=1.4.36)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.8.0)"] [[package]] name = "pathspec" @@ -949,6 +962,17 @@ files = [ {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "virtualenv" version = "20.24.5" @@ -998,4 +1022,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.9,<4" -content-hash = "b0a6d9346d396fae1088fd9c4c9b255aff6ab3b3124295f9f6476d33dfc28ba3" +content-hash = "ce7e16b9b0e8dad562929f6420158f8255cca0f46219eb2eb462ff32d1cce552" diff --git a/pyproject.toml 
b/pyproject.toml index 577ea97..5bfad0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,9 @@ build-backend = "poetry.core.masonry.api" script = "pyreaddbc/_build_readdbc.py" generate-setup-file = false +[tool.poetry.group.dev.dependencies] +pandas = "^2.1.0" + [tool.poetry] name = "pyreaddbc" version = "1.1.0" # changed by semantic-release @@ -40,7 +43,6 @@ packages = [ [tool.poetry.dependencies] python = ">=3.9,<4" -pandas = ">=1.4.3,<2" dbfread = ">=2.0.7,<3" tqdm = ">=4.64.0,<5" cffi = ">=1.15.1,<2" @@ -55,7 +57,6 @@ pytest = ">=7.1.2" commitizen = ">=2.32.2" chardet = ">=5.2.0" - [tool.black] line-length = 79 skip-string-normalization = true diff --git a/pyreaddbc/readdbc.py b/pyreaddbc/readdbc.py index c177b7d..8cd1b05 100644 --- a/pyreaddbc/readdbc.py +++ b/pyreaddbc/readdbc.py @@ -3,14 +3,7 @@ by fccoelho license: GPL V3 or Later """ -import csv -import gzip import os -from tempfile import NamedTemporaryFile - -import pandas as pd -from dbfread import DBF -from tqdm import tqdm try: from pyreaddbc._readdbc import ffi, lib @@ -18,31 +11,6 @@ from ._readdbc import ffi, lib -def read_dbc(filename, encoding="utf-8", raw=False): - """ - Opens a DBC file and return its contents as a pandas - Dataframe. - :param filename: .dbc filename - :param encoding: encoding of the data - :param raw: | - Skip type conversion. Set it to True to avoid type conversion errors - :return: Pandas Dataframe. - """ - if isinstance(filename, str): - filename = filename.encode() - with NamedTemporaryFile(delete=False) as tf: - dbc2dbf(filename, tf.name.encode()) - try: - dbf = DBF(tf.name, encoding=encoding, raw=raw) - df = pd.DataFrame(list(dbf)) - except ValueError: - dbf = DBF(tf.name, encoding=encoding, raw=not raw) - df = pd.DataFrame(list(dbf)) - os.unlink(tf.name) - - return df - - def dbc2dbf(infile, outfile): """ Converts a DBC file to a DBF database saving it to `outfile`. 
@@ -57,35 +25,3 @@ def dbc2dbf(infile, outfile): q = ffi.new("char[]", os.path.abspath(outfile)) lib.dbc2dbf([p], [q]) - - -def read_dbc_dbf(filename: str): - if filename.endswith(("dbc", "DBC")): - df = read_dbc(filename, encoding="iso-8859-1") - elif filename.endswith(("DBF", "dbf")): - dbf = DBF(filename, encoding="iso-8859-1") - df = pd.DataFrame(list(dbf)) - - return df - - -def dbf_to_csvgz(filename: str, encoding: str = "iso-8859-1"): - """ - Streams a dbf file to gzipped CSV file. The Gzipped csv - will be saved on the same path but with a csv.gz extension. - :param filename: path to the dbf file - """ - data = DBF(filename, encoding=encoding, raw=False) - fn = os.path.splitext(filename)[0] + ".csv.gz" - - with gzip.open(fn, "wt") as gzf: - for i, d in tqdm( - enumerate(data), - desc="Converting", - ): - if i == 0: - csvwriter = csv.DictWriter(gzf, fieldnames=d.keys()) - csvwriter.writeheader() - csvwriter.writerow(d) - else: - csvwriter.writerow(d) diff --git a/tests/test_pyreaddbc.py b/tests/test_pyreaddbc.py index f744a91..ea698fd 100644 --- a/tests/test_pyreaddbc.py +++ b/tests/test_pyreaddbc.py @@ -1,17 +1,67 @@ -import gzip import os from pathlib import Path +from tempfile import NamedTemporaryFile -import chardet import pandas as pd import pytest +from dbfread import DBF -from pyreaddbc import dbc2dbf, dbf_to_csvgz, read_dbc, read_dbc_dbf +from pyreaddbc import dbc2dbf path_root = Path(__file__).resolve().parent.parent data_files = path_root / "tests/data" +def read_dbc(filename, encoding="utf-8", raw=False): + """ + Opens a DBC file and returns its contents as a pandas DataFrame. + + Parameters: + filename (str): + The name of the .dbc file. + encoding (str, optional): + The encoding of the data (default is "utf-8"). + raw (bool, optional): + If True, skips type conversion to avoid + type conversion errors (default is False). + + Returns: + pandas.DataFrame: A DataFrame containing the data from the DBC file. 
+ """ + + if isinstance(filename, str): + filename = filename.encode() + with NamedTemporaryFile(delete=False) as tf: + dbc2dbf(filename, tf.name.encode()) + try: + dbf = DBF(tf.name, encoding=encoding, raw=raw) + df = pd.DataFrame(list(dbf)) + except ValueError: + dbf = DBF(tf.name, encoding=encoding, raw=not raw) + df = pd.DataFrame(list(dbf)) + os.unlink(tf.name) + return df + + +def read_dbc_dbf(filename: str) -> pd.DataFrame: + """ + Read a DBC or DBF file and return its contents as a pandas DataFrame. + + Parameters: + filename (str): The name of the file to read. + + Returns: + pd.DataFrame: A DataFrame containing the data from the file. + """ + if filename.endswith(("dbc", "DBC")): + df = read_dbc(filename, encoding="iso-8859-1") + elif filename.endswith(("DBF", "dbf")): + dbf = DBF(filename, encoding="iso-8859-1") + df = pd.DataFrame(list(dbf)) + return df + + +# DATASUS Databases db_tests = [ "ZIKABR21", "STPI2206", @@ -36,42 +86,6 @@ def test_dbc2dbf(db_test): assert_dataframe_valid(df) -@pytest.mark.parametrize("db_test", db_tests) -def test_read_dbc_dbf(db_test): - dbc_file = str(data_files / f"{db_test}.dbc") - df = read_dbc_dbf(dbc_file) - assert_dataframe_valid(df) - - -@pytest.mark.parametrize("db_test", db_tests) -def test_dbf_to_csvgz(db_test): - temp_dbf_file = str(data_files / f"{db_test}.dbf") - temp_csvgz_file = str(data_files / f"{db_test}.csv.gz") - dbf_to_csvgz(temp_dbf_file, encoding='iso-8859-1') - assert os.path.isfile(temp_csvgz_file) - with gzip.open(temp_csvgz_file, "rt") as gzfile: - df = pd.read_csv(gzfile) - assert isinstance(df, pd.DataFrame) - assert not df.empty - assert len(df.columns) > 2 - assert len(df) > 0 - - -@pytest.mark.parametrize("db_test", db_tests) -@pytest.mark.skipif -def test_encoding(db_test): - dbc_file = str(data_files / f"{db_test}.dbc") - common_encodings = [ - 'utf-8', - 'iso-8859-1', - 'cp1252', - 'Windows-1252', - ] # Add more if needed - - detected_encoding = chardet.detect(open(dbc_file, 
'rb').read())['encoding'] - assert detected_encoding in common_encodings - - @pytest.mark.parametrize("db_test", db_tests) def test_dbc_file_header(db_test): dbc_file = data_files / f"{db_test}.dbc" @@ -92,7 +106,5 @@ def test_dbc_file_header(db_test): def assert_dataframe_valid(df): assert isinstance(df, pd.DataFrame) assert not df.empty - assert len(df.columns) > 0 - assert len(df) > 0 assert df.shape[0] > 0 assert df.shape[1] > 0