Remove Unnecessary Function from readdbc Module (#18)
* chore(release): Add --build parameter to enable package publication with semantic-release

* refactor: Remove 'dbf_to_csvgz' function

* feat(module): Remove unnecessary functions from the readdbc module

* test: Enhance and refactor test suite

* feat(deps): Move pandas to dev group in pyproject.toml

* chore(docs): Update README

* Minor changes

* chore(docs): Fix requested changes in reviews

* Minor changes

* Update README.md
esloch committed Sep 15, 2023
1 parent 44c7121 commit b52999c
Showing 6 changed files with 218 additions and 151 deletions.
16 changes: 10 additions & 6 deletions .releaserc.json
@@ -3,9 +3,11 @@
"tagFormat": "${version}",
"plugins": [
[
"@semantic-release/commit-analyzer", {
"preset": "conventionalcommits"
}],
"@semantic-release/commit-analyzer",
{
"preset": "conventionalcommits"
}
],
[
"semantic-release-replace-plugin",
{
@@ -42,9 +44,11 @@
}
],
[
"@semantic-release/release-notes-generator", {
"@semantic-release/release-notes-generator",
{
"preset": "conventionalcommits"
}],
}
],
[
"@semantic-release/changelog",
{
@@ -56,7 +60,7 @@
"@semantic-release/exec",
{
"prepareCmd": "poetry build",
"publishCmd": "poetry publish"
"publishCmd": "poetry publish --build"
}
],
[
92 changes: 91 additions & 1 deletion README.md
@@ -1 +1,91 @@
# py-blast-dbf
# pyreaddbc

**pyreaddbc** is a Python library for working with [dBase database files](https://docs.fileformat.com/database/dbf/). Legacy systems from the Brazilian Ministry of Health still use the DBF and DBC formats to publish data. This package was developed to help [PySUS](https://github.com/AlertaDengue/pysus) extract data from these formats into more modern ones, and it can also be used to convert DBC files from any other source.


## Installation

You can install **pyreaddbc** using pip:

```bash
pip install pyreaddbc
```

## Usage

**Note**: *Extracting the DBF from a DBC may require specifying the encoding of the original data, if known.*
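When the encoding is not known, a rough guess can be made by decompressing the DBC and sampling the raw DBF bytes. A minimal sketch, assuming the `chardet` package is installed (it is a dev dependency of this project) and using the `dbc2dbf` helper from `pyreaddbc.readdbc`, which expects byte-string paths:

```python
import chardet
from pyreaddbc.readdbc import dbc2dbf

# Decompress the DBC to a plain DBF first (dbc2dbf expects byte-string paths).
dbc2dbf(b"LTPI2201.dbc", b"LTPI2201.dbf")

# Sample the start of the DBF and let chardet guess the encoding.
with open("LTPI2201.dbf", "rb") as f:
    guess = chardet.detect(f.read(50_000))

print(guess["encoding"])  # e.g. "ISO-8859-1"; pass this to read_dbc below
```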

### Reading DBC Files

To read a DBC file and convert it to a pandas DataFrame, use the `read_dbc` function:

```python
import pyreaddbc

df = pyreaddbc.read_dbc("LTPI2201.dbc", encoding="iso-8859-1")
```
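`read_dbc` also accepts a `raw` flag (default `False`); per its docstring, setting `raw=True` skips dbfread's type conversion, which can help when a file's values trip the converters:

```python
df_raw = pyreaddbc.read_dbc("LTPI2201.dbc", encoding="iso-8859-1", raw=True)
```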

### Exporting to CSV.GZ

To export a DataFrame to a compressed CSV file (CSV.GZ), you can use pandas:

```python
import pyreaddbc

df = pyreaddbc.read_dbc("./LTPI2201.dbc", encoding="iso-8859-1")
df.to_csv("LTPI2201.csv.gz", compression="gzip", index=False)
```
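To sanity-check the round trip, the compressed file can be read back with pandas; a minimal sketch (pandas infers gzip compression from the `.csv.gz` extension):

```python
import pandas as pd

df_check = pd.read_csv("LTPI2201.csv.gz")  # compression inferred from extension
print(df_check.shape)
```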

### Exporting to Parquet

To export a DataFrame to a Parquet file, you can use the `pyarrow` library:

```python
import pyreaddbc
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from pathlib import Path

# Read DBC file and convert to DataFrame
df = pyreaddbc.read_dbc("./LTPI2201.dbc", encoding="iso-8859-1")

# Export to CSV.GZ
df.to_csv("LTPI2201.csv.gz", compression="gzip", index=False)

# Export to Parquet (create the output directory first)
Path("parquets").mkdir(exist_ok=True)
pq.write_table(pa.Table.from_pandas(df), "parquets/LTPI2201.parquet")

# Read the Parquet files and decode DataFrame columns
parquet_dir = Path("parquets")
parquets = parquet_dir.glob("*.parquet")

chunks_list = [
    pd.read_parquet(str(f), engine="pyarrow") for f in parquets
]

# Concatenate DataFrames
df_parquet = pd.concat(chunks_list, ignore_index=True)

```
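A shorter alternative sketch, assuming `pyarrow` is installed: pandas' built-in `to_parquet` wraps the same Arrow conversion in one call.

```python
# Equivalent one-liner using pandas' own Parquet writer (pyarrow engine).
df.to_parquet("parquets/LTPI2201.parquet", engine="pyarrow", index=False)
```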
---

## License

[GNU Affero General Public License (AGPL-3.0)](./LICENSE)

This license ensures that the software remains open-source and free to use, modify, and distribute while requiring that any changes or enhancements made to the codebase are also made available to the community under the same terms.

## Acknowledgments


This program decompresses .dbc files to .dbf. The code is based on the work of Mark Adler <madler@alumni.caltech.edu> (zlib/blast) and Pablo Fonseca (https://github.com/eaglebh/blast-dbf).

[PySUS](https://github.com/AlertaDengue/PySUS) has further extended and adapted this code to
create **pyreaddbc**. The original work of Mark Adler and Pablo Fonseca is much appreciated for its contribution to this project.

**Note**: *pyreaddbc* is maintained with funding from [AlertaDengue](https://github.com/AlertaDengue).
98 changes: 61 additions & 37 deletions poetry.lock


5 changes: 3 additions & 2 deletions pyproject.toml
@@ -7,6 +7,9 @@ build-backend = "poetry.core.masonry.api"
script = "pyreaddbc/_build_readdbc.py"
generate-setup-file = false

[tool.poetry.group.dev.dependencies]
pandas = "^2.1.0"

[tool.poetry]
name = "pyreaddbc"
version = "1.1.0" # changed by semantic-release
@@ -40,7 +43,6 @@ packages = [

[tool.poetry.dependencies]
python = ">=3.9,<4"
pandas = ">=1.4.3,<2"
dbfread = ">=2.0.7,<3"
tqdm = ">=4.64.0,<5"
cffi = ">=1.15.1,<2"
@@ -55,7 +57,6 @@ pytest = ">=7.1.2"
commitizen = ">=2.32.2"
chardet = ">=5.2.0"


[tool.black]
line-length = 79
skip-string-normalization = true
64 changes: 0 additions & 64 deletions pyreaddbc/readdbc.py
@@ -3,46 +3,14 @@
by fccoelho
license: GPL V3 or Later
"""
import csv
import gzip
import os
from tempfile import NamedTemporaryFile

import pandas as pd
from dbfread import DBF
from tqdm import tqdm

try:
from pyreaddbc._readdbc import ffi, lib
except (ImportError, ModuleNotFoundError):
from ._readdbc import ffi, lib


def read_dbc(filename, encoding="utf-8", raw=False):
"""
Opens a DBC file and return its contents as a pandas
Dataframe.
:param filename: .dbc filename
:param encoding: encoding of the data
:param raw: |
Skip type conversion. Set it to True to avoid type conversion errors
:return: Pandas Dataframe.
"""
if isinstance(filename, str):
filename = filename.encode()
with NamedTemporaryFile(delete=False) as tf:
dbc2dbf(filename, tf.name.encode())
try:
dbf = DBF(tf.name, encoding=encoding, raw=raw)
df = pd.DataFrame(list(dbf))
except ValueError:
dbf = DBF(tf.name, encoding=encoding, raw=not raw)
df = pd.DataFrame(list(dbf))
os.unlink(tf.name)

return df


def dbc2dbf(infile, outfile):
"""
Converts a DBC file to a DBF database saving it to `outfile`.
@@ -57,35 +25,3 @@ def dbc2dbf(infile, outfile):
q = ffi.new("char[]", os.path.abspath(outfile))

lib.dbc2dbf([p], [q])


def read_dbc_dbf(filename: str):
if filename.endswith(("dbc", "DBC")):
df = read_dbc(filename, encoding="iso-8859-1")
elif filename.endswith(("DBF", "dbf")):
dbf = DBF(filename, encoding="iso-8859-1")
df = pd.DataFrame(list(dbf))

return df


def dbf_to_csvgz(filename: str, encoding: str = "iso-8859-1"):
"""
Streams a dbf file to gzipped CSV file. The Gzipped csv
will be saved on the same path but with a csv.gz extension.
:param filename: path to the dbf file
"""
data = DBF(filename, encoding=encoding, raw=False)
fn = os.path.splitext(filename)[0] + ".csv.gz"

with gzip.open(fn, "wt") as gzf:
for i, d in tqdm(
enumerate(data),
desc="Converting",
):
if i == 0:
csvwriter = csv.DictWriter(gzf, fieldnames=d.keys())
csvwriter.writeheader()
csvwriter.writerow(d)
else:
csvwriter.writerow(d)
