Fix issues arising from pandas v2.1 & ferc-xbrl-extractor v1.1.1 #2854

Merged
Changes from 13 commits (20 commits total):
16c7fa7
Fix issues arising from stricter typing used in pandas 2.1
zaneselvans Sep 12, 2023
3d6912b
Try using small runner to see if that doesn't self-cancel
jdangerx Sep 13, 2023
3921f7e
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 14, 2023
316ea51
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 22, 2023
7d2d0fa
Updates some dependency versions
zaneselvans Sep 22, 2023
f5edb37
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 22, 2023
36955c1
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 22, 2023
1fd366b
Loosen explosion calculation error tolerance & use integer transmissi…
zaneselvans Sep 23, 2023
684e5d9
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 23, 2023
9150b7f
Revert to using large runner for ci-integration
zaneselvans Sep 23, 2023
84a8ce7
Bump ferc-xbrl-extractor to v1.1.1
zaneselvans Sep 24, 2023
8f0ac69
Use pd.to_numeric() to convert numeric strings to numeric dtypes.
zaneselvans Sep 24, 2023
b3bfa74
Remove obsolete references to ferc1_schema tests.
zaneselvans Sep 24, 2023
31e334a
Update fsspec requirement from <2023.6.1,>=2021.7 to >=2021.7,<2023.9.3
dependabot[bot] Sep 25, 2023
4e5d603
Update dask requirement from <2023.9.2,>=2021.8 to >=2021.8,<2023.9.3
dependabot[bot] Sep 25, 2023
5415fa1
Merge pull request #2887 from catalyst-cooperative/dependabot/pip/dev…
zaneselvans Sep 25, 2023
fa9002a
Merge pull request #2888 from catalyst-cooperative/dependabot/pip/dev…
zaneselvans Sep 25, 2023
ab5f366
Merge branch 'dev' into 2810-run-2021-ferc-1-data-through-new-more-co…
zaneselvans Sep 25, 2023
11d66ea
Merge branch '2810-run-2021-ferc-1-data-through-new-more-complete-ext…
zaneselvans Sep 25, 2023
86c40a1
Get rid of an unnecessary temporary variable.
zaneselvans Sep 25, 2023
69 changes: 27 additions & 42 deletions docs/dev/testing.rst
@@ -155,18 +155,19 @@ their own:
doc8 -> Check the documentation input files for syntactical correctness.
docs -> Remove old docs output and rebuild HTML from scratch with Sphinx
unit -> Run all the software unit tests.
ferc1_solo -> Test whether FERC 1 can be loaded into the PUDL database alone.
integration -> Run all software integration tests and process a full year of data.
minmax_rows -> Check that all outputs have the expected number of rows.
validate -> Run all data validation tests. This requires a complete PUDL DB.
ferc1_schema -> Verify FERC Form 1 DB schema are compatible for all years.
jupyter -> Ensure that designated Jupyter notebooks can be executed.
full_integration -> Run ETL and integration tests for all years and data sources.
full -> Run all CI checks, but for all years of data.
build -> Prepare Python source and binary packages for release.
testrelease -> Do a dry run of Python package release using the PyPI test server.
release -> Release the PUDL package to the production PyPI server.
nuke -> Nuke & recreate SQLite & Parquet outputs, then run all tests and
data validations against the new outputs.
get_unmapped_ids -> Make the raw FERC1 DB and generate a PUDL database with only EIA in
order to generate any unmapped IDs.
Comment from zaneselvans (Member, Author) on lines -158 to +167:
The test environments were generally out of date and I just cut-and-pasted in the current ones.

I noticed that the ferc1_schema test was gone entirely, and am wondering if we are no longer checking anywhere for inter-year FERC DBF database schema compatibility with the reference year, or whether that got moved to the more generic DBF extractor code? I poked around a bit but didn't see it immediately. @rousik


Note that not all of them literally run tests. For instance, to lint and build the
documentation you can run:

.. code-block:: console

@@ -321,41 +322,25 @@ with the construction of that database. For example, the output routines:

We also use this option to run the data validations.

Assuming you do want to run the ETL and build new databases as part of the test
you're running, the contents of that database are determined by an ETL settings
file. By default, the settings file that's used is
``test/settings/integration-test.yml``. But it's also possible to use a
different input file, generating a different database, and then run some
tests against that database.

For example, we test that FERC 1 data can be loaded into a PUDL database all
by itself by running the ETL tests with a settings file that includes only a
couple of FERC 1 tables for a single year. This is the ``ferc1_solo`` Tox
test environment:

.. code-block:: console

$ pytest --etl-settings=test/settings/ferc1-solo-test.yml test/integration/etl_test.py

Similarly, we use the ``test/settings/full-integration-test.yml`` settings file
to specify an exhaustive collection of input data, and then we run a test that
checks that the database schemas extracted from all historical FERC 1 databases
are compatible with each other. This is the ``ferc1_schema`` test:

.. code-block:: console

$ pytest --etl-settings test/settings/full-integration-test.yml test/integration/etl_test.py::test_ferc1_schema

The raw input data that all the tests use is ultimately coming from our
`archives on Zenodo <https://zenodo.org/communities/catalyst-cooperative>`__.
However, you can optionally tell the tests to look in different places for more
rapidly accessible caches of that data and to force the download of a fresh
copy (especially useful when you are testing the datastore functionality
specifically). By default, the tests will use the datastore that's part of your
local PUDL workspace.

For example, to run the ETL portion of the integration tests and download
fresh input data to a temporary datastore that's later deleted automatically:
Comment from zaneselvans (Member, Author) on lines -324 to -358:

Removed references to ferc1_schema and ferc1_solo environments, which no longer exist.

Assuming you do want to run the ETL and build new databases as part of the test you're
running, the contents of that database are determined by an ETL settings file. By
default, the settings file that's used is
``src/pudl/package_data/settings/etl_fast.yml``. But it's also possible to use a
different input file, generating a different database, and then run some tests against
that database.

We use the ``src/pudl/package_data/etl_full.yml`` settings file to specify an exhaustive
collection of input data.
zaneselvans marked this conversation as resolved.

The raw input data that all the tests use is ultimately coming from our `archives on
Zenodo <https://zenodo.org/communities/catalyst-cooperative>`__. However, you can
optionally tell the tests to look in different places for more rapidly accessible
caches of that data and to force the download of a fresh copy (especially useful when
you are testing the datastore functionality specifically). By default, the tests will
use the datastore that's part of your local PUDL workspace.

For example, to run the ETL portion of the integration tests and download fresh input
data to a temporary datastore that's later deleted automatically:

.. code-block:: console

184 changes: 0 additions & 184 deletions docs/pudl/pudl-etl.dot
Comment from zaneselvans (Member, Author):

Removed this out-of-date flowchart describing the ETL.

Reply from a reviewer (Member):

I guess the dagster UI already shows a lot of this information now!

This file was deleted.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"addfips>=0.4,<0.5",
"alembic>=1.10.3,<1.13",
"catalystcoop.dbfread>=3.0,<3.1",
"catalystcoop.ferc-xbrl-extractor>=1.1.0,<2",
"catalystcoop.ferc-xbrl-extractor>=1.1.1,<2",
"coloredlogs>=14.0,<15.1", # Dagster requires 14.0
"dagster-webserver>=1.4,<1.5", # 1.2.2 is first version to support Python 3.11
"dagster>=1.4,<1.5", # 1.2.2 is first version to support Python 3.11
@@ -28,7 +28,7 @@ dependencies = [
"matplotlib>=3.3,<3.9", # Should make this optional with a "viz" extras
"networkx>=2.2,<3.2",
"numpy>=1.18.5,!=1.23.0,<1.27",
"pandas[parquet,excel,fss,gcp,compression]>=2.0,<2.1",
"pandas[compression,excel,fss,gcp,parquet,performance]>=2,<2.2",
Comment from zaneselvans (Member, Author):

In addition to allowing pandas 2.1.x, I alphabetized these extras and included `performance`, since it resulted in a noticeable speedup in my testing in the ferc-xbrl-extractor repo (even though that didn't end up being the fix I used).

"pyarrow>=7,<13",
"pydantic[email]>=1.7,<2",
"python-dotenv>=0.21,<1.1",
2 changes: 1 addition & 1 deletion src/pudl/metadata/classes.py
@@ -572,7 +572,7 @@ class Field(Base):
Examples:
>>> field = Field(name='x', type='string', constraints={'enum': ['x', 'y']})
>>> field.to_pandas_dtype()
CategoricalDtype(categories=['x', 'y'], ordered=False)
CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object)
>>> field.to_sql()
Column('x', Enum('x', 'y'), CheckConstraint(...), table=None)
>>> field = Field.from_id('utility_id_eia')
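The doctest update above reflects a repr change in pandas itself: starting with pandas 2.1, `CategoricalDtype` includes `categories_dtype` in its repr. A minimal sketch of the difference:

```python
import pandas as pd

# pandas >=2.1 includes categories_dtype in the CategoricalDtype repr, so
# doctests that print the dtype need updating when bumping pandas.
dtype = pd.CategoricalDtype(categories=["x", "y"], ordered=False)
print(dtype)
# pandas 2.0: CategoricalDtype(categories=['x', 'y'], ordered=False)
# pandas 2.1: CategoricalDtype(categories=['x', 'y'], ordered=False, categories_dtype=object)
```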
2 changes: 1 addition & 1 deletion src/pudl/metadata/fields.py
@@ -1552,7 +1552,7 @@
"description": "For nuclear plants only, the unit number .One digit numeric. Nuclear plants are the only type of plants for which data are shown explicitly at the generating unit level.",
},
"num_transmission_circuits": {
"type": "number",
"type": "integer",
"description": "Number of circuits in a transmission line.",
},
"oil_fraction_cost": {
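Switching the field from "number" to "integer" changes the pandas dtype the column receives. A rough sketch of the difference (assuming the usual mapping of integer fields to pandas' nullable Int64, which is how integer columns with missing values are typically handled):

```python
import pandas as pd

# A count column with gaps: a "number" field lands in float64 (1.0, NaN),
# while an "integer" field maps to pandas' nullable Int64 (1, <NA>).
circuits = pd.Series([1, 2, None])          # inferred as float64
print(circuits.astype("float64").tolist())  # [1.0, 2.0, nan]
print(circuits.astype("Int64"))             # 1, 2, <NA> with dtype Int64
```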
2 changes: 1 addition & 1 deletion src/pudl/output/ferc1.py
@@ -44,7 +44,7 @@ class CalculationToleranceFerc1(BaseModel):
intertable_calculation_errors=0.20,
),
"balance_sheet_assets_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.85,
intertable_calculation_errors=1.00,
Comment from a reviewer (Member):

😢

Reply from zaneselvans (Member, Author):

It's okay man, we're back down to 60% on the other other FERC branch!

),
"balance_sheet_liabilities_ferc1": CalculationToleranceFerc1(
intertable_calculation_errors=0.07,
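For context, a rough sketch of what a tolerance like this means in practice (assumed enforcement logic, not the actual PUDL check):

```python
from pydantic import BaseModel

class CalculationToleranceFerc1(BaseModel):
    """Fraction of calculation errors tolerated before the check fails."""

    intertable_calculation_errors: float = 0.05

# A tolerance of 1.00 effectively disables the inter-table check for this table.
tol = CalculationToleranceFerc1(intertable_calculation_errors=1.00)
observed_error_rate = 0.85  # hypothetical fraction of mismatched calculations
assert observed_error_rate <= tol.intertable_calculation_errors
```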
19 changes: 10 additions & 9 deletions src/pudl/output/ferc714.py
@@ -135,14 +135,15 @@ def categorize_eia_code(
This function associates a ``respondent_type`` of ``utility``,
``balancing_authority`` or ``pandas.NA`` with each input ``eia_code`` using the
following rules:

* If an ``eia_code`` appears only in ``util_ids`` the ``respondent_type`` will be
  ``utility``.
* If ``eia_code`` appears only in ``ba_ids`` the ``respondent_type`` will be
  assigned ``balancing_authority``.
* If ``eia_code`` appears in neither set of IDs, ``respondent_type`` will be
  assigned ``pandas.NA``.
* If ``eia_code`` appears in both sets of IDs, then whichever ``respondent_type``
  has been selected with the ``priority`` flag will be assigned.

Note that the vast majority of ``balancing_authority_id_eia`` values also show up
as ``utility_id_eia`` values, but only a small subset of the ``utility_id_eia``
@@ -163,8 +164,7 @@ def categorize_eia_code(
"balancing_authority". The default is "balancing_authority".

Returns:
A DataFrame containing 2 columns: ``eia_code`` and ``respondent_type``.
"""
if priority == "balancing_authority":
primary = "balancing_authority"
@@ -226,14 +226,15 @@ def filled_balancing_authority_eia861(
# Build table of new rows
# Insert row for each target balancing authority-year pair
# missing from the original table, using the reference year as a template.
rows = []
rows: list[dict[str, Any]] = []
for ref, fix in zip(refs, ASSOCIATIONS):
for year in range(fix["to"][0], fix["to"][1] + 1):
key = (fix["id"], pd.Timestamp(year, 1, 1))
if key not in dfi.index:
rows.append({**ref, "report_date": key[1]})
# Append to original table
df = pd.concat([df, pd.DataFrame(rows)])
new_rows = apply_pudl_dtypes(pd.DataFrame(rows), group="eia")
Comment from a reviewer (Member):

super-nit: maybe `rows_dtyped` is a little clearer about what's new about these rows?

Or, later on line 279 you just skip the variable assignment altogether, which also seems legitimate - maybe we can do that here too?

Reply from zaneselvans (Member, Author):

Okay, went with the solution from below.

df = pd.concat([df, new_rows], axis="index")
# Remove balancing authorities treated as utilities
mask = df["balancing_authority_id_eia"].isin([util["id"] for util in UTILITIES])
return apply_pudl_dtypes(df[~mask], group="eia")
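Here and in the hunk below, casting the new rows before concatenation avoids dtype degradation. A minimal sketch of the issue (an assumed illustration with a made-up column name, not code from the PR):

```python
import pandas as pd

# Typed table plus rows built from dicts: the all-NA column in `rows` is
# inferred as object, and concatenating it drags the typed column to object.
df = pd.DataFrame({"ba_id": pd.array([1, 2], dtype="Int64")})
rows = pd.DataFrame([{"ba_id": None}])

print(pd.concat([df, rows]).dtypes)                             # ba_id: object
print(pd.concat([df, rows.astype({"ba_id": "Int64"})]).dtypes)  # ba_id: Int64
```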
@@ -275,7 +276,7 @@ def filled_balancing_authority_assn_eia861(
tables.append(ref.assign(report_date=key[1]))
replaced |= mask
# Append to original table with matching rows removed
df = pd.concat([df[~replaced], pd.concat(tables)])
df = pd.concat([df[~replaced], apply_pudl_dtypes(pd.concat(tables), group="eia")])
# Remove balancing authorities treated as utilities
mask = np.zeros(df.shape[0], dtype=bool)
tables = []
2 changes: 1 addition & 1 deletion src/pudl/transform/classes.py
@@ -552,7 +552,7 @@ def convert_units(col: pd.Series, params: UnitConversion) -> pd.Series:
new_name = col.name
if (col.name == new_name) & (params.from_unit != "") & (params.to_unit != ""):
logger.debug(f"Old and new column names are identical: {col.name}.")
col = (params.multiplier * col) + params.adder
col = (params.multiplier * pd.to_numeric(col)) + params.adder
Comment from zaneselvans (Member, Author):

There were half a dozen tables where `col` ended up being a numeric string like "0.0" as a result of ferc-xbrl-extractor==1.1.1 -- though I'm not sure what about the change in method over there affected the output type.

Reply from a reviewer (Member):

This was from 1.1.0 to 1.1.1? Surprising... this seems OK for now, but I'm definitely curious what changed.

It's totally possible that we'll keep changing the extractor, though - I don't think this is blocking.

Reply from zaneselvans (Member, Author):

There are notes in the PRs / issues, but the thing that changed is what kind of NULL value ends up being generated. In v1.1.0, with `gb.ffill().iloc[-1]`, we ended up with mixed str + np.nan => object columns that resulted in numeric values. In v1.1.1, using `gb.last()`, we ended up with mixed str + None => object columns that are interpreted as strings. 🤷🏼

col.name = new_name
return col
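A small sketch of the failure mode the `pd.to_numeric()` call guards against (illustrative values and column name, not from the PR):

```python
import pandas as pd

# Numeric strings arrive in an object-dtype column; arithmetic on them fails
# (multiplying a str by a float raises TypeError), so the column is converted
# explicitly before applying the unit conversion.
col = pd.Series(["0.0", "1.5", None], dtype="object", name="capacity_mw")

converted = pd.to_numeric(col)       # float64; None becomes NaN
scaled = (1000.0 * converted) + 0.0  # multiplier and adder now work as expected
```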
