Skip to content

Commit

Permalink
Load dagster assets for FERC1 validation tests
Browse files Browse the repository at this point in the history
This lets us test things that aren't in `PudlTabl`.
  • Loading branch information
jdangerx committed Sep 20, 2024
1 parent a3e55dc commit 2ab90f3
Showing 1 changed file with 73 additions and 46 deletions.
119 changes: 73 additions & 46 deletions test/validate/ferc1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@

import logging

import pandas as pd
import pytest

from pudl import validate as pv
from pudl.etl import defs
from pudl.metadata.classes import DataSource

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -46,9 +46,9 @@


@pytest.mark.parametrize("table_name", unique_record_tables)
def test_record_id_dupes(pudl_engine, table_name):
def test_record_id_dupes(table_name):
"""Verify that the generated ferc1 record_ids are unique."""
table = pd.read_sql(table_name, pudl_engine)
table = defs.load_asset_value(table_name)
n_dupes = table.record_id.duplicated().to_numpy().sum()

if n_dupes:
Expand All @@ -59,76 +59,103 @@ def test_record_id_dupes(pudl_engine, table_name):


@pytest.mark.parametrize(
"df_name,cols",
"asset_key,cols",
[
("fbp_ferc1", "all"),
("fuel_ferc1", "all"),
("plant_in_service_ferc1", "all"),
("plants_all_ferc1", "all"),
("plants_hydro_ferc1", "all"),
("plants_pumped_storage_ferc1", "all"),
("plants_small_ferc1", "all"),
("plants_steam_ferc1", "all"),
("pu_ferc1", "all"),
("purchased_power_ferc1", "all"),
("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", "all"),
("out_ferc1__yearly_steam_plants_fuel_sched402", "all"),
("out_ferc1__yearly_plant_in_service_sched204", "all"),
("out_ferc1__yearly_all_plants", "all"),
("out_ferc1__yearly_hydroelectric_plants_sched406", "all"),
("out_ferc1__yearly_pumped_storage_plants_sched408", "all"),
("out_ferc1__yearly_small_plants_sched410", "all"),
("out_ferc1__yearly_steam_plants_sched402", "all"),
("_out_ferc1__yearly_plants_utilities", "all"),
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", "all"),
],
)
def test_no_null_cols_ferc1(pudl_out_ferc1, live_dbs, cols, df_name):
def test_no_null_cols_ferc1(live_dbs, cols, asset_key):
"""Verify that output DataFrames have no entirely NULL columns."""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
pv.no_null_cols(
pudl_out_ferc1.__getattribute__(df_name)(), cols=cols, df_name=df_name
)
pv.no_null_cols(defs.load_asset_value(asset_key), cols=cols, df_name=asset_key)


@pytest.mark.parametrize(
"df_name,expected_rows",
"asset_key,expected_rows",
[
("fbp_ferc1", 26_947),
("fuel_ferc1", 51_238),
("plant_in_service_ferc1", 355_918),
("plants_all_ferc1", 58_520),
("plants_hydro_ferc1", 7_202),
("plants_pumped_storage_ferc1", 580),
("plants_small_ferc1", 17_763),
("plants_steam_ferc1", 32_975),
("pu_ferc1", 7_887),
("purchased_power_ferc1", 211_794),
("_out_ferc1__yearly_plants_utilities", 7_887),
("out_ferc1__yearly_all_plants", 58_520),
("out_ferc1__yearly_balance_sheet_assets_sched110", None),
("out_ferc1__yearly_balance_sheet_liabilities_sched110", None),
("out_ferc1__yearly_cash_flows_sched120", None),
("out_ferc1__yearly_depreciation_by_function_sched219", None),
("out_ferc1__yearly_depreciation_changes_sched219", None),
("out_ferc1__yearly_depreciation_summary_sched336", None),
("out_ferc1__yearly_energy_dispositions_sched401", None),
("out_ferc1__yearly_energy_sources_sched401", None),
("out_ferc1__yearly_hydroelectric_plants_sched406", 7_202),
("out_ferc1__yearly_income_statements_sched114", None),
("out_ferc1__yearly_operating_expenses_sched320", None),
("out_ferc1__yearly_operating_revenues_sched300", None),
("out_ferc1__yearly_other_regulatory_liabilities_sched278", None),
("out_ferc1__yearly_plant_in_service_sched204", 355_918),
("out_ferc1__yearly_pumped_storage_plants_sched408", 580),
("out_ferc1__yearly_purchased_power_and_exchanges_sched326", 211_794),
("out_ferc1__yearly_retained_earnings_sched118", None),
("out_ferc1__yearly_sales_by_rate_schedules_sched304", None),
("out_ferc1__yearly_small_plants_sched410", None),
("out_ferc1__yearly_steam_plants_fuel_by_plant_sched402", 26_947),
("out_ferc1__yearly_steam_plants_fuel_sched402", 51_238),
("out_ferc1__yearly_steam_plants_sched402", 32_975),
("out_ferc1__yearly_transmission_lines_sched422", None),
("out_ferc1__yearly_utility_plant_summary_sched200", None),
("out_ferc1__yearly_small_plants_sched410", 17_763),
],
)
def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
def test_minmax_rows(live_dbs, expected_rows, asset_key):
"""Verify that output DataFrames don't have too many or too few rows.
Args:
pudl_out_ferc1: A PudlTabl output object.
live_dbs: Boolean (whether we're using a live or testing DB).
expected_rows (int): Expected number of rows that the dataframe should
contain when all data is loaded and is output without aggregation.
df_name (str): Shorthand name identifying the dataframe, corresponding
to the name of the function used to pull it from the PudlTabl
output object.
asset_key (str): Shorthand name identifying the dataframe, corresponding
to the name of the asset.
"""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
if expected_rows is None:
pytest.skip("We don't actually have an expected value here yet.")
_ = (
pudl_out_ferc1.__getattribute__(df_name)()
defs.load_asset_value(asset_key)
.pipe(
pv.check_min_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
pv.check_min_rows,
expected_rows=expected_rows,
margin=0.0,
df_name=asset_key,
)
.pipe(
pv.check_max_rows, expected_rows=expected_rows, margin=0.0, df_name=df_name
pv.check_max_rows,
expected_rows=expected_rows,
margin=0.0,
df_name=asset_key,
)
)


@pytest.mark.parametrize(
"df_name,unique_subset",
"asset_key,unique_subset",
[
("pu_ferc1", ["utility_id_ferc1", "plant_name_ferc1"]),
("fbp_ferc1", ["report_year", "utility_id_ferc1", "plant_name_ferc1"]),
(
"plants_hydro_ferc1",
"_out_ferc1__yearly_plants_utilities",
["utility_id_ferc1", "plant_name_ferc1"],
),
(
"out_ferc1__yearly_steam_plants_fuel_by_plant_sched402",
["report_year", "utility_id_ferc1", "plant_name_ferc1"],
),
(
"out_ferc1__yearly_hydroelectric_plants_sched406",
[
"report_year",
"utility_id_ferc1",
Expand All @@ -137,7 +164,7 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
],
),
(
"plants_pumped_storage_ferc1",
"out_ferc1__yearly_pumped_storage_plants_sched408",
[
"report_year",
"utility_id_ferc1",
Expand All @@ -146,17 +173,17 @@ def test_minmax_rows(pudl_out_ferc1, live_dbs, expected_rows, df_name):
],
),
(
"plant_in_service_ferc1",
"out_ferc1__yearly_plant_in_service_sched204",
["report_year", "utility_id_ferc1", "ferc_account_label"],
),
],
)
def test_unique_rows_ferc1(pudl_out_ferc1, live_dbs, df_name, unique_subset):
def test_unique_rows_ferc1(live_dbs, asset_key, unique_subset):
"""Test whether dataframe has unique records within a subset of columns."""
if not live_dbs:
pytest.skip("Data validation only works with a live PUDL DB.")
pv.check_unique_rows(
pudl_out_ferc1.__getattribute__(df_name)(),
defs.load_asset_value(asset_key),
subset=unique_subset,
df_name=df_name,
df_name=asset_key,
)

0 comments on commit 2ab90f3

Please sign in to comment.