Skip to content

Commit

Permalink
update docs and metadata, plus stop trying to impute midnight jan 1st…
Browse files Browse the repository at this point in the history
… 2024
  • Loading branch information
cmgosnell committed Sep 24, 2024
1 parent 9b938c3 commit 91c8859
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 27 deletions.
1 change: 0 additions & 1 deletion docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ FERC Form 714
raw data sources and extends the data coverage through 2023. See :issue:`3809`
and :pr:`3842`.


Schema Changes
^^^^^^^^^^^^^^
* Added :ref:`out_eia__yearly_assn_plant_parts_plant_gen` table. This table associates
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
"""Add my cool lil respondent id glue tables and other 714 xbrl updates
Revision ID: 9902021cb3f1
Revision ID: 8fffc1d0399a
Revises: a93bdb8d4fbd
Create Date: 2024-09-23 17:59:44.690940
Create Date: 2024-09-24 09:28:45.862748
"""
from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = '9902021cb3f1'
revision = '8fffc1d0399a'
down_revision = 'a93bdb8d4fbd'
branch_labels = None
depends_on = None
Expand All @@ -19,24 +19,24 @@
def upgrade() -> None:
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('core_pudl__assn_ferc714_pudl_respondents',
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='PUDL-assigned identifying a respondent to FERC Form 714. This ID associates natively reported respondent IDs from the orignal CSV and XBRL data sources.'),
sa.PrimaryKeyConstraint('respondent_id_ferc714', name=op.f('pk_core_pudl__assn_ferc714_pudl_respondents'))
)
op.create_table('core_pudl__assn_ferc714_csv_pudl_respondents',
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.Column('respondent_id_ferc714_csv', sa.Integer(), nullable=False, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='PUDL-assigned identifying a respondent to FERC Form 714. This ID associates natively reported respondent IDs from the orignal CSV and XBRL data sources.'),
sa.Column('respondent_id_ferc714_csv', sa.Integer(), nullable=False, comment='FERC Form 714 respondent ID from CSV reported data - published from years: 2006-2020. This ID is linked to the newer years of reported XBRL data through the PUDL-assigned respondent_id_ferc714 ID. This ID was originally reported as respondent_id. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.ForeignKeyConstraint(['respondent_id_ferc714'], ['core_pudl__assn_ferc714_pudl_respondents.respondent_id_ferc714'], name=op.f('fk_core_pudl__assn_ferc714_csv_pudl_respondents_respondent_id_ferc714_core_pudl__assn_ferc714_pudl_respondents')),
sa.PrimaryKeyConstraint('respondent_id_ferc714', 'respondent_id_ferc714_csv', name=op.f('pk_core_pudl__assn_ferc714_csv_pudl_respondents'))
)
op.create_table('core_pudl__assn_ferc714_xbrl_pudl_respondents',
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.Column('respondent_id_ferc714_xbrl', sa.Text(), nullable=False, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.Column('respondent_id_ferc714', sa.Integer(), nullable=False, comment='PUDL-assigned identifying a respondent to FERC Form 714. This ID associates natively reported respondent IDs from the orignal CSV and XBRL data sources.'),
sa.Column('respondent_id_ferc714_xbrl', sa.Text(), nullable=False, comment='FERC Form 714 respondent ID from XBRL reported data - published from years: 2021-present. This ID is linked to the older years of reported CSV data through the PUDL-assigned respondent_id_ferc714 ID. This ID was originally reported as entity_id. Note that this ID does not correspond to FERC respondent IDs from other forms.'),
sa.ForeignKeyConstraint(['respondent_id_ferc714'], ['core_pudl__assn_ferc714_pudl_respondents.respondent_id_ferc714'], name=op.f('fk_core_pudl__assn_ferc714_xbrl_pudl_respondents_respondent_id_ferc714_core_pudl__assn_ferc714_pudl_respondents')),
sa.PrimaryKeyConstraint('respondent_id_ferc714', 'respondent_id_ferc714_xbrl', name=op.f('pk_core_pudl__assn_ferc714_xbrl_pudl_respondents'))
)
with op.batch_alter_table('core_ferc714__respondent_id', schema=None) as batch_op:
batch_op.add_column(sa.Column('respondent_id_ferc714_csv', sa.Integer(), nullable=True, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'))
batch_op.add_column(sa.Column('respondent_id_ferc714_xbrl', sa.Text(), nullable=True, comment='FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.'))
batch_op.add_column(sa.Column('respondent_id_ferc714_csv', sa.Integer(), nullable=True, comment='FERC Form 714 respondent ID from CSV reported data - published from years: 2006-2020. This ID is linked to the newer years of reported XBRL data through the PUDL-assigned respondent_id_ferc714 ID. This ID was originally reported as respondent_id. Note that this ID does not correspond to FERC respondent IDs from other forms.'))
batch_op.add_column(sa.Column('respondent_id_ferc714_xbrl', sa.Text(), nullable=True, comment='FERC Form 714 respondent ID from XBRL reported data - published from years: 2021-present. This ID is linked to the older years of reported CSV data through the PUDL-assigned respondent_id_ferc714 ID. This ID was originally reported as entity_id. Note that this ID does not correspond to FERC respondent IDs from other forms.'))
batch_op.create_foreign_key(batch_op.f('fk_core_ferc714__respondent_id_respondent_id_ferc714_core_pudl__assn_ferc714_pudl_respondents'), 'core_pudl__assn_ferc714_pudl_respondents', ['respondent_id_ferc714'], ['respondent_id_ferc714'])

with op.batch_alter_table('core_ferc714__yearly_planning_area_demand_forecast', schema=None) as batch_op:
Expand All @@ -45,9 +45,9 @@ def upgrade() -> None:
batch_op.add_column(sa.Column('net_demand_forecast_mwh', sa.Float(), nullable=True, comment='Net forecasted electricity demand for the specific period in megawatt-hours (MWh).'))
batch_op.drop_constraint('fk_core_ferc714__yearly_planning_area_demand_forecast_respondent_id_ferc714_core_ferc714__respondent_id', type_='foreignkey')
batch_op.create_foreign_key(batch_op.f('fk_core_ferc714__yearly_planning_area_demand_forecast_respondent_id_ferc714_core_pudl__assn_ferc714_pudl_respondents'), 'core_pudl__assn_ferc714_pudl_respondents', ['respondent_id_ferc714'], ['respondent_id_ferc714'])
batch_op.drop_column('winter_peak_demand_mw')
batch_op.drop_column('summer_peak_demand_mw')
batch_op.drop_column('net_demand_mwh')
batch_op.drop_column('winter_peak_demand_mw')

with op.batch_alter_table('out_ferc714__respondents_with_fips', schema=None) as batch_op:
batch_op.drop_constraint('fk_out_ferc714__respondents_with_fips_respondent_id_ferc714_core_ferc714__respondent_id', type_='foreignkey')
Expand All @@ -71,9 +71,9 @@ def downgrade() -> None:
batch_op.create_foreign_key('fk_out_ferc714__respondents_with_fips_respondent_id_ferc714_core_ferc714__respondent_id', 'core_ferc714__respondent_id', ['respondent_id_ferc714'], ['respondent_id_ferc714'])

with op.batch_alter_table('core_ferc714__yearly_planning_area_demand_forecast', schema=None) as batch_op:
batch_op.add_column(sa.Column('winter_peak_demand_mw', sa.FLOAT(), nullable=True))
batch_op.add_column(sa.Column('net_demand_mwh', sa.FLOAT(), nullable=True))
batch_op.add_column(sa.Column('summer_peak_demand_mw', sa.FLOAT(), nullable=True))
batch_op.add_column(sa.Column('winter_peak_demand_mw', sa.FLOAT(), nullable=True))
batch_op.drop_constraint(batch_op.f('fk_core_ferc714__yearly_planning_area_demand_forecast_respondent_id_ferc714_core_pudl__assn_ferc714_pudl_respondents'), type_='foreignkey')
batch_op.create_foreign_key('fk_core_ferc714__yearly_planning_area_demand_forecast_respondent_id_ferc714_core_ferc714__respondent_id', 'core_ferc714__respondent_id', ['respondent_id_ferc714'], ['respondent_id_ferc714'])
batch_op.drop_column('net_demand_forecast_mwh')
Expand Down
33 changes: 25 additions & 8 deletions src/pudl/analysis/state_demand.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,13 +289,24 @@ def load_hourly_demand_matrix_ferc714(
out_ferc714__hourly_planning_area_demand["datetime_utc"],
out_ferc714__hourly_planning_area_demand["utc_offset"],
)
# Remove the records that fall outside of the working years. Some
# respondents report a single record at midnight on January 1st of the
# following year (report_date.dt.year + 1), and
# impute_ferc714_hourly_demand_matrix processes the data one year at a
# time, so a year that contains only that one stray record should not be imputed.
report_year_mask = out_ferc714__hourly_planning_area_demand[
"datetime"
].dt.year.isin(pudl.settings.Ferc714Settings().years)
out_ferc714__hourly_planning_area_demand = out_ferc714__hourly_planning_area_demand[
report_year_mask
]
# Pivot to demand matrix: timestamps x respondents
matrix = out_ferc714__hourly_planning_area_demand.pivot(
index="datetime", columns="respondent_id_ferc714", values="demand_mwh"
)
# List timezone by year for each respondent
# List timezone by year for each respondent by the datetime
out_ferc714__hourly_planning_area_demand["year"] = (
out_ferc714__hourly_planning_area_demand["report_date"].dt.year
out_ferc714__hourly_planning_area_demand["datetime"].dt.year
)
utc_offset = out_ferc714__hourly_planning_area_demand.groupby(
["respondent_id_ferc714", "year"], as_index=False
Expand Down Expand Up @@ -395,12 +406,18 @@ def impute_ferc714_hourly_demand_matrix(df: pd.DataFrame) -> pd.DataFrame:
Copy of `df` with imputed values.
"""
results = []
for year, gdf in df.groupby(df.index.year):
logger.info(f"Imputing year {year}")
keep = df.columns[~gdf.isnull().all()]
tsi = pudl.analysis.timeseries_cleaning.Timeseries(gdf[keep])
result = tsi.to_dataframe(tsi.impute(method="tnn"), copy=False)
results.append(result)
# sort here and then don't sort in the groupby so we can process
# the newer years of data first. This is so we can see early if
# new data causes any failures.
df = df.sort_index(ascending=False)
for year, gdf in df.groupby(df.index.year, sort=False):
# skip any year that is not in the settings
if year in pudl.settings.Ferc714Settings().years:
logger.info(f"Imputing year {year}")
keep = df.columns[~gdf.isnull().all()]
tsi = pudl.analysis.timeseries_cleaning.Timeseries(gdf[keep])
result = tsi.to_dataframe(tsi.impute(method="tnn"), copy=False)
results.append(result)
return pd.concat(results)


Expand Down
21 changes: 18 additions & 3 deletions src/pudl/metadata/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3660,15 +3660,30 @@
},
"respondent_id_ferc714": {
"type": "integer",
"description": "FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.",
"description": (
"PUDL-assigned ID identifying a respondent to FERC Form 714. This ID associates "
"natively reported respondent IDs from the original CSV and XBRL data sources."
),
},
"respondent_id_ferc714_csv": {
"type": "integer",
"description": "FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.",
"description": (
"FERC Form 714 respondent ID from CSV reported data - published from years: 2006-2020. "
"This ID is linked to the newer years of reported XBRL data through the PUDL-assigned "
"respondent_id_ferc714 ID. "
"This ID was originally reported as respondent_id. "
"Note that this ID does not correspond to FERC respondent IDs from other forms."
),
},
"respondent_id_ferc714_xbrl": {
"type": "string",
"description": "FERC Form 714 respondent ID. Note that this ID does not correspond to FERC respondent IDs from other forms.",
"description": (
"FERC Form 714 respondent ID from XBRL reported data - published from years: 2021-present. "
"This ID is linked to the older years of reported CSV data through the PUDL-assigned "
"respondent_id_ferc714 ID. "
"This ID was originally reported as entity_id. "
"Note that this ID does not correspond to FERC respondent IDs from other forms."
),
},
"respondent_name_ferc714": {
"type": "string",
Expand Down
6 changes: 5 additions & 1 deletion src/pudl/metadata/resources/ferc714.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@
"valid format for the hour of a datetime, so we convert these T24 hours into "
"T00 of the next day. A smaller subset of the respondents reports the 24th hour "
"as the last second of the day - we also convert these records to the T00 of the "
"next day."
"next day.\n\nThis table includes three respondent ID columns: one from the "
"CSV raw source, one from the XBRL raw source and another that is PUDL-derived "
"that links those two source IDs together. This table has filled in source IDs "
"for all records so you can select the full timeseries for a given respondent from "
"any of these three IDs."
),
"schema": {
"fields": [
Expand Down
4 changes: 2 additions & 2 deletions src/pudl/transform/ferc714.py
Original file line number Diff line number Diff line change
Expand Up @@ -847,7 +847,7 @@ def convert_dates_to_zero_seconds_xbrl(xbrl: pd.DataFrame) -> pd.DataFrame:
There are a small amount of records which report the last "hour" of the day
as last second of the day, as opposed to T24 cleaned in
:func:`convert_dates_to_zero_offset_hours_xbrl` or T00 which is standard for a
:meth:`convert_dates_to_zero_offset_hours_xbrl` or T00 which is standard for a
datetime. This function finds these records and adds one second to them and
then ensures all of the records has 0's for seconds.
"""
Expand Down Expand Up @@ -1057,7 +1057,7 @@ def run(
process the combined datasets.
The main transforms include spot-fixing forecast years with
:func:`spot_fix_forecast_years_xbrl` and averaging out duplicate forecast values
:meth:`spot_fix_forecast_years_xbrl` and averaging out duplicate forecast values
for duplicate primary key rows in the CSV table.
"""
Expand Down

0 comments on commit 91c8859

Please sign in to comment.