From f5bfc6235f8f621a0ad43b130745151c393123e9 Mon Sep 17 00:00:00 2001
From: Christina Gosnell
Date: Tue, 17 Sep 2024 13:38:12 -0400
Subject: [PATCH] yay add the hour into the csv report_date early so i'm not
 oopsies losing all the report_dates

plus lots of documentation
---
 src/pudl/transform/ferc714.py | 170 +++++++++++++++++++---------------
 1 file changed, 94 insertions(+), 76 deletions(-)

diff --git a/src/pudl/transform/ferc714.py b/src/pudl/transform/ferc714.py
index c86de74ac..4a80bf1d5 100644
--- a/src/pudl/transform/ferc714.py
+++ b/src/pudl/transform/ferc714.py
@@ -15,7 +15,6 @@
 from pudl.metadata import PUDL_PACKAGE
 from pudl.transform.classes import (
     RenameColumns,
-    TransformParams,
     rename_columns,
 )
 
@@ -27,7 +26,7 @@
 
 
 # More detailed fixes on a per respondent basis
-OFFSET_CODE_FIXES = {
+TIMEZONE_OFFSET_CODE_FIXES = {
     102: {"CPT": "CST"},
     110: {"CPT": "EST"},
     115: {"MS": "MST"},
@@ -189,7 +188,7 @@
     "C011399": {np.nan: "PST"},  # this was just one lil empty guy
 }
 
-OFFSET_CODE_FIXES_BY_YEAR = [
+TIMEZONE_OFFSET_CODE_FIXES_BY_YEAR = [
     {"respondent_id_ferc714": 139, "report_year": 2006, "utc_offset_code": "PST"},
     {"respondent_id_ferc714": 235, "report_year": 2015, "utc_offset_code": "MST"},
     {"respondent_id_ferc714": 289, "report_year": 2011, "utc_offset_code": "CST"},
@@ -207,7 +206,7 @@
 ]
 """Fake respondent IDs for database test entities."""
 
-OFFSET_CODES = {
+TIMEZONE_OFFSET_CODES = {
     "EST": pd.Timedelta(-5, unit="hours"),  # Eastern Standard
     "EDT": pd.Timedelta(-5, unit="hours"),  # Eastern Daylight
     "CST": pd.Timedelta(-6, unit="hours"),  # Central Standard
@@ -222,15 +221,14 @@
 }
 """A mapping of timezone offset codes to Timedelta offsets from UTC.
 
-from one year to the next, and these result in duplicate records, which are Note that
-the FERC 714 instructions state that all hourly demand is to be reported in STANDARD
-time for whatever timezone is being used. Even though many respondents use daylight
-savings / standard time abbreviations, a large majority do appear to conform to using a
-single UTC offset throughout the year. There are 6 instances in which the timezone
-associated with reporting changed dropped.
+Note that the FERC 714 instructions state that all hourly demand is to be reported
+in STANDARD time for whatever timezone is being used. Even though many respondents
+use daylight savings / standard time abbreviations, a large majority do appear to
+conform to using a single UTC offset throughout the year. There are 6 instances in
+which the timezone associated with reporting changed from one year to the next;
+these result in duplicate records, which are dropped.
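+
+For example (an illustrative doctest; values follow from the mapping above),
+the daylight codes intentionally map to the same standard-time offset, per the
+FERC instructions:
+
+    >>> TIMEZONE_OFFSET_CODES["EDT"] == TIMEZONE_OFFSET_CODES["EST"]
+    True
+    >>> TIMEZONE_OFFSET_CODES["EST"]
+    Timedelta('-1 days +19:00:00')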
 """
 
-TZ_CODES = {
+TIMEZONE_CODES = {
     "EST": "America/New_York",
     "EDT": "America/New_York",
     "CST": "America/Chicago",
@@ -305,17 +303,6 @@
 }
 
 
-class RenameColumnsFerc714(TransformParams):
-    """Dictionaries for renaming either XBRL or CSV derived FERC 714 columns.
-
-    TODO: Determine if this is helpful/worth it. I think it'll only be if there are
-    a bunch of share params to validate upfront.
-    """
-
-    csv: RenameColumns = RenameColumns()
-    xbrl: RenameColumns = RenameColumns()
-
-
 ##############################################################################
 # Internal helper functions.
 ##############################################################################
@@ -420,15 +407,26 @@ def run(
         raw_xbrl_duration: pd.DataFrame,
         raw_xbrl_instant: pd.DataFrame,
     ) -> pd.DataFrame:
-        """Build the :ref:`out_ferc714__hourly_planning_area_demand` asset."""
+        """Build the :ref:`out_ferc714__hourly_planning_area_demand` asset.
+
+        To transform this table we have to process the instant and duration XBRL
+        tables so we can merge them together into a single XBRL dataframe. We
+        also have to process the CSV data so we can concatenate it with the XBRL
+        data. Then we can process all of the data together.
+
+        For both the CSV and XBRL data, the main transforms clean up the
+        timestamps, resulting in timestamps that are in a datetime format and
+        are nearly continuous for every respondent.
+
+        Once the CSV and XBRL data are concatenated together, the transforms are
+        mostly focused on cleaning the timezone codes reported to FERC and then
+        using those codes to convert all of the timestamps into UTC datetimes.
+
+        The outcome here is a nearly continuous and non-duplicative time series.
+        """
         table_name = "out_ferc714__hourly_planning_area_demand"
-        # CSV STUFF
-        csv = (
-            _pre_process_csv(raw_csv, table_name=table_name)
-            .pipe(_map_respondent_id_ferc714, "csv")
-            .pipe(cls.melt_hourx_columns_csv)
-            .pipe(cls.parse_date_strings_csv, datetime_format="%m/%d/%Y")
-        )
         # XBRL STUFF
         duration_xbrl = cls.remove_yearly_records_duration_xbrl(raw_xbrl_duration)
         xbrl = (
@@ -441,35 +439,47 @@ def run(
             )
             .pipe(_map_respondent_id_ferc714, "xbrl")
             .pipe(cls.convert_dates_to_zero_offset_hours_xbrl)
-            .pipe(cls.parse_date_strings_xbrl)
+            .astype({"report_date": "datetime64[ns]"})
             .pipe(cls.convert_dates_to_zero_seconds_xbrl)
-            .pipe(cls.ensure_dates_are_complete_and_unique_xbrl)
+            .pipe(cls.ensure_dates_are_continuous, source="xbrl")
+        )
+        # CSV STUFF
+        csv = (
+            _pre_process_csv(raw_csv, table_name=table_name)
+            .pipe(_map_respondent_id_ferc714, "csv")
+            .pipe(cls.melt_hourx_columns_csv)
+            .pipe(cls.parse_date_strings_csv)
+            .pipe(cls.ensure_dates_are_continuous, source="csv")
         )
         # CONCATED STUFF
         df = (
             pd.concat([csv, xbrl])
             .assign(
                 utc_offset_code=lambda x: cls.standardize_offset_codes(
-                    x, OFFSET_CODE_FIXES
+                    x, TIMEZONE_OFFSET_CODE_FIXES
                 )
             )
             .pipe(cls.clean_utc_code_offsets_and_set_timezone)
             .pipe(cls.drop_missing_utc_offset)
             .pipe(cls.construct_utc_datetime)
+            .pipe(cls.ensure_non_duplicated_datetimes)
             .pipe(cls.spot_fix_values)
             # Convert report_date to first day of year
             .assign(
                 report_date=lambda x: x.report_date.dt.to_period("Y").dt.to_timestamp()
            )
-            .pipe(_post_process, table_name=table_name)
        )
        return df

    @staticmethod
    def melt_hourx_columns_csv(df):
-        """Melt hourX columns into hours."""
-        # Almost all 25th hours are unusable (0.0 or daily totals),
-        # and they shouldn't really exist at all based on FERC instructions.
+        """Melt hourX columns into hours.
+
+        Some instances of the CSVs include a 25th hour. We drop those entirely
+        because almost all of them are unusable (0.0 or daily totals), and they
+        shouldn't really exist at all based on FERC instructions.
+        """
        df = df.drop(columns="hour25")

        # Melt daily rows with 24 demands to hourly rows with single demand
@@ -491,24 +501,19 @@ def melt_hourx_columns_csv(df):
        return df

    @staticmethod
-    def parse_date_strings_csv(df, datetime_format):
-        """Convert report_date into pandas Datetime types."""
+    def parse_date_strings_csv(csv):
+        """Convert report_date into pandas Datetime types.
+
+        Build the ``report_date`` column from the daily ``report_date`` date
+        strings and the integer ``hour`` column.
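+
+        A sketch with made-up values (the ``exact=False`` parse ignores any
+        trailing time-of-day text in the raw strings, then the integer hour is
+        added on as a timedelta):
+
+            >>> import pandas as pd
+            >>> date = pd.to_datetime("01/02/2006 00:00:00", format="%m/%d/%Y", exact=False)
+            >>> date + pd.to_timedelta(13, unit="h")
+            Timestamp('2006-01-02 13:00:00')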
+        """
         # Parse date strings
+        hour_timedeltas = {i: pd.to_timedelta(i, unit="h") for i in range(24)}
         # NOTE: Faster to ignore trailing 00:00:00 and use exact=False
-        df["report_date"] = pd.to_datetime(
-            df["report_date"], format=datetime_format, exact=False
-        )
-        # Assert that all respondents and years have complete and unique dates
-        all_dates = {
-            year: set(pd.date_range(f"{year}-01-01", f"{year}-12-31", freq="1D"))
-            for year in range(df["report_year"].min(), df["report_year"].max() + 1)
-        }
-        assert (  # nosec B101
-            df.groupby(["respondent_id_ferc714", "report_year"])
-            .apply(lambda x: set(x["report_date"]) == all_dates[x.name[1]])
-            .all()
-        )
-        return df
+        csv["report_date"] = pd.to_datetime(
+            csv["report_date"], format="%m/%d/%Y", exact=False
+        )
+        # Add the hour onto report_date now so it isn't lost downstream
+        csv["report_date"] += csv["hour"].map(hour_timedeltas)
+        return csv.drop(columns=["hour"])
 
     @staticmethod
     def remove_yearly_records_duration_xbrl(duration_xbrl):
@@ -590,8 +595,22 @@ def merge_instant_and_duration_tables_xbrl(
     def convert_dates_to_zero_offset_hours_xbrl(xbrl: pd.DataFrame) -> pd.DataFrame:
         """Convert all hours to: Hour (24-hour clock) as a zero-padded decimal number.
 
-        Some but not all of the records start with hour 0, while other start with hour 1.
-        It is not immediately clear whether or not hours 1-24 corresponds to 1-00 hours.
+        The FERC 714 form includes columns for the hours of each day. Those columns
+        are labeled 1-24 to indicate the hours of the day. The XBRL filings
+        themselves have a time-like string associated with each of the facts. They
+        include both the year-month-day portion (formatted as %Y-%m-%d) and an
+        hour-minute-second component (semi-formatted as T%H:%M:%S). Attempting to
+        simply convert this timestamp information to a datetime using the format
+        ``"%Y-%m-%dT%H:%M:%S"`` fails because about a third of the records include
+        hour 24, which is not an accepted hour in standard datetime formats.
+
+        The respondents that report hour 24 do not report hour 00. We have done some
+        spot checking of values reported to FERC and have determined that hour 24
+        seems to correspond with hour 00 (of the next day). We have not gotten
+        complete confirmation from FERC staff that this is always the case, but it
+        seems like a decent assumption.
+
+        So, this step converts all of the hour 24 records to be hour 00 of the next
+        day.
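+
+        A sketch of that conversion on a made-up timestamp string (illustrative
+        only; the actual implementation masks on ``str.contains("T24:")`` as
+        shown below rather than replacing wholesale):
+
+            >>> import pandas as pd
+            >>> s = pd.Series(["2020-12-31T24:00:00"])
+            >>> pd.to_datetime(s.str.replace("T24:", "T00:", regex=False)) + pd.Timedelta(days=1)
+            0   2021-01-01
+            dtype: datetime64[ns]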
         """
         bad_24_hour_mask = xbrl.report_date.str.contains("T24:")
 
@@ -606,9 +625,9 @@ def convert_dates_to_zero_seconds_xbrl(xbrl: pd.DataFrame) -> pd.DataFrame:
         """Convert the last second of the day records to the first (0) second of the next day.
 
         There are a small amount of records which report the last "hour" of the day
-        with as last second of the day, as opposed to T24 cleaned in
+        as the last second of the day, as opposed to T24 cleaned in
         :func:`convert_dates_to_zero_offset_hours_xbrl` or T00 which is standard for a
-        numpy datetime. This function finds these records and adds one second of them and
+        datetime. This function finds these records and adds one second to them and
         then ensures all of the records has 0's for seconds.
         """
         last_second_mask = xbrl.report_date.dt.second == 59
 
@@ -620,27 +639,23 @@ def convert_dates_to_zero_seconds_xbrl(xbrl: pd.DataFrame) -> pd.DataFrame:
         return xbrl
 
     @staticmethod
-    def ensure_dates_are_complete_and_unique_xbrl(df):
-        """Assert that almost all respondents and years have complete and unique dates.
+    def ensure_dates_are_continuous(df: pd.DataFrame, source: Literal["csv", "xbrl"]):
+        """Assert that almost all respondents have continuous timestamps.
 
-        We found 41 gaps in the timeseries!
+        In the xbrl data, we found 41 gaps in the timeseries! They are almost
+        entirely on the hour in which daylight saving time goes into effect.
+        The csv data had 10 gaps. Pretty good all in all!
         """
         df["gap"] = df[["respondent_id_ferc714", "report_date"]].sort_values(
             by=["respondent_id_ferc714", "report_date"]
         ).groupby("respondent_id_ferc714").diff() > pd.to_timedelta("1h")
-        if len(gappy_dates := df[df.gap]) > 41:
+        if len(gappy_dates := df[df.gap]) > (41 if source == "xbrl" else 10):
             raise AssertionError(
                 "We expect there to be nearly no gaps in the time series."
                 f"but we found these gaps:\n{gappy_dates}"
             )
         return df.drop(columns=["gap"])
 
-    @staticmethod
-    def parse_date_strings_xbrl(xbrl: pd.DataFrame) -> pd.DataFrame:
-        """Convert report_date into pandas Datetime types."""
-        xbrl = xbrl.astype({"report_date": "datetime64[ns]"})
-        return xbrl
-
     @staticmethod
     def standardize_offset_codes(df: pd.DataFrame, offset_fixes) -> pd.Series:
         """Convert to standardized UTC offset abbreviations.
@@ -693,15 +708,15 @@ def standardize_offset_codes(df: pd.DataFrame, offset_fixes) -> pd.Series:
     def clean_utc_code_offsets_and_set_timezone(df):
         """Clean UTC Codes and set timezone."""
         # NOTE: Assumes constant timezone for entire year
-        for fix in OFFSET_CODE_FIXES_BY_YEAR:
+        for fix in TIMEZONE_OFFSET_CODE_FIXES_BY_YEAR:
             mask = (df["report_year"] == fix["report_year"]) & (
                 df["respondent_id_ferc714"] == fix["respondent_id_ferc714"]
             )
             df.loc[mask, "utc_offset_code"] = fix["utc_offset_code"]
 
         # Replace UTC offset codes with UTC offset and timezone
-        df["utc_offset"] = df["utc_offset_code"].map(OFFSET_CODES)
-        df["timezone"] = df["utc_offset_code"].map(TZ_CODES)
+        df["utc_offset"] = df["utc_offset_code"].map(TIMEZONE_OFFSET_CODES)
+        df["timezone"] = df["utc_offset_code"].map(TIMEZONE_CODES)
         return df
 
     @staticmethod
@@ -726,17 +741,20 @@ def construct_utc_datetime(df: pd.DataFrame) -> pd.DataFrame:
         """Construct datetime_utc column."""
         # Construct UTC datetime
         logger.info("Converting local time + offset code to UTC + timezone.")
-        hour_timedeltas = {i: pd.to_timedelta(i, unit="h") for i in range(24)}
-        df["report_date"] += df["hour"].map(hour_timedeltas)
         df["datetime_utc"] = df["report_date"] - df["utc_offset"]
-        df = df.drop(columns=["hour", "utc_offset"])
+        df = df.drop(columns=["utc_offset"])
+        return df
 
-        # Report and drop duplicated UTC datetimes
+    @staticmethod
+    def ensure_non_duplicated_datetimes(df):
+        """Report and drop duplicated UTC datetimes."""
         # There should be less than 10 of these,
         # resulting from changes to a planning area's reporting timezone.
         duplicated = df.duplicated(["respondent_id_ferc714", "datetime_utc"])
-        # TODO: convert this into an error
-        logger.info(f"Found {np.count_nonzero(duplicated)} duplicate UTC datetimes.")
+        if (num_dupes := np.count_nonzero(duplicated)) > 10:
+            raise AssertionError(
+                f"Found {num_dupes} duplicate UTC datetimes, but we expected 10 or fewer."
+            )
         df = df.query("~@duplicated")
         return df
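
-- 
A rough sketch of the UTC construction that ``construct_utc_datetime`` performs,
with illustrative values (the -5 hour offset is what TIMEZONE_OFFSET_CODES maps
"EST" to); not part of the diff itself:

    >>> import pandas as pd
    >>> local = pd.Timestamp("2020-06-01 13:00:00")  # report_date + hour, local standard time
    >>> local - pd.Timedelta(-5, unit="hours")       # datetime_utc = report_date - utc_offset
    Timestamp('2020-06-01 18:00:00')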