Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to Splink v4.0 #3834

Merged
merged 3 commits into from
Sep 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions environments/conda-linux-64.lock.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 30 additions & 30 deletions environments/conda-lock.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions environments/conda-osx-64.lock.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions environments/conda-osx-arm64.lock.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ dependencies = [
"scikit-learn>=1.5",
"scipy>=1.14",
"Shapely>=2",
"splink>=3.9.14,<4", # Need to update PUDL to use new Splink v4 API. See issue #3735
"splink>=4",
"sphinx>=7.4.4",
"sphinx-autoapi>=3",
"sphinx-issues>=1.2",
Expand Down
55 changes: 26 additions & 29 deletions src/pudl/analysis/record_linkage/eia_ferc1_model_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,9 @@
model.
"""

import splink.duckdb.comparison_level_library as cll
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl
from splink.duckdb.blocking_rule_library import block_on
import splink.comparison_level_library as cll
import splink.comparison_library as cl
from splink import block_on
Comment on lines -8 to +10
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was a big namespace reorganization in v4. Same modules and functions, but imported from different places.


blocking_rule_1 = "l.report_year = r.report_year and substr(l.plant_name_mphone,1,3) = substr(r.plant_name_mphone,1,3)"
blocking_rule_2 = "l.report_year = r.report_year and substr(l.utility_name_mphone,1,2) = substr(r.utility_name_mphone,1,2) and substr(l.plant_name_mphone,1,2) = substr(r.plant_name_mphone,1,2)"
Expand All @@ -19,7 +18,7 @@
blocking_rule_7 = "l.report_year = r.report_year and l.capacity_mw = r.capacity_mw and substr(l.plant_name_mphone,1,2) = substr(r.plant_name_mphone,1,2)"
blocking_rule_8 = "l.report_year = r.report_year and l.installation_year = r.installation_year and substr(l.plant_name_mphone,1,2) = substr(r.plant_name_mphone,1,2)"
blocking_rule_9 = "l.report_year = r.report_year and l.construction_year = r.construction_year and substr(l.plant_name_mphone,1,2) = substr(r.plant_name_mphone,1,2)"
# Splink v4 signature: block_on() takes column names as positional args
# (the v3 form took a single list argument).
blocking_rule_10 = block_on("report_year", "net_generation_mwh")
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change in the function signature. Takes an arbitrary number of positional args now.

BLOCKING_RULES = [
blocking_rule_1,
blocking_rule_2,
Expand All @@ -33,59 +32,57 @@
blocking_rule_10,
]

# Fuzzy comparison of plant names: graded Jaro-Winkler similarity levels,
# no Damerau-Levenshtein levels. (Splink v4 replaced the v3
# ctl.name_comparison() function with the cl.NameComparison class.)
plant_name_comparison = cl.NameComparison(
    "plant_name",
    damerau_levenshtein_thresholds=[],
    jaro_winkler_thresholds=[0.9, 0.8, 0.7],
)
# Same graded fuzzy comparison for utility names.
utility_name_comparison = cl.NameComparison(
    "utility_name",
    damerau_levenshtein_thresholds=[],
    jaro_winkler_thresholds=[0.9, 0.8, 0.7],
)
# In v4, options common to all comparison types (like term frequency
# adjustments) are set via .configure() rather than a constructor kwarg.
utility_name_comparison.configure(term_frequency_adjustments=True)
fuel_type_code_pudl_comparison = cl.ExactMatch("fuel_type_code_pudl")
fuel_type_code_pudl_comparison.configure(term_frequency_adjustments=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a funny change to me, but the .configure() method is used to set attributes which are common to all the different kinds of match classes now.


# Custom graded comparison of plant capacity, from near-exact (0.01%) out to
# 20% relative difference. Uses the Splink v4 comparison-level classes
# (NullLevel / PercentageDifferenceLevel / ElseLevel) in place of the v3
# snake_case factory functions.
capacity_comparison = {
    "output_column_name": "capacity_mw",
    "comparison_levels": [
        cll.NullLevel("capacity_mw"),
        # 1e-4 tolerance stands in for an exact match on floating point values.
        cll.PercentageDifferenceLevel(
            "capacity_mw",
            0.0 + 1e-4,
        ),
        cll.PercentageDifferenceLevel("capacity_mw", 0.05),
        cll.PercentageDifferenceLevel("capacity_mw", 0.1),
        cll.PercentageDifferenceLevel("capacity_mw", 0.2),
        cll.ElseLevel(),
    ],
    "comparison_description": "0% different vs. 5% different vs. 10% different vs. 20% different vs. anything else",
}

# Custom graded comparison of net generation, with a tighter 1% level than the
# capacity comparison since generation values vary more. Splink v4 class-based
# comparison levels.
net_gen_comparison = {
    "output_column_name": "net_generation_mwh",
    "comparison_levels": [
        cll.NullLevel("net_generation_mwh"),
        cll.PercentageDifferenceLevel(
            "net_generation_mwh", 0.0 + 1e-4
        ),  # could add an exact match level too
        cll.PercentageDifferenceLevel("net_generation_mwh", 0.01),
        cll.PercentageDifferenceLevel("net_generation_mwh", 0.1),
        cll.PercentageDifferenceLevel("net_generation_mwh", 0.2),
        cll.ElseLevel(),
    ],
    "comparison_description": "0% different vs. 1% different vs. 10% different vs. 20% different vs. anything else",
}


def get_date_comparison(column_name):
    """Get a graded date comparison for the named column.

    Matches dates at 1-year and 2-year difference levels. Splink v4 replaced
    the v3 ctl.date_comparison() function with the DateOfBirthComparison
    class (despite the name, it is the general-purpose date comparison).

    Args:
        column_name: name of the date column to compare.

    Returns:
        A Splink v4 comparison object for use in a settings dictionary.
    """
    return cl.DateOfBirthComparison(
        column_name,
        # The compared columns hold Date/DateTime values, not date strings.
        # NOTE(review): per the PR discussion this was only validated by a
        # successful ETL run — confirm against the actual column dtype.
        input_is_string=False,
        datetime_thresholds=[1, 2],
        datetime_metrics=["year", "year"],
    )


Expand Down
36 changes: 19 additions & 17 deletions src/pudl/analysis/record_linkage/eia_ferc1_record_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
that links together several thousand EIA and FERC plant records. This trained model is
used to predict matches on the full dataset (see :func:`get_model_predictions`) using a
threshold match probability to predict if records are a match or not.

The model can return multiple EIA match options for each FERC1 record, so we rank the
matches and choose the one with the highest score. Any matches identified by the model
which are in conflict with our training data are overwritten with the manually
Expand All @@ -36,7 +37,7 @@
import numpy as np
import pandas as pd
from dagster import Out, graph, op
from splink.duckdb.linker import DuckDBLinker
from splink import DuckDBAPI, Linker, SettingsCreator

import pudl
from pudl.analysis.ml_tools import experiment_tracking, models
Expand Down Expand Up @@ -211,29 +212,30 @@ def get_training_data_df(inputs):
@op
def get_model_predictions(eia_df, ferc_df, train_df, experiment_tracker):
    """Train a Splink model and output predicted matches.

    Builds a Splink v4 ``SettingsCreator``/``Linker`` (the v3 ``DuckDBLinker``
    with a settings dict was removed in v4), estimates the u parameters by
    random sampling and the m parameters from the manually labeled training
    pairs, then predicts EIA<->FERC1 record matches above a 0.9 match
    probability threshold.

    Args:
        eia_df: EIA input records, one row per record_id.
        ferc_df: FERC1 input records, one row per record_id.
        train_df: manually labeled pairwise training matches.
        experiment_tracker: mlflow experiment tracker used to log parameters.

    Returns:
        Pandas dataframe of predicted record pairs and their match scores.
    """
    settings = SettingsCreator(
        link_type="link_only",
        unique_id_column_name="record_id",
        additional_columns_to_retain=["plant_id_pudl", "utility_id_pudl"],
        comparisons=COMPARISONS,
        blocking_rules_to_generate_predictions=BLOCKING_RULES,
        retain_matching_columns=True,
        retain_intermediate_calculation_columns=True,
        # Prior that a random EIA/FERC pair matches: each FERC record has
        # roughly one true counterpart among the EIA records.
        probability_two_random_records_match=(1.0 / len(eia_df)),
    )
    # v4 Linker is backend-agnostic; the DuckDB backend is injected via db_api.
    linker = Linker(
        [eia_df, ferc_df],
        settings=settings,
        input_table_aliases=["eia_df", "ferc_df"],
        db_api=DuckDBAPI(),
    )
    # v4 moved these methods onto namespaced accessors
    # (table_management / training / inference).
    linker.table_management.register_table(train_df, "training_labels", overwrite=True)
    linker.training.estimate_u_using_random_sampling(max_pairs=1e7)
    linker.training.estimate_m_from_pairwise_labels("training_labels")
    threshold_prob = 0.9
    experiment_tracker.execute_logging(
        lambda: mlflow.log_params({"threshold match probability": threshold_prob})
    )
    preds_df = linker.inference.predict(threshold_match_probability=threshold_prob)
    return preds_df.as_pandas_dataframe()


Expand Down
Loading