Merge pull request #145 from iantei/Introduce_Inferred_Labels_for_Sta…

…cked_Bar_Charts Introduce Inferred Labels for Stacked Bar Charts
e-mission · Sep 22, 2024 · 88180dc · 88180dc
2 parents fc5afdd + 1a5abfb
commit 88180dc
Show file tree

Hide file tree

Showing 3 changed files with 263 additions and 56 deletions.
diff --git a/viz_scripts/generic_metrics.ipynb b/viz_scripts/generic_metrics.ipynb
@@ -141,6 +141,31 @@
     "                                                                            sensed_algo_prefix)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "325e5eda",
+   "metadata": {},
+   "source": [
+    "## Collect Data from Database for Inferred Metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c26ff5f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "expanded_ct_inferred, file_suffix_inferred, quality_text_inferred, debug_df_inferred = await scaffolding.load_viz_notebook_inferred_data(year,\n",
+    "                                                                            month,\n",
+    "                                                                            program,\n",
+    "                                                                            study_type,\n",
+    "                                                                            dynamic_labels,\n",
+    "                                                                            dic_re,\n",
+    "                                                                            dic_pur=dic_pur,\n",
+    "                                                                            include_test_users=include_test_users)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -172,9 +197,14 @@
     "labeled_match = re.match(r'Based on ([0-9]+) confirmed trips from ([0-9]+) (users|testers and participants)\\nof ([0-9]+) total  trips from ([0-9]+) (users|testers and participants) (\\(([0-9.]+|nan)%\\))', quality_text)\n",
     "# labeled_match\n",
     "stacked_bar_quality_text_labeled = f\"{labeled_match.group(1)} trips {labeled_match.group(7)}\\n from {labeled_match.group(2)} {labeled_match.group(3)}\"\n",
+    "\n",
     "sensed_match = re.match(r'Based on ([0-9]+) trips from ([0-9]+) (users|testers and participants)', quality_text_sensed)\n",
     "stacked_bar_quality_text_sensed = f\"{sensed_match.group(1)} trips (100%)\\n from {sensed_match.group(2)} {sensed_match.group(3)}\"\n",
-    "stacked_bar_quality_text_labeled, stacked_bar_quality_text_sensed"
+    "\n",
+    "inferred_match = re.match(r'Based on ([0-9]+) confirmed trips from ([0-9]+) (users|testers and participants)\\nof ([0-9]+) total  trips from ([0-9]+) (users|testers and participants) (\\(([0-9.]+|nan)%\\))', quality_text_inferred)\n",
+    "stacked_bar_quality_text_inferred = f\"{inferred_match.group(1)} trips {inferred_match.group(7)}\\n from {inferred_match.group(2)} {inferred_match.group(3)}\"\n",
+    "\n",
+    "stacked_bar_quality_text_labeled, stacked_bar_quality_text_sensed, stacked_bar_quality_text_inferred"
    ]
   },
   {
@@ -204,15 +234,16 @@
     "plot_title_no_quality= \"Number of trips for each mode\"\n",
     "\n",
     "try:\n",
-    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)\n",
+    "    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,3*2), sharex=True)\n",
     "    # We will have text results corresponding to the axes for simplicity and consistency\n",
-    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
+    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
     "    \n",
     "    plot_and_text_stacked_bar_chart(expanded_ct, lambda df: (df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False)), \n",
     "                                    \"Labeled by user\\n\"+stacked_bar_quality_text_labeled, ax[0], text_results[0], colors_mode, debug_df, values_to_translations)\n",
+    "    plot_and_text_stacked_bar_chart(expanded_ct_inferred, lambda df: (df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False)), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+stacked_bar_quality_text_inferred, ax[1], text_results[1], colors_mode, debug_df_inferred, values_to_translations)\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct_sensed, lambda df: (df.groupby(\"primary_mode\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False)), \n",
-    "                                    \"Sensed by OpenPATH\\n\"+stacked_bar_quality_text_sensed, ax[1], text_results[1], colors_sensed, debug_df_sensed)\n",
-    "    \n",
+    "                                    \"Sensed by OpenPATH\\n\"+stacked_bar_quality_text_sensed, ax[2], text_results[2], colors_sensed, debug_df_sensed)\n",
     "    set_title_and_save(fig, text_results, plot_title_no_quality, file_name)\n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    plt.clf()\n",
@@ -254,13 +285,23 @@
     "\n",
     "    expanded_ct_commute = expanded_ct.query(trip_purpose_query)\n",
     "    commute_quality_text = scaffolding.get_quality_text(expanded_ct, expanded_ct_commute, \"commute\", include_test_users) if not expanded_ct.empty else \"\"\n",
-    "    plot_title = plot_title_no_quality + \"\\n\" + commute_quality_text\n",
-    "    \n",
+    "    expanded_ct_inferred_commute = expanded_ct_inferred.query(trip_purpose_query)\n",
+    "    commute_quality_text_inferred = scaffolding.get_quality_text(expanded_ct_inferred, expanded_ct_inferred_commute, \"commute\", include_test_users) if not expanded_ct_inferred.empty else \"\"\n",
+    "    plot_title = plot_title_no_quality\n",
+    "\n",
+    "    commute_labeled_match = re.match(r'Based on ([0-9]+) confirmed commute trips from ([0-9]+) (users|testers and participants)\\nof ([0-9]+) total confirmed trips from ([0-9]+) (users|testers and participants) (\\(([0-9.]+|nan)%\\))', commute_quality_text)\n",
+    "    stacked_bar_quality_text_commute_labeled = f\"{commute_labeled_match.group(1)} trips {commute_labeled_match.group(7)}\\n from {commute_labeled_match.group(2)} {commute_labeled_match.group(3)}\"\n",
+    "\n",
+    "    commute_inferred_match = re.match(r'Based on ([0-9]+) confirmed commute trips from ([0-9]+) (users|testers and participants)\\nof ([0-9]+) total confirmed trips from ([0-9]+) (users|testers and participants) (\\(([0-9.]+|nan)%\\))', commute_quality_text_inferred)\n",
+    "    stacked_bar_quality_text_commute_inferred = f\"{commute_inferred_match.group(1)} trips {commute_inferred_match.group(7)}\\n from {commute_inferred_match.group(2)} {commute_inferred_match.group(3)}\"\n",
+    "\n",
     "    # Plot entries\n",
-    "    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,2*1), sharex=True)    \n",
-    "    text_results = [\"Unmodified Alt Text\", \"Unmodified HTML\"]\n",
+    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)    \n",
+    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct_commute, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
-    "                                    \"Labeled by user\\n (Confirmed trips)\", ax, text_results, colors_mode, debug_df, values_to_translations)\n",
+    "                                    \"Labeled by user\\n\"+stacked_bar_quality_text_commute_labeled, ax[0], text_results[0], colors_mode, debug_df, values_to_translations)\n",
+    "    plot_and_text_stacked_bar_chart(expanded_ct_inferred_commute, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+stacked_bar_quality_text_commute_inferred, ax[1], text_results[1], colors_mode, debug_df_inferred, values_to_translations)\n",
     "    set_title_and_save(fig, text_results, plot_title, file_name)\n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    plt.clf()\n",
@@ -290,10 +331,12 @@
     "plot_title_no_quality=\"Number of trips for each purpose\"\n",
     "file_name= f\"ntrips_purpose{file_suffix}\"\n",
     "try:\n",
-    "    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15,2*1), sharex=True)\n",
-    "    text_results = [\"Unmodified Alt Text\", \"Unmodified HTML\"]\n",
+    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)\n",
+    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct, lambda df: df.groupby(\"purpose_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
-    "                                    \"Labeled by user\\n\"+stacked_bar_quality_text_labeled, ax, text_results, colors_purpose, debug_df, value_to_translations_purpose)\n",
+    "                                    \"Labeled by user\\n\"+stacked_bar_quality_text_labeled, ax[0], text_results[0], colors_purpose, debug_df, value_to_translations_purpose)\n",
+    "    plot_and_text_stacked_bar_chart(expanded_ct_inferred, lambda df: df.groupby(\"purpose_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+stacked_bar_quality_text_inferred, ax[1], text_results[1], colors_purpose, debug_df_inferred, value_to_translations_purpose)\n",
     "    set_title_and_save(fig, text_results, plot_title_no_quality, file_name)\n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    plt.clf()\n",
@@ -337,17 +380,22 @@
     "    ## We do an existence check for the labeled df because we want to display the sensed value even if we don't have the labeled value\n",
     "    ## but we don't need to have an existence check for sensed because in that case we will have no data to display\n",
     "    expanded_ct_u80 = expanded_ct.loc[(expanded_ct['distance'] <= cutoff)] if \"mode_confirm_w_other\" in expanded_ct.columns else None\n",
+    "    expanded_ct_inferred_u80 = expanded_ct_inferred.loc[(expanded_ct_inferred['distance'] <= cutoff)] if \"mode_confirm_w_other\" in expanded_ct_inferred.columns else None\n",
     "    expanded_ct_sensed_u80 = expanded_ct_sensed.loc[(expanded_ct_sensed['distance'] <= cutoff)]\n",
+    "\n",
     "    sensed_u80_quality_text = f\"{len(expanded_ct_sensed_u80)} trips ({round(len(expanded_ct_sensed_u80)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(expanded_ct_sensed_u80)} {sensed_match.group(3)}\"\n",
     "    labeled_u80_quality_text = f\"{len(expanded_ct_u80)} trips ({round(len(expanded_ct_u80)/len(expanded_ct)*100)}% of all labeled,\\n{round(len(expanded_ct_u80)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(expanded_ct_u80)} {sensed_match.group(3)}\" if \"Mode_confirm\" in expanded_ct.columns else \"0 labeled trips\"\n",
-    "    \n",
+    "    inferred_u80_quality_text = f\"{len(expanded_ct_inferred_u80)} trips ({round(len(expanded_ct_inferred_u80)/len(expanded_ct_inferred)*100)}% of all inferred,\\n{round(len(expanded_ct_inferred_u80)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(expanded_ct_inferred_u80)} {sensed_match.group(3)}\" if \"Mode_confirm\" in expanded_ct_inferred.columns else \"0 inferred trips\"\n",
+    "\n",
     "    # Plot entries\n",
-    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)\n",
-    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
+    "    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,3*2), sharex=True)\n",
+    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct_u80, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
     "                                    \"Labeled by user\\n\"+labeled_u80_quality_text, ax[0], text_results[0], colors_mode, debug_df, values_to_translations)\n",
+    "    plot_and_text_stacked_bar_chart(expanded_ct_inferred_u80, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+inferred_u80_quality_text, ax[1], text_results[1], colors_mode, debug_df_inferred, values_to_translations)\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct_sensed_u80, lambda df: df.groupby(\"primary_mode\").agg({distance_col: 'count'}).sort_values(by=distance_col, ascending=False), \n",
-    "                                    \"Sensed by OpenPATH\\n\"+sensed_u80_quality_text, ax[1], text_results[1], colors_sensed, debug_df_sensed)\n",
+    "                                    \"Sensed by OpenPATH\\n\"+sensed_u80_quality_text, ax[2], text_results[2], colors_sensed, debug_df_sensed)\n",
     "    set_title_and_save(fig, text_results, plot_title_no_quality, file_name)\n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    # we can have an missing attribute error during the pre-procssing, in which case we should show the missing plot\n",
@@ -381,13 +429,15 @@
     "file_name =f\"total_trip_length{file_suffix}\"\n",
     "\n",
     "try:\n",
-    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)\n",
+    "    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,3*2), sharex=True)\n",
     "    \n",
-    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
+    "    text_results = [[\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"], [\"Unmodified Alt Text\", \"Unmodified HTML\"]]\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
     "                                    \"Labeled by user\\n\"+stacked_bar_quality_text_labeled, ax[0], text_results[0], colors_mode, debug_df, values_to_translations)\n",
+    "    plot_and_text_stacked_bar_chart(expanded_ct_inferred, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+stacked_bar_quality_text_inferred, ax[1], text_results[1], colors_mode, debug_df_inferred, values_to_translations)\n",
     "    plot_and_text_stacked_bar_chart(expanded_ct_sensed, lambda df: df.groupby(\"primary_mode\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
-    "                                    \"Sensed by OpenPATH\\n\"+stacked_bar_quality_text_sensed, ax[1], text_results[1], colors_sensed, debug_df_sensed)\n",
+    "                                    \"Sensed by OpenPATH\\n\"+stacked_bar_quality_text_sensed, ax[2], text_results[2], colors_sensed, debug_df_sensed)\n",
     "    set_title_and_save(fig, text_results, plot_title_no_quality, file_name)    \n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    plt.clf()\n",
@@ -421,16 +471,20 @@
     "    ## We do an existence check for the labeled df because we want to display the sensed value even if we don't have the labeled value\n",
     "    ## but we don't need to have an existence check for sensed because in that case we will have no data to display\n",
     "    labeled_land_trips_df = expanded_ct[expanded_ct['mode_confirm_w_other'] != \"air\"] if \"mode_confirm_w_other\" in expanded_ct.columns else None\n",
+    "    inferred_land_trips_df = expanded_ct_inferred[expanded_ct_inferred['mode_confirm_w_other'] != \"air\"] if \"mode_confirm_w_other\" in expanded_ct_inferred.columns else None\n",
     "    sensed_land_trips_df = expanded_ct_sensed[expanded_ct_sensed['primary_mode'] != \"AIR_OR_HSR\"]\n",
     "    \n",
     "    sensed_land_quality_text = f\"{len(sensed_land_trips_df)} trips ({round(len(sensed_land_trips_df)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(sensed_land_trips_df)} {sensed_match.group(3)}\"\n",
-    "    labeled_land_quality_text = f\"{len(labeled_land_trips_df)} trips ({round(len(labeled_land_trips_df)/len(expanded_ct)*100)}% of all labeled,\\n{round(len(labeled_land_trips_df)/len(expanded_ct_sensed)*100)}%) of all trips)\\nfrom {scaffolding.unique_users(labeled_land_trips_df)} {sensed_match.group(3)}\" if \"mode_confirm_w_other\" in expanded_ct.columns else \"0 labeled trips\"\n",
-    "\n",
-    "    fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15,2*2), sharex=True)\n",
+    "    labeled_land_quality_text = f\"{len(labeled_land_trips_df)} trips ({round(len(labeled_land_trips_df)/len(expanded_ct)*100)}% of all labeled,\\n{round(len(labeled_land_trips_df)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(labeled_land_trips_df)} {sensed_match.group(3)}\" if \"Mode_confirm\" in expanded_ct.columns else \"0 labeled trips\"\n",
+    "    inferred_land_quality_text = f\"{len(inferred_land_trips_df)} trips ({round(len(inferred_land_trips_df)/len(expanded_ct_inferred)*100)}% of all inferred,\\n{round(len(inferred_land_trips_df)/len(expanded_ct_sensed)*100)}% of all trips)\\nfrom {scaffolding.unique_users(inferred_land_trips_df)} {sensed_match.group(3)}\" if \"Mode_confirm\" in expanded_ct_inferred.columns else \"0 inferred trips\"\n",
+    "    \n",
+    "    fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(15,3*2), sharex=True)\n",
     "    plot_and_text_stacked_bar_chart(labeled_land_trips_df, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
     "                                    \"Labeled by user\\n\"+labeled_land_quality_text,  ax[0], text_results[0], colors_mode, debug_df, values_to_translations)\n",
+    "    plot_and_text_stacked_bar_chart(inferred_land_trips_df, lambda df: df.groupby(\"mode_confirm_w_other\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
+    "                                    \"Labeled and Inferred by OpenPATH\\n\"+inferred_land_quality_text, ax[1], text_results[1], colors_mode, debug_df_inferred, values_to_translations)\n",
     "    plot_and_text_stacked_bar_chart(sensed_land_trips_df, lambda df: df.groupby(\"primary_mode\").agg({distance_col: 'sum'}).sort_values(by=distance_col, ascending=False), \n",
-    "                                    \"Sensed by OpenPATH\\n\"+sensed_land_quality_text, ax[1], text_results[1], colors_sensed, debug_df_sensed)\n",
+    "                                    \"Sensed by OpenPATH\\n\"+sensed_land_quality_text, ax[2], text_results[2], colors_sensed, debug_df_sensed)\n",
     "    set_title_and_save(fig, text_results, plot_title_no_quality, file_name)    \n",
     "except (AttributeError, KeyError, pd.errors.UndefinedVariableError) as e:\n",
     "    plt.clf()\n",