Skip to content

Commit

Permalink
Merge pull request #19 from mila-aia/build_dataset
Browse files Browse the repository at this point in the history
Script to build dataset with labels and paths to SAC files
  • Loading branch information
basileroth75 committed Feb 16, 2023
2 parents 0b8112a + 09ae5e1 commit e3c0d2e
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 34 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ repos:
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.910
hooks:
- id: mypy
- id: mypy
additional_dependencies: ['types-pyyaml']
17 changes: 13 additions & 4 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,23 @@ whale_constant:
fw:
name: Fin Whale
window: 120
threshold_power_ratio: 3
threshold_detections: 5
window_size: 5
high_cut_bandpass: 32
low_cut_bandpass: 12
call_duration: 1
call_frequency: 10
stadir_name: stadir
bw:
name: Blue Whale
window: 720
threshold_power_ratio: 1.5
threshold_detections: 3
window_size: 40
high_cut_bandpass: 32
low_cut_bandpass: 10
call_duration: 8
call_frequency: 68
stadir_name: stadir_bw

paths:
whale_data_cluster: /network/projects/aia/whale_call
list_sac_files: /network/projects/aia/whale_call/SAC_FILES_RAW.txt
mat_file: /network/projects/aia/whale_call/MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ plotly==5.9.0
pysmo==0.8.0
obspy==1.3.0
matplotlib==3.6.3
wget==3.2
wget==3.2
types-pyyaml==6.0.12.6
190 changes: 162 additions & 28 deletions scripts/preprocess_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from pathlib import Path
import pandas as pd
import scipy.io
from datetime import timedelta
import random
import yaml


def read_stadir(mat_file: dict, whale_type: str) -> dict:
Expand Down Expand Up @@ -53,17 +56,68 @@ def preprocess_(dataset: pd.DataFrame, stadir: dict) -> pd.DataFrame:
return dataset


def read_list_raw_files(sac_file_path: str) -> pd.DataFrame:
    """Build a dataframe describing the SAC files listed in a text file.

    @param: sac_file_path (string): PATH to the txt file containing one
        absolute SAC file path per line.
    return: pd.DataFrame with one row per file and the columns
        year, month, day, network, station, empty, coordinates, SAC
        (parsed from the dot-separated file name), plus
        folder (the date-named parent directory of the file) and
        file_path (the original full path, unchanged).
    """
    # Read list of raw data files (one absolute path per line)
    with open(sac_file_path) as file:
        paths = pd.Series([line.rstrip("\n") for line in file])

    # Split each path once on "/": with the expected layout
    # /network/projects/aia/whale_call/<...>/<folder_date>/<file_name>
    # column 6 is the date-named folder and column 7 is the file name.
    path_parts = paths.str.split("/", n=7, expand=True)

    # The file name itself is dot-separated:
    # <year>.<month>.<day>.<network>.<station>..<coordinates>.SAC
    # (the double dot yields the "empty" column).
    list_files_detailed = path_parts[7].str.split(".", n=7, expand=True).rename(
        {
            0: "year",
            1: "month",
            2: "day",
            3: "network",
            4: "station",
            5: "empty",
            6: "coordinates",
            7: "SAC",
        },
        axis=1,
    )

    # Keep the parent folder and the full path so callers can join on
    # folder/station and still locate each file on disk.
    list_files_detailed["folder"] = path_parts[6]
    list_files_detailed["file_path"] = paths

    return list_files_detailed


def main() -> None:
""" """

# Arguments parsing
args = parse_args()

# Load config
with open("config/config.yml", "r") as file:
param_data = yaml.safe_load(file)

# Output
output_dir = Path(args.output_dir).expanduser().resolve()
labels_output = output_dir / "LABELS"
labels_output = Path(param_data["paths"]["whale_data_cluster"]) / "LABELS"

labels_output.mkdir(
parents=True, exist_ok=True
) # Create folder if not exist

# Input
input_file = Path(args.input_file)

Expand All @@ -82,51 +136,131 @@ def main() -> None:
"detection_id",
]

fw_calls = pd.DataFrame(mat["FWC"], columns=colnames_WC)
bw_calls = pd.DataFrame(mat["BWC"], columns=colnames_WC)
# Get list of files in dataframe
if args.bandpass_filter is True:
list_files_detailled = read_list_raw_files(
"/network/projects/aia/whale_call/SAC_FILES_RAW.txt"
)
else:
list_files_detailled = read_list_raw_files(
"/network/projects/aia/whale_call/SAC_FILES_FILT.txt"
)

# Load name and index of stations
stadir_fw = read_stadir(mat, "stadir")
stadir_bw = read_stadir(mat, "stadir_bw")
# Loop for the 2 whale types
for whale_type in ["bw", "fw"]:

# Preprocess
fw_ds = preprocess_(fw_calls, stadir_fw)
bw_ds = preprocess_(bw_calls, stadir_bw)
# Get data from matlab .mat matrix
df_calls = pd.DataFrame(
mat[whale_type.upper() + "C"], columns=colnames_WC
)

# Plot counts
print(
"Number of Fin Whale calls detected: {}".format(
fw_ds.detection_id.nunique()
# Load name and index of stations
stadir = read_stadir(
mat, param_data["whale_constant"][whale_type]["stadir_name"]
)
)
print(
"Number of Blue Whale calls detected: {}".format(
bw_ds.detection_id.nunique()

# Preprocess labels
labels = preprocess_(df_calls, stadir)

# Plot counts of calls
print(
"Number of {} calls detected: {}".format(
param_data["whale_constant"][whale_type]["name"],
labels.detection_id.nunique(),
)
)

# Add start and end time of calls
labels["time_call_start"] = pd.to_datetime(labels["datetime"])
labels["time_call_end"] = labels["time_call_start"] + timedelta(
seconds=param_data["whale_constant"][whale_type]["call_duration"]
)
)

# Save datasets
fw_ds.to_csv(labels_output / "fin_whales.csv")
bw_ds.to_csv(labels_output / "blue_whales.csv")
# Add random value to create start and end time of window

list_randoms = []
for _ in labels.index:
list_randoms.append(
random.uniform( # nosec
0,
param_data["whale_constant"][whale_type]["window_size"]
- param_data["whale_constant"][whale_type][
"call_duration"
],
)
)

labels["random_t"] = list_randoms
labels["time_window_start"] = labels.apply(
lambda x: x.time_call_start - timedelta(seconds=x.random_t), axis=1
)
labels["time_window_end"] = labels["time_window_start"] + timedelta(
seconds=5
)

# Reformat folder name
labels["folder_date"] = (
labels["date"].astype(str).apply(lambda x: "".join(x.split("-")))
)

# Add column with Whale type
labels["whale_type"] = whale_type

# Merge labels and SAC file PATHs to same dataframe
final_df = pd.merge(
labels,
list_files_detailled,
left_on=["folder_date", "station_name"],
right_on=["folder", "station"],
).rename(
columns={
"coordinates": "component",
"station": "station_code",
"detection_id": "group_id",
}
)

# Save results to dataframe
if args.bandpass_filter is True:
csv_name = whale_type + "_filt.csv"
else:
csv_name = whale_type + ".csv"

final_df[
[
"file_path",
"time_window_start",
"time_window_end",
"time_call_start",
"time_call_end",
"R",
"SNR",
"group_id",
"station_code",
"whale_type",
"component",
]
].to_csv(labels_output / csv_name, index=False)


def parse_args() -> Namespace:
"""Parse arguments"""
description = "Script for preprocessing the labels coming from .mat matrix"
arg_parser = ArgumentParser(
description=description, formatter_class=ArgumentDefaultsHelpFormatter
)

arg_parser.add_argument(
"--output_dir",
default="data/",
type=str,
help="path to the output directory",
"--bandpass_filter",
default=True,
type=bool,
help="True if you want to use data with applied bandpass filter",
)

arg_parser.add_argument(
"--input_file",
default="/network/projects/aia/whale_call/calls_data \
/WhaleDetectionsLSZ.mat",
default="/network/projects/aia/whale_call/"
+ "MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat",
type=str,
help="path to the input file",
)
Expand Down

0 comments on commit e3c0d2e

Please sign in to comment.