From 76f9057f69ed7db0c255da863ecd7d9eeaee210c Mon Sep 17 00:00:00 2001 From: Basile R Date: Wed, 15 Feb 2023 14:37:16 -0500 Subject: [PATCH 1/4] add: new parameters added --- config/config.yml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/config/config.yml b/config/config.yml index 60ed849..1ca0ad2 100644 --- a/config/config.yml +++ b/config/config.yml @@ -2,14 +2,23 @@ whale_constant: fw: name: Fin Whale window: 120 - threshold_power_ratio: 3 - threshold_detections: 5 + window_size: 5 high_cut_bandpass: 32 low_cut_bandpass: 12 + call_duration: 1 + call_frequency: 10 + stadir_name: stadir bw: name: Blue Whale window: 720 - threshold_power_ratio: 1.5 - threshold_detections: 3 + window_size: 40 high_cut_bandpass: 32 low_cut_bandpass: 10 + call_duration: 8 + call_frequency: 68 + stadir_name: stadir_bw + +paths: + whale_data_cluster: /network/projects/aia/whale_call + list_sac_files: /network/projects/aia/whale_call/SAC_FILES_RAW.txt + mat_file: /network/projects/aia/whale_call/MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat From f8b3547541df193115d4999d50d6318846ef0d72 Mon Sep 17 00:00:00 2001 From: Basile R Date: Wed, 15 Feb 2023 17:18:40 -0500 Subject: [PATCH 2/4] feat: build dataset using labels and sac file paths --- .pre-commit-config.yaml | 3 +- scripts/preprocess_labels.py | 190 +++++++++++++++++++++++++++++------ 2 files changed, 164 insertions(+), 29 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8cdaae2..fa57416 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,4 +27,5 @@ repos: - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.910 hooks: - - id: mypy \ No newline at end of file + - id: mypy + additional_dependencies: ['types-pyyaml'] \ No newline at end of file diff --git a/scripts/preprocess_labels.py b/scripts/preprocess_labels.py index 3eaafa3..b579cdc 100644 --- a/scripts/preprocess_labels.py +++ b/scripts/preprocess_labels.py @@ -2,6 +2,9 @@ from pathlib import Path import pandas as pd import scipy.io +from datetime import timedelta +import random +import yaml def read_stadir(mat_file: dict, whale_type: str) -> dict: @@ -53,17 +56,68 @@ def preprocess_(dataset: pd.DataFrame, stadir: dict) -> pd.DataFrame: return dataset +def read_list_raw_files(sac_file_path: str) -> pd.DataFrame: + """Build a dataframe with the list of the SAC files + + @param: sac_file_path (string): PATH to the txt file containing the list + + return: pd.Dataframe with 2 columns: folder containing files, and PATH + """ + # Read list of raw data files + with open(sac_file_path) as file: + list_files = file.readlines() + list_files = [line.rstrip("\n") for line in list_files] + list_files_df = pd.Series(list_files) + + # Create df from list of files + list_files_detailled = pd.DataFrame( + list_files_df.str.split("/", n=7, expand=True)[[6, 7]] + ).rename(columns={6: "folder_date", 7: "file_name"}) + + # Rename columns + list_files_detailled = list_files_detailled["file_name"].str.split( + ".", n=7, expand=True + ) + list_files_detailled = list_files_detailled.rename( + { + 0: "year", + 1: "month", + 2: "day", + 3: "network", + 4: "station", + 5: "empty", + 6: "coordinates", + 7: "SAC", + }, + axis=1, + ) + # Extract folder name + list_files_detailled["folder"] = list_files_df.str.split( + "/", n=7, expand=True + )[[6]] + # Add list of names to path + list_files_detailled["file_path"] = list_files_df + + return list_files_detailled + + def main() -> None: """ """ # Arguments parsing args = parse_args() + + # Load config + with open("config/config.yml", "r") as file: + param_data = yaml.safe_load(file) + # Output - output_dir = Path(args.output_dir).expanduser().resolve() - labels_output = output_dir / "LABELS" + labels_output = Path(param_data["paths"]["whale_data_cluster"]) / "LABELS" + labels_output.mkdir( parents=True, exist_ok=True ) # Create folder if not exist + # Input input_file = Path(args.input_file) @@ -82,51 +136,131 @@ def main() -> None: "detection_id", ] - fw_calls = pd.DataFrame(mat["FWC"], columns=colnames_WC) - bw_calls = pd.DataFrame(mat["BWC"], columns=colnames_WC) + # Get list of files in dataframe + if args.bandpass_filter is True: + list_files_detailled = read_list_raw_files( + "/network/projects/aia/whale_call/SAC_FILES_RAW.txt" + ) + else: + list_files_detailled = read_list_raw_files( + "/network/projects/aia/whale_call/SAC_FILES_FILT.txt" + ) - # Load name and index of stations - stadir_fw = read_stadir(mat, "stadir") - stadir_bw = read_stadir(mat, "stadir_bw") + # Loop for the 2 whale types + for whale_type in ["bw", "fw"]: - # Preprocess - fw_ds = preprocess_(fw_calls, stadir_fw) - bw_ds = preprocess_(bw_calls, stadir_bw) + # Get data from matlab .mat matrix + df_calls = pd.DataFrame( + mat[whale_type.upper() + "C"], columns=colnames_WC + ) - # Plot counts - print( - "Number of Fin Whale calls detected: {}".format( - fw_ds.detection_id.nunique() + # Load name and index of stations + stadir = read_stadir( + mat, param_data["whale_constant"][whale_type]["stadir_name"] ) - ) - print( - "Number of Blue Whale calls detected: {}".format( - bw_ds.detection_id.nunique() + + # Preprocess labels + labels = preprocess_(df_calls, stadir) + + # Plot counts of calls + print( + "Number of {} calls detected: {}".format( + param_data["whale_constant"][whale_type]["name"], + labels.detection_id.nunique(), + ) + ) + + # Add start and end time of calls + labels["time_call_start"] = pd.to_datetime(labels["datetime"]) + labels["time_call_end"] = labels["time_call_start"] + timedelta( + seconds=param_data["whale_constant"][whale_type]["call_duration"] ) - ) - # Save datasets - fw_ds.to_csv(labels_output / "fin_whales.csv") - bw_ds.to_csv(labels_output / "blue_whales.csv") + # Add random value to create start and end time of window + + list_randoms = [] + for _ in labels.index: + list_randoms.append( + random.randint( # nosec + 1, + param_data["whale_constant"][whale_type]["window_size"] + - param_data["whale_constant"][whale_type][ + "call_duration" + ], + ) + ) + + labels["random_t"] = list_randoms + labels["time_window_start"] = labels.apply( + lambda x: x.time_call_start - timedelta(seconds=x.random_t), axis=1 + ) + labels["time_window_end"] = labels["time_window_start"] + timedelta( + seconds=5 + ) + + # Reformat folder name + labels["folder_date"] = ( + labels["date"].astype(str).apply(lambda x: "".join(x.split("-"))) + ) + + # Add column with Whale type + labels["whale_type"] = whale_type + + # Merge labels and SAC file PATHs to same dataframe + final_df = pd.merge( + labels, + list_files_detailled, + left_on=["folder_date", "station_name"], + right_on=["folder", "station"], + ).rename( + columns={ + "coordinates": "component", + "station": "station_code", + "detection_id": "group_id", + } + ) + + # Save results to dataframe + if args.bandpass_filter is True: + csv_name = whale_type + "_filt.csv" + else: + csv_name = whale_type + ".csv" + + final_df[ + [ + "file_path", + "time_window_start", + "time_window_end", + "time_call_start", + "time_call_end", + "R", + "SNR", + "group_id", + "station_code", + "whale_type", + "component", + ] + ].to_csv(labels_output / csv_name, index=False) def parse_args() -> Namespace: + """Parse arguments""" description = "Script for preprocessing the labels coming from .mat matrix" arg_parser = ArgumentParser( description=description, formatter_class=ArgumentDefaultsHelpFormatter ) arg_parser.add_argument( - "--output_dir", - default="data/", - type=str, - help="path to the output directory", + "--bandpass_filter", + default=True, + type=bool, + help="True if you want to use data with applied bandpass filter", ) arg_parser.add_argument( "--input_file", - default="/network/projects/aia/whale_call/calls_data \ - /WhaleDetectionsLSZ.mat", + default="/network/projects/aia/whale_call/" + + "MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat", type=str, help="path to the input file", ) From 2599239fc15bfe234fc862424a9f11089edfbd4b Mon Sep 17 00:00:00 2001 From: Basile R Date: Wed, 15 Feb 2023 17:34:48 -0500 Subject: [PATCH 3/4] fix: added yaml lib --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 039f424..7bf7e90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ plotly==5.9.0 pysmo==0.8.0 obspy==1.3.0 matplotlib==3.6.3 -wget==3.2 \ No newline at end of file +wget==3.2 +types-pyyaml==6.0.12.6 From 09ae5e1ab996369a514630f7163f87aa1ef6ff71 Mon Sep 17 00:00:00 2001 From: Basile R Date: Thu, 16 Feb 2023 10:26:07 -0500 Subject: [PATCH 4/4] fix: random generator fixed for building time windows --- scripts/preprocess_labels.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/preprocess_labels.py b/scripts/preprocess_labels.py index b579cdc..4fa6851 100644 --- a/scripts/preprocess_labels.py +++ b/scripts/preprocess_labels.py @@ -181,8 +181,8 @@ def main() -> None: list_randoms = [] for _ in labels.index: list_randoms.append( - random.randint( # nosec - 1, + random.uniform( # nosec + 0, param_data["whale_constant"][whale_type]["window_size"] - param_data["whale_constant"][whale_type][ "call_duration"