Skip to content

Commit

Permalink
Merge pull request #19 from mila-aia/build_dataset
Browse files Browse the repository at this point in the history
Script to build dataset with labels and paths to SAC files
  • Loading branch information
basileroth75 committed Feb 16, 2023
2 parents 0b8112a + 09ae5e1 commit e3c0d2e
Show file tree
Hide file tree
Showing 4 changed files with 179 additions and 34 deletions.
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,5 @@ repos:
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.910
hooks:
- id: mypy
- id: mypy
additional_dependencies: ['types-pyyaml']
17 changes: 13 additions & 4 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,23 @@ whale_constant:
fw:
name: Fin Whale
window: 120
threshold_power_ratio: 3
threshold_detections: 5
window_size: 5
high_cut_bandpass: 32
low_cut_bandpass: 12
call_duration: 1
call_frequency: 10
stadir_name: stadir
bw:
name: Blue Whale
window: 720
threshold_power_ratio: 1.5
threshold_detections: 3
window_size: 40
high_cut_bandpass: 32
low_cut_bandpass: 10
call_duration: 8
call_frequency: 68
stadir_name: stadir_bw

paths:
whale_data_cluster: /network/projects/aia/whale_call
list_sac_files: /network/projects/aia/whale_call/SAC_FILES_RAW.txt
mat_file: /network/projects/aia/whale_call/MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ plotly==5.9.0
pysmo==0.8.0
obspy==1.3.0
matplotlib==3.6.3
wget==3.2
wget==3.2
types-pyyaml==6.0.12.6
190 changes: 162 additions & 28 deletions scripts/preprocess_labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
from pathlib import Path
import pandas as pd
import scipy.io
from datetime import timedelta
import random
import yaml


def read_stadir(mat_file: dict, whale_type: str) -> dict:
Expand Down Expand Up @@ -53,17 +56,68 @@ def preprocess_(dataset: pd.DataFrame, stadir: dict) -> pd.DataFrame:
return dataset


def read_list_raw_files(sac_file_path: str) -> pd.DataFrame:
    """Build a dataframe describing the SAC files listed in a text file.

    @param: sac_file_path (string): PATH to the txt file containing one
        absolute SAC file path per line.
    return: pd.DataFrame with one row per file and the columns
        year, month, day, network, station, empty, coordinates, SAC
        (parsed from the dot-separated file name), plus
        folder (the date-named parent directory of the file) and
        file_path (the original full path, unchanged).
    """
    # Read list of raw data files (one absolute path per line)
    with open(sac_file_path) as file:
        paths = pd.Series([line.rstrip("\n") for line in file])

    # Split each path once on "/": with the expected layout
    # /network/projects/aia/whale_call/<...>/<folder_date>/<file_name>
    # column 6 is the date-named folder and column 7 is the file name.
    path_parts = paths.str.split("/", n=7, expand=True)

    # The file name itself is dot-separated:
    # <year>.<month>.<day>.<network>.<station>..<coordinates>.SAC
    # (the double dot yields the "empty" column).
    list_files_detailed = path_parts[7].str.split(".", n=7, expand=True).rename(
        {
            0: "year",
            1: "month",
            2: "day",
            3: "network",
            4: "station",
            5: "empty",
            6: "coordinates",
            7: "SAC",
        },
        axis=1,
    )

    # Keep the parent folder and the full path so callers can join on
    # folder/station and still locate each file on disk.
    list_files_detailed["folder"] = path_parts[6]
    list_files_detailed["file_path"] = paths

    return list_files_detailed


def main() -> None:
""" """

# Arguments parsing
args = parse_args()

# Load config
with open("config/config.yml", "r") as file:
param_data = yaml.safe_load(file)

# Output
output_dir = Path(args.output_dir).expanduser().resolve()
labels_output = output_dir / "LABELS"
labels_output = Path(param_data["paths"]["whale_data_cluster"]) / "LABELS"

labels_output.mkdir(
parents=True, exist_ok=True
) # Create folder if not exist

# Input
input_file = Path(args.input_file)

Expand All @@ -82,51 +136,131 @@ def main() -> None:
"detection_id",
]

fw_calls = pd.DataFrame(mat["FWC"], columns=colnames_WC)
bw_calls = pd.DataFrame(mat["BWC"], columns=colnames_WC)
# Get list of files in dataframe
if args.bandpass_filter is True:
list_files_detailled = read_list_raw_files(
"/network/projects/aia/whale_call/SAC_FILES_RAW.txt"
)
else:
list_files_detailled = read_list_raw_files(
"/network/projects/aia/whale_call/SAC_FILES_FILT.txt"
)

# Load name and index of stations
stadir_fw = read_stadir(mat, "stadir")
stadir_bw = read_stadir(mat, "stadir_bw")
# Loop for the 2 whale types
for whale_type in ["bw", "fw"]:

# Preprocess
fw_ds = preprocess_(fw_calls, stadir_fw)
bw_ds = preprocess_(bw_calls, stadir_bw)
# Get data from matlab .mat matrix
df_calls = pd.DataFrame(
mat[whale_type.upper() + "C"], columns=colnames_WC
)

# Plot counts
print(
"Number of Fin Whale calls detected: {}".format(
fw_ds.detection_id.nunique()
# Load name and index of stations
stadir = read_stadir(
mat, param_data["whale_constant"][whale_type]["stadir_name"]
)
)
print(
"Number of Blue Whale calls detected: {}".format(
bw_ds.detection_id.nunique()

# Preprocess labels
labels = preprocess_(df_calls, stadir)

# Plot counts of calls
print(
"Number of {} calls detected: {}".format(
param_data["whale_constant"][whale_type]["name"],
labels.detection_id.nunique(),
)
)

# Add start and end time of calls
labels["time_call_start"] = pd.to_datetime(labels["datetime"])
labels["time_call_end"] = labels["time_call_start"] + timedelta(
seconds=param_data["whale_constant"][whale_type]["call_duration"]
)
)

# Save datasets
fw_ds.to_csv(labels_output / "fin_whales.csv")
bw_ds.to_csv(labels_output / "blue_whales.csv")
# Add random value to create start and end time of window

list_randoms = []
for _ in labels.index:
list_randoms.append(
random.uniform( # nosec
0,
param_data["whale_constant"][whale_type]["window_size"]
- param_data["whale_constant"][whale_type][
"call_duration"
],
)
)

labels["random_t"] = list_randoms
labels["time_window_start"] = labels.apply(
lambda x: x.time_call_start - timedelta(seconds=x.random_t), axis=1
)
labels["time_window_end"] = labels["time_window_start"] + timedelta(
seconds=5
)

# Reformat folder name
labels["folder_date"] = (
labels["date"].astype(str).apply(lambda x: "".join(x.split("-")))
)

# Add column with Whale type
labels["whale_type"] = whale_type

# Merge labels and SAC file PATHs to same dataframe
final_df = pd.merge(
labels,
list_files_detailled,
left_on=["folder_date", "station_name"],
right_on=["folder", "station"],
).rename(
columns={
"coordinates": "component",
"station": "station_code",
"detection_id": "group_id",
}
)

# Save results to dataframe
if args.bandpass_filter is True:
csv_name = whale_type + "_filt.csv"
else:
csv_name = whale_type + ".csv"

final_df[
[
"file_path",
"time_window_start",
"time_window_end",
"time_call_start",
"time_call_end",
"R",
"SNR",
"group_id",
"station_code",
"whale_type",
"component",
]
].to_csv(labels_output / csv_name, index=False)


def parse_args() -> Namespace:
"""Parse arguments"""
description = "Script for preprocessing the labels coming from .mat matrix"
arg_parser = ArgumentParser(
description=description, formatter_class=ArgumentDefaultsHelpFormatter
)

arg_parser.add_argument(
"--output_dir",
default="data/",
type=str,
help="path to the output directory",
"--bandpass_filter",
default=True,
type=bool,
help="True if you want to use data with applied bandpass filter",
)

arg_parser.add_argument(
"--input_file",
default="/network/projects/aia/whale_call/calls_data \
/WhaleDetectionsLSZ.mat",
default="/network/projects/aia/whale_call/"
+ "MATLAB_OUTPUT/WhaleDetectionsLSZ_new.mat",
type=str,
help="path to the input file",
)
Expand Down

0 comments on commit e3c0d2e

Please sign in to comment.