-
Notifications
You must be signed in to change notification settings - Fork 1
/
Data_Preprocessing.py
76 lines (55 loc) · 2.26 KB
/
Data_Preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import math
import json
import librosa
# Root folder of the dataset; passed to Save_Data as `dir_path` and walked
# recursively, with every sub-folder recorded as one class.
# NOTE(review): this name shadows the builtin `dir()`, and Save_Data also
# reads it directly as a global when deciding which folder is the root.
dir = 'D:/User/Desktop/M.L/ML data/Sound/genres_original'
# Path of the JSON file that Save_Data writes its result to.
jason_path = "data.json"
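'''
n_mfcc      = number of MFCC coefficients per frame (typically 13-40)
n_fft       = FFT window size, in samples
hop_length  = stride between successive frames, in samples
num_segment = number of equal-length segments each audio file is split into.
Because we do not have a lot of data to train on, rather than training the model
on each complete audio file we train it on the smaller segments of every file.
'''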
# Running count of audio files that were loaded successfully; Save_Data
# increments it through `global c`.
c = 0
def Save_Data(dir_path, jason_path, n_mfcc=13, n_fft=2048, hop_length=512, num_segment=8, Samp_rate=22050):
    """Extract per-segment MFCCs for every audio file under ``dir_path`` and save them as JSON.

    Every directory below ``dir_path`` (the root itself is skipped) is treated
    as one class. Each audio file is loaded at ``Samp_rate``, cut into
    ``num_segment`` equal-length segments, and the MFCC matrix of each segment
    is stored together with the integer label of its class.

    Args:
        dir_path: Root directory of the dataset.
        jason_path: Path of the JSON file to write.
        n_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size in samples.
        hop_length: Stride between successive frames in samples.
        num_segment: Number of segments each track is divided into.
        Samp_rate: Sampling rate the audio is loaded at.

    Returns:
        None. The result is written to ``jason_path`` as a JSON object with
        three keys: ``"Class"`` (directory names; the list index is the
        label), ``"mfcc"`` (one frames-by-coefficients matrix per kept
        segment) and ``"labels"`` (the class index of each entry in
        ``"mfcc"``, in the same order).

    Side effects:
        Prints every file name it visits and increments the module-level
        counter ``c`` once per successfully loaded file.
    """
    global c

    duration_s = 30  # every track is assumed to be 30 seconds long
    samples_per_track = Samp_rate * duration_s
    samples_per_segment = int(samples_per_track / num_segment)
    # Only segments that produce exactly this many frames are kept, so that
    # all stored matrices share the same shape.
    expected_frames = math.ceil(samples_per_segment / hop_length)

    data = {
        "Class": [],
        "mfcc": [],
        "labels": [],
    }

    root = os.fspath(dir_path)
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend in a fixed order, so the
        # class -> label mapping does not depend on filesystem enumeration.
        dirnames.sort()

        # The dataset root is not a class. Compare by value against the
        # argument rather than by identity against a module global.
        if dirpath == root:
            continue

        label = len(data["Class"])  # index of this class inside data["Class"]
        # basename() yields the folder name even when the path has spaces.
        data["Class"].append(os.path.basename(dirpath))

        for file in sorted(filenames):
            print(file)
            try:
                signal, _ = librosa.load(os.path.join(dirpath, file), sr=Samp_rate)
            except Exception as err:
                # Unreadable or non-audio file: report it and move on, but let
                # KeyboardInterrupt / SystemExit propagate.
                print('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@')
                print(f"could not load {file!r}: {err}")
                continue
            c = c + 1

            for s in range(num_segment):
                start_samp = samples_per_segment * s
                segment = signal[start_samp:start_samp + samples_per_segment]
                if len(segment) == 0:
                    # The clip ended before this segment starts; every later
                    # segment is empty as well.
                    break
                # `y` is passed by keyword: recent librosa releases no longer
                # accept the audio as a positional argument.
                mfcc = librosa.feature.mfcc(
                    y=segment,
                    sr=Samp_rate,
                    n_fft=n_fft,
                    n_mfcc=n_mfcc,
                    hop_length=hop_length,
                ).T
                if len(mfcc) == expected_frames:
                    # One label per stored segment keeps both lists aligned.
                    data["mfcc"].append(mfcc.tolist())
                    data["labels"].append(label)

    with open(jason_path, "w") as f:
        json.dump(data, f, indent=4)
# Runs the extraction whenever this module is executed or imported, using the
# module-level dataset path and output path with the default MFCC parameters.
Save_Data(dir, jason_path)