forked from breadbread1984/wavenet-tf2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dataset.py
119 lines (108 loc) · 4.98 KB
/
create_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python3
import sys;
from os import listdir, mkdir;
from os.path import join, exists, splitext;
from re import search;
from random import shuffle;
from threading import Thread, Lock;
import concurrent;
import librosa;
import pandas as pd;
import numpy as np;
import tensorflow as tf;
def mu_law_encode(audio, quantization_channels = 256):
  """Quantize a float waveform in [-1, 1] into integer codes via mu-law companding."""
  mu = tf.constant(quantization_channels - 1, dtype = tf.float32)
  # Clamp magnitudes to 1 so the log1p argument stays within the valid range.
  clipped = tf.math.minimum(tf.math.abs(audio), 1.0)
  # Logarithmic compression: log(1 + mu*|x|) / log(1 + mu) in [0, 1].
  compressed = tf.math.sign(audio) * (tf.math.log1p(mu * clipped) / tf.math.log1p(mu))
  # Shift [-1, 1] onto [0, mu] and round to the nearest integer code.
  return tf.cast((compressed + 1) / 2 * mu + 0.5, dtype = tf.int32)
def mu_law_decode(output, quantization_channels = 256):
  """Invert mu-law quantization: map integer codes back to floats in [-1, 1]."""
  mu = tf.constant(quantization_channels - 1, dtype = tf.float32)
  # Rescale codes from [0, mu] back into [-1, 1].
  rescaled = 2 * (tf.cast(output, dtype = tf.float32) / mu) - 1
  # Undo the logarithmic compression: ((1 + mu)^|y| - 1) / mu.
  expanded = ((1 + mu) ** tf.math.abs(rescaled) - 1) / mu
  return tf.sign(rescaled) * expanded
def parse_function_generator(quantization_channels = 256):
  """Return a parser mapping a serialized tf.train.Example to (audio, category).

  Args:
    quantization_channels: kept for interface compatibility; the stored audio
      is already mu-law quantized, so this value is not used here.

  Returns:
    A function suitable for tf.data.Dataset.map that yields
    (audio: float32 tensor of shape (length, 1), category: float32 scalar).
  """
  def parse_function(serialized_example):
    feature = tf.io.parse_single_example(
      serialized_example,
      features = {
        'audio': tf.io.VarLenFeature(dtype = tf.int64),
        'length': tf.io.FixedLenFeature((), dtype = tf.int64),
        'category': tf.io.FixedLenFeature((), dtype = tf.int64),
        'transcript': tf.io.FixedLenFeature((), dtype = tf.string)
      }
    )
    length = feature['length']
    audio = tf.sparse.to_dense(feature['audio'], default_value = 0)
    audio = tf.cast(tf.reshape(audio, (length, 1)), dtype = tf.float32)
    category = tf.cast(feature['category'], dtype = tf.float32)
    # NOTE: the transcript is parsed from the record but was never returned;
    # the original also unicode-decoded it into an unused local, which was
    # dead work on every dataset element and has been removed.
    return audio, category
  return parse_function
def main(root_dir, sample_rate = 16000, silence_threshold = 0.3, dilations = None, quantization_channels = 256):
  """Build 'dataset/trainset.tfrecord' from a VCTK-style directory layout.

  Expects root_dir to contain 'wav48/<speaker>/pXXX_YYY.wav' audio files with
  matching 'txt/<speaker>/pXXX_YYY.txt' transcripts.  Each written example
  holds the mu-law quantized audio (silence-trimmed, head-padded by the
  model's receptive field), its length, the speaker's class id and the raw
  transcript.  Also persists the speaker-id -> class-id mapping to
  'category.pkl' and 'category.xls'.

  Args:
    root_dir: corpus root directory.
    sample_rate: resample rate passed to librosa.load.
    silence_threshold: RMS-energy threshold for trimming leading/trailing silence.
    dilations: dilation schedule forwarded to calculate_receptive_field.
      None (the default) selects [1, 2, ..., 512] repeated 5 times — a None
      sentinel replaces the original mutable default argument.
    quantization_channels: number of mu-law quantization levels.
  """
  from WaveNet import calculate_receptive_field
  if dilations is None:
    dilations = [2 ** i for i in range(10)] * 5
  receptive_field = calculate_receptive_field(dilations, 2, 32)
  category = dict()  # person_id -> class id
  count = 0
  audiolist = list()
  for d in listdir(join(root_dir, 'wav48')):
    for f in listdir(join(root_dir, 'wav48', d)):
      result = search(r'p([0-9]+)_([0-9]+)\.wav', f)
      if result is None: continue
      label_file = join(root_dir, 'txt', d, splitext(f)[0] + ".txt")
      if not exists(label_file):
        print("can't find corresponding label file " + label_file)
        continue
      if result[1] not in category:
        category[result[1]] = count
        count += 1
      audiolist.append((join(root_dir, 'wav48', d, f), label_file, category[result[1]]))
  shuffle(audiolist)
  if not exists('dataset'): mkdir('dataset')
  writer = tf.io.TFRecordWriter(join('dataset', 'trainset.tfrecord'))
  # NOTE: we tried using multithreading, but the speed is even slower
  for audio_path, label_path, class_id in audiolist:
    # 1) load audio file
    audio, _ = librosa.load(audio_path, sr = sample_rate, mono = True)
    audio = audio.reshape(-1, 1)
    # 2) load label file.  open() raises on failure and never returns None, so
    # the original `if label is None` check was dead code; catch OSError
    # instead, and use `with` so the handle is closed even if read() fails.
    try:
      with open(label_path, 'r') as label:
        transcript = label.read().strip()
    except OSError:
      print("can't open label file " + label_path)
      writer.close()  # don't leak the TFRecordWriter on the early-return path
      return
    class_id = int(class_id)
    # 3) trim silence under specific signal to noise ratio
    frame_length = 2048 if audio.size >= 2048 else audio.size
    # NOTE(review): newer librosa versions require the keyword form
    # librosa.feature.rms(y=audio, ...) — confirm the installed version.
    energy = librosa.feature.rms(audio, frame_length = frame_length)
    frames = np.nonzero(energy > silence_threshold)
    indices = librosa.core.frames_to_samples(frames)[1]
    # Keep samples between the first and last above-threshold frame; an empty
    # `indices` means the whole clip is silence, so keep nothing.
    audio = audio[indices[0]:indices[-1]] if indices.size else audio[0:0]
    audio = audio.reshape(-1, 1)
    # 4) pad at head so the first real sample has a full receptive field of context
    audio = np.pad(audio, [[receptive_field, 0], [0, 0]], 'constant')
    # 5) quantization
    quantized = mu_law_encode(audio, quantization_channels)  # shape (length, 1)
    # 6) write to file
    trainsample = tf.train.Example(features = tf.train.Features(
      feature = {
        'audio': tf.train.Feature(int64_list = tf.train.Int64List(value = tf.reshape(quantized, (-1,)))),
        'length': tf.train.Feature(int64_list = tf.train.Int64List(value = [quantized.shape[0]])),
        'category': tf.train.Feature(int64_list = tf.train.Int64List(value = [class_id])),
        'transcript': tf.train.Feature(bytes_list = tf.train.BytesList(value = [transcript.encode('utf-8')]))
      }
    ))
    writer.write(trainsample.SerializeToString())
  writer.close()
  # Persist the speaker-id -> class-id mapping for later lookup.
  category = [(class_id, person_id) for person_id, class_id in category.items()]
  category = pd.DataFrame(category, columns = ['class_id', 'person_id'])
  category.to_pickle('category.pkl')
  category.to_excel('category.xls')
if __name__ == "__main__":
assert True == tf.executing_eagerly();
if len(sys.argv) != 2:
print("Usage: " + sys.argv[0] + " <directory>");
exit(1);
main(sys.argv[1]);