Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Kaldi Pitch feature #1243

Merged
merged 7 commits into from
Feb 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/unittest/linux/scripts/run_style_checks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ fi

printf "\x1b[34mRunning clang-format:\x1b[0m\n"
"${this_dir}"/run_clang_format.py \
-r torchaudio/csrc \
-r torchaudio/csrc third_party/kaldi/src \
--clang-format-executable "${clangformat_path}" \
&& git diff --exit-code
status=$?
Expand Down
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
path = third_party/transducer/submodule
url = https://github.com/HawkAaron/warp-transducer
ignore = dirty
[submodule "kaldi"]
path = third_party/kaldi/submodule
url = https://github.com/kaldi-asr/kaldi
ignore = dirty
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ endif()

# Options
option(BUILD_SOX "Build libsox statically" OFF)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_TRANSDUCER "Enable transducer" OFF)
option(BUILD_LIBTORCHAUDIO "Build C++ Library" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
Expand Down
1 change: 1 addition & 0 deletions build_tools/setup_helpers/extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def build_extension(self, ext):
'-DCMAKE_VERBOSE_MAKEFILE=ON',
f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
"-DBUILD_KALDI:BOOL=ON",
f"-DBUILD_TRANSDUCER:BOOL={'ON' if _BUILD_TRANSDUCER else 'OFF'}",
"-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",
"-DBUILD_LIBTORCHAUDIO:BOOL=OFF",
Expand Down
5 changes: 5 additions & 0 deletions test/torchaudio_unittest/assets/kaldi_test_pitch_args.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{"sample_rate": 8000}
{"sample_rate": 8000, "frames_per_chunk": 200}
{"sample_rate": 8000, "frames_per_chunk": 200, "simulate_first_pass_online": true}
{"sample_rate": 16000}
{"sample_rate": 44100}
39 changes: 39 additions & 0 deletions test/torchaudio_unittest/common_utils/kaldi_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import subprocess

import torch


def convert_args(**kwargs):
    """Turn keyword arguments into Kaldi-style ``--name=value`` options.

    The ``sample_rate`` keyword is emitted under the name
    ``sample_frequency``.  Underscores in option names are replaced with
    hyphens, and values that compare equal to ``True``/``False`` are written
    in lower case.

    Returns:
        list of str: One ``--name=value`` entry per keyword argument, in the
        order the keyword arguments were passed.
    """
    options = []
    for name, raw in kwargs.items():
        option = 'sample_frequency' if name == 'sample_rate' else name
        flag = '--' + option.replace('_', '-')
        if raw in (True, False):
            text = str(raw).lower()
        else:
            text = str(raw)
        options.append('%s=%s' % (flag, text))
    return options


def run_kaldi(command, input_type, input_value):
    """Run provided Kaldi command, pass a tensor and get the resulting tensor

    Args:
        command (list of str):
            Executable followed by its arguments; handed to
            ``subprocess.Popen`` unchanged.
        input_type (str):
            'ark' or 'scp'
        input_value:
            Tensor for 'ark'
            string for 'scp' (path to an audio file)

    Returns:
        Tensor: The matrix read back from the command's standard output for
        the single entry that was submitted.

    Raises:
        NotImplementedError: If ``input_type`` is neither 'ark' nor 'scp'.
    """
    # Validate before importing or spawning anything, so that a bad argument
    # cannot leave a child process behind, blocked on its stdin.
    if input_type not in ('ark', 'scp'):
        raise NotImplementedError('Unexpected type')

    import kaldi_io

    key = 'foo'
    # Used as a context manager, Popen closes both pipes and waits for the
    # child on exit, including when one of the calls below raises.
    with subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as process:
        if input_type == 'ark':
            kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
        else:
            process.stdin.write(f'{key} {input_value}'.encode('utf8'))
        # Closing stdin signals end-of-input to the command.
        process.stdin.close()
        result = dict(kaldi_io.read_mat_ark(process.stdout))[key]
    return torch.from_numpy(result.copy())  # copy suppresses some torch warning
6 changes: 6 additions & 0 deletions test/torchaudio_unittest/functional/batch_consistency_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,9 @@ def test_vad(self):
waveform, sample_rate = torchaudio.load(filepath)
self.assert_batch_consistencies(
F.vad, waveform, sample_rate=sample_rate)

@common_utils.skipIfNoExtension
def test_compute_kaldi_pitch(self):
    # Feed white noise generated at 44.1 kHz to ``F.compute_kaldi_pitch``
    # through the ``assert_batch_consistencies`` helper.
    rate = 44100
    noise = common_utils.get_whitenoise(sample_rate=rate)
    self.assert_batch_consistencies(F.compute_kaldi_pitch, noise, sample_rate=rate)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import torch

from torchaudio_unittest.common_utils import PytorchTestCase
from .kaldi_compatibility_test_impl import KaldiCPUOnly


class TestKaldiCPUOnly(KaldiCPUOnly, PytorchTestCase):
    # Concrete test case: pairs the checks defined in ``KaldiCPUOnly`` with
    # float32 tensors on the CPU device.
    device = torch.device('cpu')
    dtype = torch.float32
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from parameterized import parameterized
import torchaudio.functional as F

from torchaudio_unittest.common_utils import (
get_sinusoid,
load_params,
save_wav,
skipIfNoExec,
TempDirMixin,
TestBaseMixin,
)
from torchaudio_unittest.common_utils.kaldi_utils import (
convert_args,
run_kaldi,
)


class KaldiCPUOnly(TempDirMixin, TestBaseMixin):
    # Compares ``F.compute_kaldi_pitch`` with the ``compute-kaldi-pitch-feats``
    # executable.  ``self.dtype`` and ``self.device`` are read here but not
    # defined in this class.

    def assert_equal(self, output, *, expected, rtol=None, atol=None):
        # Convert the reference to the dtype/device under test before comparing.
        reference = expected.to(dtype=self.dtype, device=self.device)
        self.assertEqual(output, reference, rtol=rtol, atol=atol)

    @parameterized.expand(load_params('kaldi_test_pitch_args.json'))
    @skipIfNoExec('compute-kaldi-pitch-feats')
    def test_pitch_feats(self, kwargs):
        """compute_kaldi_pitch produces numerically compatible result with compute-kaldi-pitch-feats"""
        rate = kwargs['sample_rate']

        # torchaudio side: float32 sinusoid, first row only
        # (presumably the first channel -- TODO confirm against get_sinusoid).
        float_wave = get_sinusoid(dtype='float32', sample_rate=rate)
        result = F.compute_kaldi_pitch(float_wave[0], **kwargs)

        # Kaldi side: int16 sinusoid saved to a temporary wav file, which the
        # command receives by path through an 'scp' entry on stdin.
        int_wave = get_sinusoid(dtype='int16', sample_rate=rate)
        wav_path = self.get_temp_path('test.wav')
        save_wav(wav_path, int_wave, rate)

        cli = ['compute-kaldi-pitch-feats', *convert_args(**kwargs), 'scp:-', 'ark:-']
        kaldi_result = run_kaldi(cli, 'scp', wav_path)
        self.assert_equal(result, expected=kaldi_result)
Original file line number Diff line number Diff line change
Expand Up @@ -547,3 +547,15 @@ def func(tensor):

tensor = common_utils.get_whitenoise(sample_rate=44100)
self._assert_consistency(func, tensor)

@common_utils.skipIfNoExtension
def test_compute_kaldi_pitch(self):
    # Anything other than float32 on CPU is skipped rather than failed.
    if not (self.dtype == torch.float32 and self.device == torch.device('cpu')):
        raise unittest.SkipTest("Only float32, cpu is supported.")

    # Wrapper handed to ``_assert_consistency``; the sample rate is fixed
    # inside it so that the tensor is its only argument.
    def func(tensor):
        sample_rate: float = 44100.
        return F.compute_kaldi_pitch(tensor, sample_rate)

    noise = common_utils.get_whitenoise(sample_rate=44100)
    self._assert_consistency(func, noise)
60 changes: 15 additions & 45 deletions test/torchaudio_unittest/kaldi_compatibility_impl.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,24 @@
"""Test suites for checking numerical compatibility against Kaldi"""
import subprocess

import kaldi_io
import torch
import torchaudio.functional as F
import torchaudio.compliance.kaldi
from parameterized import parameterized

from torchaudio_unittest.common_utils import (
TestBaseMixin,
TempDirMixin,
load_params,
skipIfNoExec,
get_asset_path,
load_wav
load_wav,
)
from torchaudio_unittest.common_utils.kaldi_utils import (
convert_args,
run_kaldi,
)


def _convert_args(**kwargs):
args = []
for key, value in kwargs.items():
key = '--' + key.replace('_', '-')
value = str(value).lower() if value in [True, False] else str(value)
args.append('%s=%s' % (key, value))
return args


def _run_kaldi(command, input_type, input_value):
"""Run provided Kaldi command, pass a tensor and get the resulting tensor

Args:
input_type: str
'ark' or 'scp'
input_value:
Tensor for 'ark'
string for 'scp' (path to an audio file)
"""
key = 'foo'
process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
if input_type == 'ark':
kaldi_io.write_mat(process.stdin, input_value.cpu().numpy(), key=key)
elif input_type == 'scp':
process.stdin.write(f'{key} {input_value}'.encode('utf8'))
else:
raise NotImplementedError('Unexpected type')
process.stdin.close()
result = dict(kaldi_io.read_mat_ark(process.stdout))['foo']
return torch.from_numpy(result.copy()) # copy supresses some torch warning


class Kaldi(TestBaseMixin):
class Kaldi(TempDirMixin, TestBaseMixin):
def assert_equal(self, output, *, expected, rtol=None, atol=None):
expected = expected.to(dtype=self.dtype, device=self.device)
self.assertEqual(output, expected, rtol=rtol, atol=atol)
Expand All @@ -65,8 +35,8 @@ def test_sliding_window_cmn(self):

tensor = torch.randn(40, 10, dtype=self.dtype, device=self.device)
result = F.sliding_window_cmn(tensor, **kwargs)
command = ['apply-cmvn-sliding'] + _convert_args(**kwargs) + ['ark:-', 'ark:-']
kaldi_result = _run_kaldi(command, 'ark', tensor)
command = ['apply-cmvn-sliding'] + convert_args(**kwargs) + ['ark:-', 'ark:-']
kaldi_result = run_kaldi(command, 'ark', tensor)
self.assert_equal(result, expected=kaldi_result)

@parameterized.expand(load_params('kaldi_test_fbank_args.json'))
Expand All @@ -76,8 +46,8 @@ def test_fbank(self, kwargs):
wave_file = get_asset_path('kaldi_file.wav')
waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
result = torchaudio.compliance.kaldi.fbank(waveform, **kwargs)
command = ['compute-fbank-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = _run_kaldi(command, 'scp', wave_file)
command = ['compute-fbank-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = run_kaldi(command, 'scp', wave_file)
self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)

@parameterized.expand(load_params('kaldi_test_spectrogram_args.json'))
Expand All @@ -87,8 +57,8 @@ def test_spectrogram(self, kwargs):
wave_file = get_asset_path('kaldi_file.wav')
waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
result = torchaudio.compliance.kaldi.spectrogram(waveform, **kwargs)
command = ['compute-spectrogram-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = _run_kaldi(command, 'scp', wave_file)
command = ['compute-spectrogram-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = run_kaldi(command, 'scp', wave_file)
self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)

@parameterized.expand(load_params('kaldi_test_mfcc_args.json'))
Expand All @@ -98,6 +68,6 @@ def test_mfcc(self, kwargs):
wave_file = get_asset_path('kaldi_file.wav')
waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
result = torchaudio.compliance.kaldi.mfcc(waveform, **kwargs)
command = ['compute-mfcc-feats'] + _convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = _run_kaldi(command, 'scp', wave_file)
command = ['compute-mfcc-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
kaldi_result = run_kaldi(command, 'scp', wave_file)
self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
8 changes: 8 additions & 0 deletions third_party/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ else()
endif()
list(APPEND TORCHAUDIO_THIRD_PARTIES libsox)

################################################################################
# kaldi
################################################################################
if (BUILD_KALDI)
add_subdirectory(kaldi)
list(APPEND TORCHAUDIO_THIRD_PARTIES kaldi)
endif()

################################################################################
# transducer
################################################################################
Expand Down
30 changes: 30 additions & 0 deletions third_party/kaldi/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Builds the static library target `kaldi` from a patched checkout of the
# Kaldi sources located in ./submodule plus two source files kept in ./src.

# Root of the Kaldi checkout that the commands below operate on.
set(KALDI_REPO ${CMAKE_CURRENT_SOURCE_DIR}/submodule)

# Apply custom patch
# Step 1: restore the tracked files of the checkout, so that the patch is
# applied to unmodified sources every time CMake configures the project.
# NOTE(review): none of the execute_process() calls below inspects the exit
# status (no RESULT_VARIABLE), so a failing step goes unnoticed here.
execute_process(
WORKING_DIRECTORY ${KALDI_REPO}
COMMAND "git" "checkout" "."
)
# Step 2: apply kaldi.patch; the relative path resolves to the file that sits
# next to this CMakeLists.txt because the working directory is ./submodule.
execute_process(
WORKING_DIRECTORY ${KALDI_REPO}
COMMAND git apply ../kaldi.patch
)
# Update the version string
# (runs Kaldi's own get_version.sh inside src/base of the checkout)
execute_process(
WORKING_DIRECTORY ${KALDI_REPO}/src/base
COMMAND sh get_version.sh
)

# Sources compiled into the library, relative to this directory: the first
# two come from ./src, the remaining ones from the Kaldi checkout.
set(KALDI_SOURCES
src/matrix/kaldi-vector.cc
src/matrix/kaldi-matrix.cc
submodule/src/base/kaldi-error.cc
submodule/src/base/kaldi-math.cc
submodule/src/feat/feature-functions.cc
submodule/src/feat/pitch-functions.cc
submodule/src/feat/resample.cc
)

add_library(kaldi STATIC ${KALDI_SOURCES})
# `src` is listed before `submodule/src`; presumably so that headers in ./src
# are found ahead of same-named upstream headers -- TODO confirm.
target_include_directories(kaldi PUBLIC src submodule/src)
# TORCH_LIBRARIES is not set in this file; it must be provided by the caller.
target_link_libraries(kaldi ${TORCH_LIBRARIES})
6 changes: 6 additions & 0 deletions third_party/kaldi/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Custom Kaldi build

This directory contains the original Kaldi repository (as a submodule), [the custom implementation of Kaldi's vector/matrix](./src), and the build script.

We use a custom build process so that the resulting library contains only what torchaudio needs.
We use the custom vector/matrix implementation so that we can use the same BLAS library that PyTorch is compiled with, and so that we can (hopefully, in the future) take advantage of other PyTorch features (such as differentiability and GPU support). The downside of this approach is that it adds a lot of overhead compared to the original Kaldi (operator dispatch and element-wise processing, which PyTorch is not efficient at). We can improve this gradually, and if you are interested in helping, please let us know by opening an issue.
76 changes: 76 additions & 0 deletions third_party/kaldi/kaldi.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
diff --git a/src/base/kaldi-types.h b/src/base/kaldi-types.h
index 7ebf4f853..c15b288b2 100644
--- a/src/base/kaldi-types.h
+++ b/src/base/kaldi-types.h
@@ -41,6 +41,7 @@ typedef float BaseFloat;

// for discussion on what to do if you need compile kaldi
// without OpenFST, see the bottom of this this file
+/*
#include <fst/types.h>

namespace kaldi {
@@ -53,10 +54,10 @@ namespace kaldi {
typedef float float32;
typedef double double64;
} // end namespace kaldi
+*/

// In a theoretical case you decide compile Kaldi without the OpenFST
// comment the previous namespace statement and uncomment the following
-/*
namespace kaldi {
typedef int8_t int8;
typedef int16_t int16;
@@ -70,6 +71,5 @@ namespace kaldi {
typedef float float32;
typedef double double64;
} // end namespace kaldi
-*/

#endif // KALDI_BASE_KALDI_TYPES_H_
diff --git a/src/matrix/matrix-lib.h b/src/matrix/matrix-lib.h
index b6059b06c..4fb9e1b16 100644
--- a/src/matrix/matrix-lib.h
+++ b/src/matrix/matrix-lib.h
@@ -25,14 +25,14 @@
#include "base/kaldi-common.h"
#include "matrix/kaldi-vector.h"
#include "matrix/kaldi-matrix.h"
-#include "matrix/sp-matrix.h"
-#include "matrix/tp-matrix.h"
+// #include "matrix/sp-matrix.h"
+// #include "matrix/tp-matrix.h"
#include "matrix/matrix-functions.h"
#include "matrix/srfft.h"
#include "matrix/compressed-matrix.h"
-#include "matrix/sparse-matrix.h"
+// #include "matrix/sparse-matrix.h"
#include "matrix/optimization.h"
-#include "matrix/numpy-array.h"
+// #include "matrix/numpy-array.h"

#endif

diff --git a/src/util/common-utils.h b/src/util/common-utils.h
index cfb0c255c..48d199e97 100644
--- a/src/util/common-utils.h
+++ b/src/util/common-utils.h
@@ -21,11 +21,11 @@

#include "base/kaldi-common.h"
#include "util/parse-options.h"
-#include "util/kaldi-io.h"
-#include "util/simple-io-funcs.h"
-#include "util/kaldi-holder.h"
-#include "util/kaldi-table.h"
-#include "util/table-types.h"
-#include "util/text-utils.h"
+// #include "util/kaldi-io.h"
+// #include "util/simple-io-funcs.h"
+// #include "util/kaldi-holder.h"
+// #include "util/kaldi-table.h"
+// #include "util/table-types.h"
+// #include "util/text-utils.h"

#endif // KALDI_UTIL_COMMON_UTILS_H_
Loading