import numpy as np
import itertools
from .contrib import compress_filter, smooth, residual_model
from .contrib import reduce_interferences
def expectation_maximization(y, x, iterations=2, verbose=0, eps=None):
r"""Expectation maximization algorithm, for refining source separation
estimates.
This algorithm allows to make source separation results better by
enforcing multichannel consistency for the estimates. This usually means
a better perceptual quality in terms of spatial artifacts.
The implementation follows the details presented in [1]_, taking
inspiration from the original EM algorithm proposed in [2]_ and its
weighted refinement proposed in [3]_, [4]_.
It works by iteratively:
* Re-estimating the source parameters (power spectral densities and spatial
covariance matrices) through :func:`get_local_gaussian_model`.
* Separating the mixture again with the new parameters, by first computing
the new modelled mixture covariance matrices with :func:`get_mix_model`,
then preparing the Wiener filters through :func:`wiener_gain` and applying
them with :func:`apply_filter`.
References
----------
.. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and
N. Takahashi and Y. Mitsufuji, "Improving music source separation based
on deep neural networks through data augmentation and network
blending." 2017 IEEE International Conference on Acoustics, Speech
and Signal Processing (ICASSP). IEEE, 2017.
.. [2] N.Q. Duong and E. Vincent and R. Gribonval. "Under-determined
reverberant audio source separation using a full-rank spatial
covariance model." IEEE Transactions on Audio, Speech, and Language
Processing 18.7 (2010): 1830-1840.
.. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source
separation with deep neural networks." IEEE/ACM Transactions on Audio,
Speech, and Language Processing 24.9 (2016): 1652-1664.
.. [4] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music
separation with deep neural networks." 2016 24th European Signal
Processing Conference (EUSIPCO). IEEE, 2016.
.. [5] A. Liutkus and R. Badeau and G. Richard. "Kernel additive models for
source separation." IEEE Transactions on Signal Processing
62.16 (2014): 4298-4310.
Parameters
----------
y: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_sources)]
initial estimates for the sources
x: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
complex STFT of the mixture signal
iterations: int [scalar]
number of iterations for the EM algorithm.
verbose: boolean
display some information if True
eps: float or None [scalar]
The epsilon value to use for regularization and filters.
If None, the machine epsilon of `np.real(x).dtype` is used.
Returns
-------
y: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_sources)]
estimated sources after iterations
v: np.ndarray [shape=(nb_frames, nb_bins, nb_sources)]
estimated power spectral densities
R: np.ndarray [shape=(nb_bins, nb_channels, nb_channels, nb_sources)]
estimated spatial covariance matrices
Note
-----
* You need an initial estimate for the sources to apply this
algorithm. This is precisely what the :func:`wiener` function does.
* This algorithm *is not* an implementation of the "exact" EM
proposed in [1]_. In particular, it does not compute the posterior
covariance matrices the same (exact) way. Instead, it uses the
simplified approximate scheme initially proposed in [5]_ and further
refined in [3]_, [4]_, which boils down to simply taking the empirical
covariance of the recent source estimates, followed by a weighted
average for the update of the spatial covariance matrix. It has been
empirically demonstrated that this simplified algorithm is more
robust for music separation.
Warning
-------
It is *very* important to make sure `x.dtype` is `np.complex128`
if you want double precision, because this function will **not**
do such conversion for you from `np.complex64`, in case you want the
smaller RAM usage on purpose.
It is almost always better in terms of quality to use double
precision, e.g. by calling :func:`expectation_maximization`
with ``x.astype(np.complex128)``.
This is notably needed if you let common deep learning frameworks like
PyTorch or TensorFlow do the STFT, because this usually happens in
single precision.
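Examples
--------
A minimal sketch on random data, checking shapes only (the values are
arbitrary and carry no acoustic meaning):

>>> x = np.random.randn(10, 513, 2) + 1j * np.random.randn(10, 513, 2)
>>> y0 = np.stack([x / 4] * 4, axis=-1)  # crude initial estimates
>>> y, v, R = expectation_maximization(y0, x, iterations=2)
>>> y.shape, v.shape, R.shape
((10, 513, 2, 4), (10, 513, 4), (513, 2, 2, 4))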
"""
# to avoid dividing by zero
if eps is None:
eps = np.finfo(np.real(x[0]).dtype).eps
# dimensions
(nb_frames, nb_bins, nb_channels) = x.shape
nb_sources = y.shape[-1]
# allocate the spatial covariance matrices and PSD
R = np.zeros((nb_bins, nb_channels, nb_channels, nb_sources), x.dtype)
v = np.zeros((nb_frames, nb_bins, nb_sources))
if verbose:
print('Number of iterations: ', iterations)
regularization = np.sqrt(eps) * (
np.tile(np.eye(nb_channels, dtype=np.complex64),
(1, nb_bins, 1, 1)))
for it in range(iterations):
# construct the mixture covariance matrix frame by frame,
# to avoid ever storing the whole 6D tensor in RAM
if verbose:
print('EM, iteration %d' % (it+1))
for j in range(nb_sources):
# update the spectrogram model for source j
v[..., j], R[..., j] = get_local_gaussian_model(
y[..., j],
eps)
for t in range(nb_frames):
Cxx = get_mix_model(v[None, t, ...], R)
Cxx += regularization
inv_Cxx = _invert(Cxx, eps)
# separate the sources
for j in range(nb_sources):
W_j = wiener_gain(v[None, t, ..., j], R[..., j], inv_Cxx)
y[t, ..., j] = apply_filter(x[None, t, ...], W_j)[0]
return y, v, R
def wiener(v, x, iterations=1, use_softmask=True, eps=None):
"""Wiener-based separation for multichannel audio.
The method uses the (possibly multichannel) spectrograms `v` of the
sources to separate the (complex) Short Term Fourier Transform `x` of the
mix. Separation is done in a sequential way by:
* Getting an initial estimate. This can be done in two ways: either by
directly using the spectrograms with the mixture phase, or
by using :func:`softmask`.
* Refining these initial estimates through a call to
:func:`expectation_maximization`.
This implementation also allows one to specify the epsilon value used for
regularization. It is based on [1]_, [2]_, [3]_, [4]_.
References
----------
.. [1] S. Uhlich and M. Porcu and F. Giron and M. Enenkl and T. Kemp and
N. Takahashi and Y. Mitsufuji, "Improving music source separation based
on deep neural networks through data augmentation and network
blending." 2017 IEEE International Conference on Acoustics, Speech
and Signal Processing (ICASSP). IEEE, 2017.
.. [2] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel audio source
separation with deep neural networks." IEEE/ACM Transactions on Audio,
Speech, and Language Processing 24.9 (2016): 1652-1664.
.. [3] A. Nugraha and A. Liutkus and E. Vincent. "Multichannel music
separation with deep neural networks." 2016 24th European Signal
Processing Conference (EUSIPCO). IEEE, 2016.
.. [4] A. Liutkus and R. Badeau and G. Richard. "Kernel additive models for
source separation." IEEE Transactions on Signal Processing
62.16 (2014): 4298-4310.
Parameters
----------
v: np.ndarray [shape=(nb_frames, nb_bins, {1,nb_channels}, nb_sources)]
spectrograms of the sources. This is a nonnegative tensor that is
usually the output of the actual separation method of the user. The
spectrograms may be mono, but they need to be 4-dimensional in all
cases.
x: np.ndarray [complex, shape=(nb_frames, nb_bins, nb_channels)]
STFT of the mixture signal.
iterations: int [scalar]
number of iterations for the EM algorithm
use_softmask: boolean
* if `False`, then the mixture phase will directly be used with the
spectrogram as initial estimates.
* if `True`, a softmasking strategy will be used as described in
:func:`softmask`.
eps: {None, float}
Epsilon value to use for computing the separations. This is used
whenever division with a model energy is performed, i.e. when
softmasking and when iterating the EM.
It can be understood as the energy of the additional white noise
that is taken out when separating.
If `None`, the default value is taken as `np.finfo(np.real(x[0]).dtype).eps`.
Returns
-------
y: np.ndarray
[complex, shape=(nb_frames, nb_bins, nb_channels, nb_sources)]
STFT of estimated sources
Note
----
* Be careful that you need *magnitude spectrogram estimates* for the
case `use_softmask=False`.
* We recommend using `use_softmask=False` only if your spectrogram model is
pretty good, e.g. when it is the output of a deep neural net. If it is
not so great, opt for an initial softmasking strategy.
* The epsilon value will have a huge impact on performance. If it's large,
only the parts of the signal with a significant energy will be kept in
the sources. This epsilon then directly controls the energy of the
reconstruction error.
Warning
-------
As in :func:`expectation_maximization`, we recommend converting the
mixture `x` to double precision `np.complex128` *before* calling
:func:`wiener`.
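Examples
--------
A minimal sketch on random data, checking shapes only; here `v` is just
nonnegative noise standing in for actual model spectrograms:

>>> v = np.abs(np.random.randn(10, 257, 2, 4))  # 4 sources
>>> x = np.random.randn(10, 257, 2) + 1j * np.random.randn(10, 257, 2)
>>> y = wiener(v, x.astype(np.complex128), iterations=1)
>>> y.shape
(10, 257, 2, 4)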
"""
if use_softmask:
y = softmask(v, x, eps=eps)
else:
y = v * np.exp(1j*np.angle(x[..., None]))
if not iterations:
return y
# we need to refine the estimates. Scale down the estimates for
# numerical stability
max_abs = max(1, np.abs(x).max()/10.)
x /= max_abs
y = expectation_maximization(y/max_abs, x, iterations, eps=eps)[0]
return y*max_abs
def softmask(v, x, logit=None, eps=None):
"""Separates a mixture with a ratio mask, using the provided sources
spectrograms estimates. Additionally allows compressing the mask with
a logit function for soft binarization.
The filter does *not* take multichannel correlations into account.
The masking strategy can be traced back to the work of N. Wiener in the
case of *power* spectrograms [1]_. In the case of *fractional* spectrograms
like the magnitude, this filter is often referred to as a "ratio mask", and
has been shown to be the optimal separation procedure under alpha-stable
assumptions [2]_.
References
----------
.. [1] N. Wiener,"Extrapolation, Inerpolation, and Smoothing of Stationary
Time Series." 1949.
.. [2] A. Liutkus and R. Badeau. "Generalized Wiener filtering with
fractional power spectrograms." 2015 IEEE International Conference on
Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2015.
Parameters
----------
v: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_sources)]
spectrograms of the sources
x: np.ndarray [complex, shape=(nb_frames, nb_bins, nb_channels)]
STFT of the mixture signal
logit: {None, float between 0 and 1}
enable a compression of the filter. If not None, it is the threshold
value for the logit function: a softmask above this threshold is
brought closer to 1, and a softmask below is brought closer to 0.
eps: {None, float}
small regularization value, added to the total energy of the sources
to avoid division by zero. If `None`, the machine epsilon of
`np.real(x).dtype` is used.
Returns
-------
y: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_sources)]
estimated sources
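Examples
--------
A minimal sketch on random data, checking shapes only; note that mono
(single-channel) models broadcast over the mixture channels:

>>> v = np.abs(np.random.randn(10, 257, 1, 4))  # mono models, 4 sources
>>> x = np.random.randn(10, 257, 2) + 1j * np.random.randn(10, 257, 2)
>>> softmask(v, x).shape
(10, 257, 2, 4)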
"""
# to avoid dividing by zero
if eps is None:
eps = np.finfo(np.real(x[0]).dtype).eps
total_energy = np.sum(v, axis=-1, keepdims=True)
filter = v/(eps + total_energy.astype(x.dtype))
if logit is not None:
filter = compress_filter(filter, eps, thresh=logit, multichannel=False)
return filter * x[..., None]
def _invert(M, eps):
"""
Invert matrices, with special fast handling of the 1x1 and 2x2 cases.
Will generate errors if the matrices are singular: users must handle this
through their own regularization schemes.
Parameters
----------
M: np.ndarray [shape=(..., nb_channels, nb_channels)]
matrices to invert: must be square along the last two dimensions
eps: [scalar]
regularization parameter, used *only* in the case of matrices
bigger than 2x2
Returns
-------
invM: np.ndarray, [shape=M.shape]
inverses of M
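Examples
--------
A quick sanity check on a batch with a single 2x2 matrix (arbitrary
values), exercising the analytical 2x2 branch:

>>> M = np.array([[[2.0, 0.0], [0.0, 4.0]]])  # shape (1, 2, 2)
>>> np.allclose(_invert(M, eps=1e-10) @ M, np.eye(2))
True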
"""
nb_channels = M.shape[-1]
if nb_channels == 1:
# scalar case
invM = 1.0/(M+eps)
elif nb_channels == 2:
# two channels case: analytical expression
det = (
M[..., 0, 0]*M[..., 1, 1] -
M[..., 0, 1]*M[..., 1, 0])
invDet = 1.0/(det)
invM = np.empty_like(M)
invM[..., 0, 0] = invDet*M[..., 1, 1]
invM[..., 1, 0] = -invDet*M[..., 1, 0]
invM[..., 0, 1] = -invDet*M[..., 0, 1]
invM[..., 1, 1] = invDet*M[..., 0, 0]
else:
# general case: no analytical expression available (slow!)
invM = np.linalg.pinv(M, eps)
return invM
def wiener_gain(v_j, R_j, inv_Cxx):
"""
Compute the Wiener gain for separating one source, given all parameters.
It is the matrix applied to the mix to get the posterior mean of the
source, as in [1]_.
References
----------
.. [1] N.Q. Duong and E. Vincent and R. Gribonval. "Under-determined
reverberant audio source separation using a full-rank spatial
covariance model." IEEE Transactions on Audio, Speech, and Language
Processing 18.7 (2010): 1830-1840.
Parameters
----------
v_j: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
power spectral density of the target source.
R_j: np.ndarray [shape=(nb_bins, nb_channels, nb_channels)]
spatial covariance matrix of the target source
inv_Cxx: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_channels)]
inverse of the mixture covariance matrices
Returns
-------
G: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_channels)]
wiener filtering matrices, to apply to the mix, e.g. through
:func:`apply_filter` to get the target source estimate.
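Examples
--------
A shape-only sketch with identity spatial covariances and identity
inverse mixture covariances (arbitrary stand-ins):

>>> v_j = np.abs(np.random.randn(10, 257))
>>> R_j = np.tile(np.eye(2, dtype=np.complex128), (257, 1, 1))
>>> inv_Cxx = np.tile(np.eye(2, dtype=np.complex128), (10, 257, 1, 1))
>>> wiener_gain(v_j, R_j, inv_Cxx).shape
(10, 257, 2, 2)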
"""
(_, nb_channels) = R_j.shape[:2]
# computes multichannel Wiener gain as v_j R_j inv_Cxx
G = np.zeros_like(inv_Cxx)
for (i1, i2, i3) in itertools.product(*(range(nb_channels),)*3):
G[..., i1, i2] += (R_j[None, :, i1, i3] * inv_Cxx[..., i3, i2])
G *= v_j[..., None, None]
return G
def apply_filter(x, W):
"""
Applies a filter on the mixture. This simply amounts to a matrix
multiplication per time-frequency bin.
Parameters
----------
x: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
STFT of the signal on which to apply the filter.
W: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_channels)]
filtering matrices, as returned, e.g. by :func:`wiener_gain`
Returns
-------
y_hat: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
filtered signal
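Examples
--------
A quick sanity check: an identity filter should leave the mixture
unchanged (sizes arbitrary):

>>> x = np.random.randn(10, 257, 2) + 1j * np.random.randn(10, 257, 2)
>>> W = np.tile(np.eye(2, dtype=np.complex128), (10, 257, 1, 1))
>>> np.allclose(apply_filter(x, W), x)
True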
"""
nb_channels = W.shape[-1]
# apply the filter
y_hat = 0+0j
for i in range(nb_channels):
y_hat += W[..., i] * x[..., i, None]
return y_hat
def get_mix_model(v, R):
"""
Compute the model covariance of a mixture based on local Gaussian models.
It simply adds up all the `v[..., j] * R[..., j]` terms.
Parameters
----------
v: np.ndarray [shape=(nb_frames, nb_bins, nb_sources)]
Power spectral densities for the sources
R: np.ndarray [shape=(nb_bins, nb_channels, nb_channels, nb_sources)]
Spatial covariance matrices of each source
Returns
-------
Cxx: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_channels)]
Covariance matrix for the mixture
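Examples
--------
A shape-only sketch with identity spatial covariances (sizes arbitrary):

>>> v = np.abs(np.random.randn(10, 257, 4))
>>> R = np.tile(np.eye(2, dtype=np.complex128)[..., None], (257, 1, 1, 4))
>>> get_mix_model(v, R).shape
(10, 257, 2, 2)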
"""
nb_channels = R.shape[1]
(nb_frames, nb_bins, nb_sources) = v.shape
Cxx = np.zeros((nb_frames, nb_bins, nb_channels, nb_channels), R.dtype)
for j in range(nb_sources):
Cxx += v[..., j, None, None] * R[None, ..., j]
return Cxx
def _covariance(y_j):
"""
Compute the empirical covariance for a source.
Parameters
----------
y_j: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
complex STFT of the source.
Returns
-------
Cj: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_channels)]
just y_j * conj(y_j.T): empirical covariance for each TF bin.
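Examples
--------
A tiny sanity check on one frame, one bin and two channels:

>>> y = np.array([[[1.0 + 0j, 2.0 + 0j]]])  # shape (1, 1, 2)
>>> np.allclose(_covariance(y)[0, 0], [[1, 2], [2, 4]])
True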
"""
(nb_frames, nb_bins, nb_channels) = y_j.shape
Cj = np.zeros((nb_frames, nb_bins, nb_channels, nb_channels),
y_j.dtype)
for (i1, i2) in itertools.product(*(range(nb_channels),)*2):
Cj[..., i1, i2] += y_j[..., i1] * np.conj(y_j[..., i2])
return Cj
def get_local_gaussian_model(y_j, eps=1.):
r"""
Compute the local Gaussian model [1]_ for a source given the complex STFT.
First get the power spectral densities, and then the spatial covariance
matrix, as done in [1]_, [2]_.
References
----------
.. [1] N.Q. Duong and E. Vincent and R. Gribonval. "Under-determined
reverberant audio source separation using a full-rank spatial
covariance model." IEEE Transactions on Audio, Speech, and Language
Processing 18.7 (2010): 1830-1840.
.. [2] A. Liutkus and R. Badeau and G. Richard. "Low bitrate informed
source separation of realistic mixtures." 2013 IEEE International
Conference on Acoustics, Speech and Signal Processing. IEEE, 2013.
Parameters
----------
y_j: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
complex STFT of the source.
eps: float [scalar]
regularization term
Returns
-------
v_j: np.ndarray [shape=(nb_frames, nb_bins)]
power spectral density of the source
R_j: np.ndarray [shape=(nb_bins, nb_channels, nb_channels)]
Spatial covariance matrix of the source
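Examples
--------
A minimal sketch on random data; the estimated spatial covariances
should be Hermitian in the channel dimensions:

>>> y_j = np.random.randn(10, 257, 2) + 1j * np.random.randn(10, 257, 2)
>>> v_j, R_j = get_local_gaussian_model(y_j, eps=1e-10)
>>> v_j.shape
(10, 257)
>>> np.allclose(R_j, np.conj(R_j.swapaxes(-2, -1)))
True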
"""
v_j = np.mean(np.abs(y_j)**2, axis=2)
# updates the spatial covariance matrix
nb_frames = y_j.shape[0]
R_j = 0
weight = eps
for t in range(nb_frames):
R_j += _covariance(y_j[None, t, ...])
weight += v_j[None, t, ...]
R_j /= weight[..., None, None]
return v_j, R_j