import numpy as np
from scipy.ndimage import gaussian_filter, gaussian_filter1d


def _logit(W, threshold, slope):
    # logistic sigmoid centered at `threshold` with steepness `slope`:
    # values below the threshold are pushed towards 0, values above towards 1
    return 1. / (1.0 + np.exp(-slope*(W-threshold)))


def residual_model(v, x, alpha=1, autoscale=False):
    r"""Compute a model for the residual based on spectral subtraction.

    The method consists of two steps:

    * The provided spectrograms are summed up to obtain the *input* model
      for the mixture. This *input* model is scaled frequency-wise to best
      fit the actual observed mixture spectrogram.

    * The residual model is obtained through spectral subtraction of the
      input model from the mixture spectrogram, with flooring to 0.

    Parameters
    ----------
    v: np.ndarray [shape=(nb_frames, nb_bins, {1, nb_channels}, nb_sources)]
        Estimated spectrograms for the sources.

    x: np.ndarray [shape=(nb_frames, nb_bins, nb_channels)]
        Complex mixture.

    alpha: float [scalar]
        Exponent for the spectrograms `v`. For instance, if `alpha==1`,
        then `v` must be homogeneous to magnitudes, and if `alpha==2`, `v`
        must be homogeneous to squared magnitudes.

    autoscale: boolean
        Set to True if the provided spectrograms are not expected to have
        the right magnitude: the model is then scaled so that the residual
        is correctly estimated.

    Returns
    -------
    v: np.ndarray [shape=(nb_frames, nb_bins, nb_channels, nb_sources+1)]
        Spectrograms of the sources, with an appended one for the residual.

    Note
    ----
    It is not mandatory to input multichannel spectrograms. However, the
    output spectrograms *will* be multichannel.

    Warning
    -------
    You must be careful to set `alpha` as the exponent that corresponds to
    `v`. In other words, *you must have*: ``np.abs(x)**alpha`` homogeneous
    to `v`.
    """
    # to avoid dividing by zero
    eps = np.finfo(v.dtype).eps

    # spectrogram for the mixture
    vx = np.maximum(eps, np.abs(x)**alpha)

    # compute the total model as provided
    v_total = np.sum(v, axis=-1)

    if autoscale:
        # quick trick to scale the provided spectrograms to fit the mixture
        nb_frames = x.shape[0]
        gain = 0
        weights = eps
        for t in range(nb_frames):
            gain = gain + vx[None, t] * v_total[None, t]
            weights = weights + v_total[None, t]**2
        gain /= weights
        v *= gain[..., None]

        # re-sum the sources to build the new current model
        v_total = np.sum(v, axis=-1)

    # residual is difference between the observation and the model
    vr = np.maximum(0, vx - v_total)

    return np.concatenate((v, vr[..., None]), axis=3)
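

# A minimal usage sketch, illustrative only and not part of the library API:
# the shapes and random data below are assumptions. It builds magnitude
# spectrograms for two sources plus a complex mixture, then appends a
# residual model as an extra source.
def _example_residual_model():
    nb_frames, nb_bins, nb_channels, nb_sources = 100, 513, 2, 2
    # non-negative magnitude spectrograms for each source
    v = np.random.rand(nb_frames, nb_bins, nb_channels, nb_sources)
    # complex STFT of the mixture
    x = (np.random.randn(nb_frames, nb_bins, nb_channels)
         + 1j * np.random.randn(nb_frames, nb_bins, nb_channels))
    v_all = residual_model(v, x, alpha=1, autoscale=True)
    # one extra spectrogram has been appended for the residual
    assert v_all.shape == (nb_frames, nb_bins, nb_channels, nb_sources + 1)
    return v_all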


def smooth(v, width=1, temporal=False):
    """Smooth an ndarray with a Gaussian blur.

    Parameters
    ----------
    v: np.ndarray [shape=(nb_frames, ...)]
        input array

    width: int [scalar]
        lengthscale of the Gaussian blur

    temporal: boolean
        if True, smooth only along time through a 1d blur. Otherwise, use a
        multidimensional Gaussian blur.

    Returns
    -------
    result: np.ndarray [shape=(nb_frames, ...)]
        filtered array
    """
    if temporal:
        return gaussian_filter1d(v, sigma=width, axis=0)
    else:
        return gaussian_filter(v, sigma=width, truncate=width)
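

# Illustrative sketch only (not part of the library): blur a random
# spectrogram along every axis, or along the time axis only.
def _example_smooth():
    v = np.random.rand(100, 513)
    v_blur = smooth(v, width=3)                    # multidimensional blur
    v_blur_t = smooth(v, width=3, temporal=True)   # blur along frames only
    return v_blur, v_blur_t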


def reduce_interferences(v, thresh=0.6, slope=15):
    r"""Reduce interferences between spectrograms.

    The objective of the method is to redistribute the energy of the input
    in order to "sparsify" the spectrograms along the "source" dimension.
    This is motivated by the fact that sources are somewhat sparse and it is
    hence unlikely that they are all energetic at the same time-frequency
    bins.

    The method is inspired by [1]_, with ad-hoc modifications.

    References
    ----------
    .. [1] Thomas Prätzlich, Rachel Bittner, Antoine Liutkus, Meinard Müller.
        "Kernel additive modeling for interference reduction in multi-channel
        music recordings." Proc. of ICASSP 2015.

    Parameters
    ----------
    v: np.ndarray [shape=(..., nb_sources)]
        non-negative data on which to apply interference reduction

    thresh: float [scalar]
        threshold for the compression, should be between 0 and 1. The closer
        to 1, the more reduction of the interferences, at the price of more
        distortion.

    slope: float [scalar]
        the slope at which binarization is done. The higher, the more abrupt
        the transition.

    Returns
    -------
    v: np.ndarray [same shape as input]
        `v` with reduced interferences
    """
    eps = np.finfo(np.float32).eps
    # soften the data before computing the relative energy of each source
    vsmooth = smooth(v, 10)
    total_energy = eps + np.sum(vsmooth, axis=-1, keepdims=True)
    # compress the relative energies with a logistic function and use the
    # result as a gain on the original data
    v = _logit(vsmooth/total_energy, thresh, slope) * v
    return v
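

# Illustrative sketch only (shapes and values are assumptions, not part of
# the library): sparsify two overlapping source spectrograms along the
# source axis.
def _example_reduce_interferences():
    nb_frames, nb_bins, nb_sources = 100, 513, 2
    v = np.random.rand(nb_frames, nb_bins, nb_sources)
    v_sparse = reduce_interferences(v, thresh=0.6, slope=15)
    # energy is redistributed: bins where a source dominates keep their
    # energy, bins where it is weak relative to the others are attenuated
    assert v_sparse.shape == v.shape
    return v_sparse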


def compress_filter(W, thresh=0.6, slope=15):
    """Apply a logit compression to a filter.

    This makes it possible to "binarize" a separation filter, reducing
    interferences at the price of distortion.

    In the case of multichannel filters, they are decomposed as the cascade
    of a pure beamformer (selection of one direction in space), followed by
    a single-channel mask. Compression is then applied to the mask only.

    Parameters
    ----------
    W: np.ndarray [shape=(..., nb_channels, nb_channels)]
        filter on which to apply logit compression.

    thresh: float
        threshold for the compression, should be between 0 and 1. The closer
        to 1, the less interferences, but the more distortion.

    slope: float
        the slope at which binarization is done. The higher, the more abrupt
        the transition.

    Returns
    -------
    W: np.ndarray [same shape as input]
        Compressed filter
    """
    eps = np.finfo(W.dtype).eps
    nb_channels = W.shape[-1]
    if nb_channels > 1:
        # overall gain of the filter in each time-frequency bin: the trace
        # of the (nb_channels, nb_channels) filter matrix
        gains = np.trace(W, axis1=-2, axis2=-1)
        # compress the gain and renormalize, leaving the spatial part of the
        # filter (the beamforming direction) untouched
        W *= (_logit(gains, thresh, slope) / (eps + gains))[..., None, None]
    else:
        W = _logit(W, thresh, slope)
    return W
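

# Illustrative sketch only (shapes are assumptions, not part of the
# library): compress a stereo filter built as a single-channel mask times
# the identity, so that it acts more like a binary mask.
def _example_compress_filter():
    nb_frames, nb_bins, nb_channels = 100, 513, 2
    mask = np.random.rand(nb_frames, nb_bins)
    # one (nb_channels, nb_channels) filter per time-frequency bin
    W = mask[..., None, None] * np.eye(nb_channels)
    W_compressed = compress_filter(W, thresh=0.6, slope=15)
    assert W_compressed.shape == W.shape
    return W_compressed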