feat: add our xumx predictor

0cf96435 · chinyun_yu_joey · 4d6d15d1 · 0cf96435
Commit 0cf96435 authored 3 years ago by chinyun_yu_joey
--- a/test_xumx_2.py
+++ b/test_xumx_2.py
+#!/usr/bin/env python
+
+import norbert
+import numpy as np
+import soundfile as sf
+import torch
+
+from evaluator.music_demixing import MusicDemixingPredictor
+
+
+def stft(x, n_fft=4096, n_hopsize=1024):
+    window = torch.hann_window(n_fft, dtype=x.dtype, device=x.device)
+    X = torch.stft(
+        x,
+        n_fft,
+        n_hopsize,
+        n_fft,
+        window,
+        return_complex=True
+    )
+    return X
+
+
+def istft(X, n_fft=4096, n_hopsize=1024):
+    dtype = X.dtype
+    if dtype == torch.complex32:
+        dtype = torch.float16
+    elif dtype == torch.complex64:
+        dtype = torch.float32
+    elif dtype == torch.complex128:
+        dtype = torch.float64
+    window = torch.hann_window(n_fft, dtype=dtype, device=X.device)
+    x = torch.istft(
+        X,
+        n_fft,
+        n_hopsize,
+        n_fft,
+        window,
+    )
+    return x
+
+# Separation function - taken from
+#    https://github.com/asteroid-team/asteroid/blob/master/egs/musdb18/X-UMX/eval.py
+
+
+def separate(
+    audio,
+    x_umx_target,
+    niter=1,
+    softmask=False,
+    alpha=1.0,
+    residual_model=False,
+    device="cpu",
+):
+    """
+    Performing the separation on audio input
+    Parameters
+    ----------
+    audio: np.ndarray [shape=(nb_samples, nb_channels, nb_timesteps)]
+        mixture audio
+    x_umx_target: asteroid.models
+        X-UMX model used for separating
+    instruments: list
+        The list of instruments, e.g., ["bass", "drums", "vocals"]
+    niter: int
+         Number of EM steps for refining initial estimates in a
+         post-processing stage, defaults to 1.
+    softmask: boolean
+        if activated, then the initial estimates for the sources will
+        be obtained through a ratio mask of the mixture STFT, and not
+        by using the default behavior of reconstructing waveforms
+        by using the mixture phase, defaults to False
+    alpha: float
+        changes the exponent to use for building ratio masks, defaults to 1.0
+    residual_model: boolean
+        computes a residual target, for custom separation scenarios
+        when not all targets are available, defaults to False
+    device: str
+        set torch device. Defaults to `cpu`.
+    Returns
+    -------
+    estimates: `dict` [`str`, `np.ndarray`]
+        dictionary with all estimates obtained by the separation model.
+    """
+
+    # convert numpy audio to torch
+    audio_torch = torch.tensor(audio.T).float().to(device)
+    X = stft(audio_torch)
+
+    with torch.no_grad():
+        masked_tf_rep = x_umx_target(X.abs().unsqueeze(0)).squeeze()
+
+    Vj = masked_tf_rep.cpu().numpy()
+    if softmask:
+        Vj **= alpha
+    V = np.transpose(Vj, (3, 2, 1, 0))
+    X = X.permute(2, 1, 0).cpu().numpy()
+
+    if residual_model:
+        V = norbert.residual_model(V, X, alpha if softmask else 1)
+
+    Y = norbert.wiener(V, X.astype(np.complex128),
+                       niter, use_softmask=softmask)
+
+    Y = torch.from_numpy(Y).permute(3, 2, 1, 0)
+    estimates = istft(Y.view(-1, *Y.shape[2:])).view(*Y.shape[:2], -1).numpy()
+
+    return estimates
+
+
+class XUMXPredictor(MusicDemixingPredictor):
+    def prediction_setup(self):
+        # Load your model here and put it into `evaluation` mode
+        self.separator = torch.jit.load("test_model.pt")
+        self.separator.eval()
+
+    def prediction(
+        self,
+        mixture_file_path,
+        bass_file_path,
+        drums_file_path,
+        other_file_path,
+        vocals_file_path,
+    ):
+
+        # Step 1: Load mixture
+        # mixture is stereo with sample rate of 44.1kHz
+        x, rate = sf.read(mixture_file_path)
+
+        # Step 2: Pad mixture to compensate STFT truncation
+        x_padded = np.pad(x, ((0, 1024), (0, 0)))
+
+        # Step 3: Perform separation
+        estimates = separate(
+            x_padded,
+            self.separator,
+        )
+
+        # Step 4: Truncate to orignal length
+        estimates = estimates[..., :x.shape[0]]
+
+        # Step 5: Store results
+        target_file_map = {
+            "vocals": vocals_file_path,
+            "drums": drums_file_path,
+            "bass": bass_file_path,
+            "other": other_file_path,
+        }
+        for i, target in enumerate(['drums', 'bass', 'other', 'vocals']):
+            path = target_file_map[target]
+            sf.write(
+                path,
+                estimates[i].T,
+                rate
+            )
+
+
+if __name__ == "__main__":
+    submission = XUMXPredictor()
+    submission.run()
+    print("Successfully generated predictions!")