1 change: 1 addition & 0 deletions ACKNOWLEDGEMENTS
@@ -18,6 +18,7 @@ Jom Kuriakose
Shrey Dutta
Shubham Lohiya
Swarada Bharadwaj
Serafin Schweinitz

Project Musical AI - PID2019-111403GB-I00/AEI/10.13039/501100011033 funded by the Spanish
Ministerio de Ciencia, Innovación y Universidades (MCIU) and the Agencia Estatal de
2 changes: 1 addition & 1 deletion compiam/__init__.py
@@ -71,7 +71,7 @@ def load_dataset(dataset_name, data_home=None, version="default"):
"""
if dataset_name not in datasets_list:
raise ValueError("Invalid dataset {}".format(dataset_name))
dataloader = mirdata.initialize(
dataset_name=dataset_name, data_home=data_home, version=version
)
dataloader.download(["index"]) # Download index file
18 changes: 18 additions & 0 deletions compiam/data.py
@@ -234,6 +234,24 @@
},
},
},
"separation:convtdf-vocal-finetune": {
"module_name": "compiam.separation.singing_voice_extraction.convtdf_vocal_finetune",
"class_name": "ConvTDFVocalFineTune",
"default_version": "v1",
"kwargs": {
"v1": {
"model_path": os.path.join(
"models",
"separation",
"convtdf_vocal_finetune",
"vocals",
"checkpoint_finetuned.pt",
),
"download_link": "https://zenodo.org/records/15121572/files/convtdf_vocal_finetune.zip?download=1",
"download_checksum": "170c7a25cb06911f2e4a9452ce943aed",
},
},
},
}


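A minimal usage sketch of the new registry entry, assuming key resolution and weight download behave as for the models already registered in this file:

```python
import compiam

# Key added in this diff; on first load the checkpoint is fetched from the
# Zenodo link declared under "kwargs" above (the checksum field suggests the
# download is verified).
model = compiam.load_model("separation:convtdf-vocal-finetune")
```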
12 changes: 6 additions & 6 deletions compiam/dunya/__init__.py
@@ -25,9 +25,7 @@ def __init__(self, tradition, token):
dunya.set_token(self.token)

if tradition not in ["carnatic", "hindustani"]:
raise ValueError(
"Please choose a valid tradition: carnatic or hindustani"
)
raise ValueError("Please choose a valid tradition: carnatic or hindustani")
self.tradition = carnatic if tradition == "carnatic" else hindustani

# Functions from the compmusic API are added as a method in the Corpora class
@@ -36,10 +34,12 @@ def __init__(self, tradition, token):
if callable(func):
setattr(self, name, func)

logger.warning("""
logger.warning(
"""
Note that a part of the collection is under restricted access.
To access the full collection please request permission at https://dunya.compmusic.upf.edu/user/profile/
""")
"""
)

def get_collection(self, recording_detail=False):
"""Get the documents (recordings) in a collection.
@@ -54,7 +54,7 @@
+ "Please note that it might take a few moments..."
)
return self.tradition.get_recordings(recording_detail)

@staticmethod
def list_available_types(recording_id):
"""Get the available source filetypes for a Musicbrainz recording.
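For context, a hedged sketch of how this constructor is typically reached (the token string is a placeholder, not a real credential):

```python
from compiam.dunya import Corpora

# Request an access token at https://dunya.compmusic.upf.edu/user/profile/
corpora = Corpora("carnatic", token="<your-dunya-token>")
recordings = corpora.get_collection()  # may take a few moments
```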
5 changes: 3 additions & 2 deletions compiam/io.py
@@ -94,11 +94,11 @@ def write_scalar_txt(data, output_path):

def resolve_dottedname(dotted_name):
"""Resolve a dotted name to an actual object, similar to zope.dottedname.resolve

:param dotted_name: a dotted name
:returns: the object the dotted name refers to
"""
module_name, _, attribute_name = dotted_name.rpartition('.')
module_name, _, attribute_name = dotted_name.rpartition(".")
if not module_name:
raise ImportError(f"Invalid dotted name: '{dotted_name}'")
module = importlib.import_module(module_name)
@@ -111,6 +111,7 @@ def load_yaml(path):
:param path: input file
:returns: loaded yaml information
"""

def constructor_dottedname(loader, node):
value = loader.construct_scalar(node)
return resolve_dottedname(value)
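A quick illustration of `resolve_dottedname`, using a standard-library name purely as an example:

```python
import os.path

from compiam.io import resolve_dottedname

# Resolves the dotted name to the actual object, as the nested
# constructor_dottedname helper does for names found in YAML files.
join = resolve_dottedname("os.path.join")
assert join is os.path.join
```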
2 changes: 1 addition & 1 deletion compiam/melody/pattern/sancara_search/__init__.py
@@ -185,7 +185,7 @@ def load_model(self, model_path, conf_path, spec_path):
try:
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device),
strict=False
strict=False,
)
except:
self.model.load_state_dict(
6 changes: 6 additions & 0 deletions compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py
@@ -270,6 +270,7 @@ def predict(
hop_size=80,
batch_size=5,
out_step=None,
amplify_input=1.0,
gpu="-1",
):
"""Extract melody from input_data.
@@ -283,6 +284,7 @@
(defaulted to 5, increase if enough computational power, reduce if
needed).
:param out_step: particular time-step duration if needed at output
:param amplify_input: for low-volume inputs, we have found that amplifying the signal (e.g. x10, x50) may yield better voicing detection
:param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
:returns: a 2-D list with time-stamps and pitch values per timestamp.
"""
@@ -323,6 +325,10 @@
xlist = []
timestamps = []

# Peak-normalize, then apply the requested amplification
audio = audio / audio.max()
audio = audio * amplify_input

audio_len = len(audio)
batch_min = self.sample_rate * 60 * batch_size
freqs = []
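A hedged usage sketch of the new `amplify_input` parameter, assuming the model is loaded through the existing `melody:ftanet-carnatic` registry key and `my_recording.wav` stands in for a real file:

```python
import compiam

ftanet = compiam.load_model("melody:ftanet-carnatic")

# For quiet recordings, boost the peak-normalized input before inference;
# x10 and x50 are the example factors suggested in the docstring above.
pitch_track = ftanet.predict("my_recording.wav", amplify_input=10.0)
```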
@@ -83,7 +83,9 @@ def load_model(self, model_path):
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device)
)
except:
self.model.load_state_dict(torch.load(model_path, map_location=self.device))
self.model_path = model_path
6 changes: 4 additions & 2 deletions compiam/melody/raga_recognition/deepsrgm/__init__.py
@@ -124,7 +124,9 @@ def load_model(self, model_path, rnn="lstm"):

self.model_path = model_path
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
weights = torch.load(
model_path, weights_only=True, map_location=self.device
)
except:
weights = torch.load(model_path, map_location=self.device)
new_weights = weights.copy()
@@ -168,7 +170,7 @@ def load_raga_dataset(self, data_home=None, download=False):
"compmusic_raga", data_home=data_home, version="default"
)
if download:
self.dataset.download()  # Downloads index and features
logger.warning(
f"""
The features are downloaded, but the audio of this dataset is private.
7 changes: 5 additions & 2 deletions compiam/separation/README.md
@@ -3,9 +3,12 @@
| **Tool** | **Task** | **Paper** |
|---------------------------|----------------------------------|-----------|
| ColdDiffSep | Singing voice extraction | [1] |
| MDXNet w/ mixer model | Music source separation | [2] |
| ConvTDF Vocal Fine-tuned | Singing voice extraction | [2] |
| MDXNet w/ mixer model | Music source separation | [3] |


[1] G. Plaja-Roglans, M. Miron, A. Shankar, and X. Serra, "Carnatic Singing Voice Separation using Cold Diffusion on Training Data with Bleeding", in International Society for Music Information Retrieval Conference (ISMIR 23), 2023.

[2] Work under review.
[2] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) at ICASSP, 2025.

[3] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) at ICASSP, 2025.
1 change: 1 addition & 0 deletions compiam/separation/__init__.py
@@ -12,6 +12,7 @@

### IMPORT HERE THE CONSIDERED TASKS
from compiam.separation import singing_voice_extraction
from compiam.separation import music_source_separation


# Show user the available tasks
90 changes: 63 additions & 27 deletions compiam/separation/music_source_separation/mixer_model/__init__.py
@@ -69,6 +69,7 @@ def __init__(
self.load_model(self.model_path)

self.chunk_size = self.model.chunk_size
self.overlap = 0.25

def forward(self, x):
"""Forward pass of the mixer model"""
@@ -86,7 +87,9 @@ def load_model(self, model_path):
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
weights = torch.load(
model_path, weights_only=True, map_location=self.device
)
except:
weights = torch.load(model_path, map_location=self.device)
self.model.load_state_dict(weights)
@@ -97,13 +100,15 @@ def separate(
self,
input_data,
input_sr=44100,
normalize_input=True,
gpu="-1",
):
"""Separate singing voice and violin from mixture.

:param input_data: Audio signal to separate.
:param input_sr: sampling rate of the input array of data (if any). This variable is only
relevant if the input is an array of data instead of a filepath.
:param normalize_input: Normalize the input audio signal.
:param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
:return: Singing voice and violin signals.
"""
@@ -124,20 +129,22 @@
raise FileNotFoundError("Target audio not found.")
audio, input_sr = torchaudio.load(input_data)
elif isinstance(input_data, np.ndarray):
input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device)
audio = torch.from_numpy(input_data).to(torch.float32).to(self.device)
elif isinstance(input_data, torch.Tensor):
input_data = input_data.to(torch.float32).to(self.device)
audio = input_data.to(torch.float32).to(self.device)
else:
raise ValueError("Input must be path to audio signal or an audio array")

if len(input_data.shape) == 1:
input_data = input_data.unsqueeze(0)

if len(input_data.shape) == 3:
if input_data.shape[0] != 1:
raise ValueError("Batching is not supported. Please provide a single audio signal.")
input_data = input_data.squeeze(0)


if len(audio.shape) == 1:
audio = audio.unsqueeze(0)  # Add a channel dimension for 1-D mono input

if len(audio.shape) == 3:
if audio.shape[0] != 1:
raise ValueError(
"Batching is not supported. Please provide a single audio signal."
)
audio = audio.squeeze(0) # Remove batch size 1

# resample audio
if input_sr != self.sample_rate:
logger.warning(
@@ -146,38 +153,67 @@
)
audio = torchaudio.transforms.Resample(
orig_freq=input_sr, new_freq=self.sample_rate
)(input_data)
)(audio)

# downmix to mono
if audio.shape[0] == 2:
audio = audio.mean(dim=0, keepdim=True)
logger.info(
"Downmixing to mono... your audio is stereo, "
"and the model is trained on mono audio."
)

# audio has shape B, 1, N
if normalize_input:
audio = audio / audio.max()
initial_length = audio.shape[-1]
audio = audio.reshape(-1)
predictions = []
pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size)
pad_length = (
self.chunk_size - (audio.shape[-1] % self.chunk_size)
) % self.chunk_size
audio = torch.nn.functional.pad(audio, (0, pad_length))

for i in range(0, audio.shape[-1], self.chunk_size):
audio_chunk = audio[i : i + self.chunk_size].reshape(
1, 1, -1
) # TODO Batching
predictions.append(self.forward(audio_chunk))
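# Derive an even chunk size so the padded signal divides into whole, near-equal chunks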
chunk_size = audio.shape[-1] // (
(audio.shape[-1] + self.chunk_size - 1) // self.chunk_size
)
hop_size = int(chunk_size * (1 - self.overlap))
num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1

window = torch.hann_window(chunk_size)
out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time)
weight_sum = torch.zeros(
audio.shape[-1]
) # Weight accumulation for normalization

result = torch.cat(predictions, dim=-1)
result = result[:, :, :-pad_length]
# Process chunks
for i in range(num_chunks):
start = i * hop_size
end = start + chunk_size

# Extract chunk (reshape for model input)
audio_chunk = audio[start:end].reshape(1, 1, -1)

# Apply model separation (assumes 2-channel output)
separated_chunk = self.forward(audio_chunk).reshape(
2, -1
) # (2, chunk_size)

# Apply windowing
separated_chunk *= window # Smooth transition

# Overlap-Add to output
out[:, start:end] += separated_chunk
weight_sum[start:end] += window # Accumulate weights

out /= weight_sum.unsqueeze(0).clamp(min=1e-8) # Avoid division by zero
out = out[..., :initial_length].unsqueeze(0) # (1, 2, N)

vocal_separation = torchaudio.transforms.Resample(
orig_freq=self.sample_rate, new_freq=input_sr
)(result[:, 0, :])
)(out[:, 0, :])
violin_separation = torchaudio.transforms.Resample(
orig_freq=self.sample_rate, new_freq=input_sr
)(result[:, 1, :])
)(out[:, 1, :])

vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1)
violin_separation = violin_separation.detach().cpu().numpy().reshape(-1)
return (vocal_separation, violin_separation)
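An end-to-end sketch of the revised overlap-add inference; the registry key below is illustrative (check `compiam/data.py` for the exact entry), and `mix.wav`, `soundfile`, and the 44.1 kHz output rate are assumptions:

```python
import soundfile as sf

import compiam

model = compiam.load_model("separation:mixer-model")  # key name illustrative

# separate() now peak-normalizes by default and stitches 25%-overlapping,
# Hann-windowed chunks back together before resampling to the input rate.
vocals, violin = model.separate("mix.wav", normalize_input=True)
sf.write("vocals.wav", vocals, 44100)  # assumes a 44.1 kHz input file
sf.write("violin.wav", violin, 44100)
```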
24 changes: 12 additions & 12 deletions compiam/separation/music_source_separation/mixer_model/models.py
@@ -7,18 +7,18 @@
class ConvTDFNet(nn.Module):
def __init__(
self,
hop_length,
num_blocks,
dim_t,
n_fft,
dim_c,
dim_f,
g,
k,
l,
bn,
bias,
scale,
hop_length=558,
dim_t=256,
n_fft=6144,
dim_c=2,
dim_f=2048,
num_blocks=11,
g=32,
k=3,
l=3,
bn=4,
bias=False,
scale=2,
):
super(ConvTDFNet, self).__init__()
self.hop_length = hop_length
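With every hyperparameter now defaulted, a minimal construction sketch (the values in the comment are the new defaults from the diff above):

```python
from compiam.separation.music_source_separation.mixer_model.models import ConvTDFNet

# A bare call is now equivalent to spelling out hop_length=558, dim_t=256,
# n_fft=6144, dim_c=2, dim_f=2048, num_blocks=11, g=32, k=3, l=3, bn=4,
# bias=False, scale=2.
net = ConvTDFNet()
```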
3 changes: 3 additions & 0 deletions compiam/separation/singing_voice_extraction/__init__.py
@@ -7,6 +7,9 @@
from compiam.separation.singing_voice_extraction.cold_diff_sep import (
ColdDiffSep,
)
from compiam.separation.singing_voice_extraction.convtdf_vocal_finetune import (
ConvTDFVocalFineTune,
)


# Show user the available tools