1 change: 1 addition & 0 deletions ACKNOWLEDGEMENTS
@@ -18,6 +18,7 @@ Jom Kuriakose
Shrey Dutta
Shubham Lohiya
Swarada Bharadwaj
Serafin Schweinitz

Project Musical AI - PID2019-111403GB-I00/AEI/10.13039/501100011033 funded by the Spanish
Ministerio de Ciencia, Innovación y Universidades (MCIU) and the Agencia Estatal de
2 changes: 1 addition & 1 deletion compiam/__init__.py
@@ -71,7 +71,7 @@ def load_dataset(dataset_name, data_home=None, version="default"):
"""
if dataset_name not in datasets_list:
raise ValueError("Invalid dataset {}".format(dataset_name))
dataloader = mirdata.initialize(
dataset_name=dataset_name, data_home=data_home, version=version
)
dataloader.download(["index"]) # Download index file
18 changes: 18 additions & 0 deletions compiam/data.py
@@ -234,6 +234,24 @@
},
},
},
"separation:convtdf-vocal-finetune": {
"module_name": "compiam.separation.singing_voice_extraction.convtdf_vocal_finetune",
"class_name": "ConvTDFVocalFineTune",
"default_version": "v1",
"kwargs": {
"v1": {
"model_path": os.path.join(
"models",
"separation",
"convtdf_vocal_finetune",
"vocals",
"checkpoint_finetuned.pt",
),
"download_link": "https://zenodo.org/records/15121572/files/convtdf_vocal_finetune.zip?download=1",
"download_checksum": "170c7a25cb06911f2e4a9452ce943aed",
},
},
},
}


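A minimal usage sketch of the new registry entry, assuming key resolution and weight download behave as for the models already registered in this file:

```python
import compiam

# Key added in this diff; on first load the checkpoint is fetched from the
# Zenodo link declared under "kwargs" above (the checksum field suggests the
# download is verified).
model = compiam.load_model("separation:convtdf-vocal-finetune")
```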
12 changes: 6 additions & 6 deletions compiam/dunya/__init__.py
@@ -25,9 +25,7 @@ def __init__(self, tradition, token):
dunya.set_token(self.token)

if tradition not in ["carnatic", "hindustani"]:
raise ValueError(
"Please choose a valid tradition: carnatic or hindustani"
)
raise ValueError("Please choose a valid tradition: carnatic or hindustani")
self.tradition = carnatic if tradition == "carnatic" else hindustani

# Functions from the compmusic API are added as a method in the Corpora class
@@ -36,10 +34,12 @@ def __init__(self, tradition, token):
if callable(func):
setattr(self, name, func)

logger.warning("""
logger.warning(
"""
Note that a part of the collection is under restricted access.
To access the full collection please request permission at https://dunya.compmusic.upf.edu/user/profile/
""")
"""
)

def get_collection(self, recording_detail=False):
"""Get the documents (recordings) in a collection.
@@ -54,7 +54,7 @@
+ "Please note that it might take a few moments..."
)
return self.tradition.get_recordings(recording_detail)

@staticmethod
def list_available_types(recording_id):
"""Get the available source filetypes for a Musicbrainz recording.
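For context, a hedged sketch of how this constructor is typically reached (the token string is a placeholder, not a real credential):

```python
from compiam.dunya import Corpora

# Request an access token at https://dunya.compmusic.upf.edu/user/profile/
corpora = Corpora("carnatic", token="<your-dunya-token>")
recordings = corpora.get_collection()  # may take a few moments
```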
5 changes: 3 additions & 2 deletions compiam/io.py
@@ -94,11 +94,11 @@ def write_scalar_txt(data, output_path):

def resolve_dottedname(dotted_name):
"""Resolve a dotted name to an actual object, similar to zope.dottedname.resolve

:param dotted_name: a dotted name
:returns: the object the dotted name refers to
"""
module_name, _, attribute_name = dotted_name.rpartition('.')
module_name, _, attribute_name = dotted_name.rpartition(".")
if not module_name:
raise ImportError(f"Invalid dotted name: '{dotted_name}'")
module = importlib.import_module(module_name)
@@ -111,6 +111,7 @@ def load_yaml(path):
:param path: input file
:returns: loaded yaml information
"""

def constructor_dottedname(loader, node):
value = loader.construct_scalar(node)
return resolve_dottedname(value)
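A quick illustration of `resolve_dottedname`, using a standard-library name purely as an example:

```python
import os.path

from compiam.io import resolve_dottedname

# Resolves the dotted name to the actual object, as the nested
# constructor_dottedname helper does for names found in YAML files.
join = resolve_dottedname("os.path.join")
assert join is os.path.join
```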
2 changes: 1 addition & 1 deletion compiam/melody/pattern/sancara_search/__init__.py
@@ -185,7 +185,7 @@ def load_model(self, model_path, conf_path, spec_path):
try:
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device),
strict=False
strict=False,
)
except:
self.model.load_state_dict(
6 changes: 6 additions & 0 deletions compiam/melody/pitch_extraction/ftanet_carnatic/__init__.py
@@ -270,6 +270,7 @@ def predict(
hop_size=80,
batch_size=5,
out_step=None,
amplify_input=1.0,
gpu="-1",
):
"""Extract melody from input_data.
@@ -283,6 +284,7 @@
(defaulted to 5, increase if enough computational power, reduce if
needed).
:param out_step: particular time-step duration if needed at output
:param amplify_input: for low-volume inputs, we have found that amplifying the signal (e.g. x10, x50) may yield better voicing detection
:param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
:returns: a 2-D list with time-stamps and pitch values per timestamp.
"""
@@ -323,6 +325,10 @@
xlist = []
timestamps = []

# Peak-normalize, then apply the requested amplification
audio = audio / audio.max()
audio = audio * amplify_input

audio_len = len(audio)
batch_min = self.sample_rate * 60 * batch_size
freqs = []
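A hedged usage sketch of the new `amplify_input` parameter, assuming the model is loaded through the existing `melody:ftanet-carnatic` registry key and `my_recording.wav` stands in for a real file:

```python
import compiam

ftanet = compiam.load_model("melody:ftanet-carnatic")

# For quiet recordings, boost the peak-normalized input before inference;
# x10 and x50 are the example factors suggested in the docstring above.
pitch_track = ftanet.predict("my_recording.wav", amplify_input=10.0)
```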
@@ -83,7 +83,9 @@ def load_model(self, model_path):
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
self.model.load_state_dict(torch.load(model_path, weights_only=True, map_location=self.device))
self.model.load_state_dict(
torch.load(model_path, weights_only=True, map_location=self.device)
)
except:
self.model.load_state_dict(torch.load(model_path, map_location=self.device))
self.model_path = model_path
6 changes: 4 additions & 2 deletions compiam/melody/raga_recognition/deepsrgm/__init__.py
@@ -124,7 +124,9 @@ def load_model(self, model_path, rnn="lstm"):

self.model_path = model_path
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
weights = torch.load(
model_path, weights_only=True, map_location=self.device
)
except:
weights = torch.load(model_path, map_location=self.device)
new_weights = weights.copy()
@@ -168,7 +170,7 @@ def load_raga_dataset(self, data_home=None, download=False):
"compmusic_raga", data_home=data_home, version="default"
)
if download:
self.dataset.download()  # Downloads index and features
logger.warning(
f"""
The features are downloaded, but the audio of this dataset is private.
7 changes: 5 additions & 2 deletions compiam/separation/README.md
@@ -3,9 +3,12 @@
| **Tool** | **Task** | **Paper** |
|---------------------------|----------------------------------|-----------|
| ColdDiffSep | Singing voice extraction | [1] |
| MDXNet w/ mixer model | Music source separation | [2] |
| ConvTDF Vocal Fine-tuned | Singing voice extraction | [2] |
| MDXNet w/ mixer model | Music source separation | [3] |


[1] G. Plaja-Roglans, M. Miron, A. Shankar, and X. Serra, "Carnatic Singing Voice Separation using Cold Diffusion on Training Data with Bleeding", in International Society for Music Information Retrieval Conference (ISMIR 23), 2023.

[2] Work under review.
[2] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) at ICASSP, 2025.

[3] A. Shankar, S. Schweinitz, G. Plaja-Roglans, X. Serra, and M. Rocamora, “Disentangling overlapping sources: Improving vocal and violin source separation in carnatic music”, in Workshop on Indian Music Analysis and Generative Applications (WIMAGA) at ICASSP, 2025.
1 change: 1 addition & 0 deletions compiam/separation/__init__.py
@@ -12,6 +12,7 @@

### IMPORT HERE THE CONSIDERED TASKS
from compiam.separation import singing_voice_extraction
from compiam.separation import music_source_separation


# Show user the available tasks
90 changes: 63 additions & 27 deletions compiam/separation/music_source_separation/mixer_model/__init__.py
@@ -69,6 +69,7 @@ def __init__(
self.load_model(self.model_path)

self.chunk_size = self.model.chunk_size
self.overlap = 0.25

def forward(self, x):
"""Forward pass of the mixer model"""
@@ -86,7 +87,9 @@ def load_model(self, model_path):
## Ensuring we can load the model for different torch versions
## -- (weights only might be deprecated)
try:
weights = torch.load(model_path, weights_only=True, map_location=self.device)
weights = torch.load(
model_path, weights_only=True, map_location=self.device
)
except:
weights = torch.load(model_path, map_location=self.device)
self.model.load_state_dict(weights)
@@ -97,13 +100,15 @@ def separate(
self,
input_data,
input_sr=44100,
normalize_input=True,
gpu="-1",
):
"""Separate singing voice and violin from mixture.

:param input_data: Audio signal to separate.
:param input_sr: sampling rate of the input array of data (if any). This variable is only
relevant if the input is an array of data instead of a filepath.
:param normalize_input: Normalize the input audio signal.
:param gpu: Id of the available GPU to use (-1 by default, to run on CPU), use string: '0', '1', etc.
:return: Singing voice and violin signals.
"""
@@ -124,20 +129,22 @@
raise FileNotFoundError("Target audio not found.")
audio, input_sr = torchaudio.load(input_data)
elif isinstance(input_data, np.ndarray):
input_data = torch.from_numpy(input_data).to(torch.float32).to(self.device)
audio = torch.from_numpy(input_data).to(torch.float32).to(self.device)
elif isinstance(input_data, torch.Tensor):
input_data = input_data.to(torch.float32).to(self.device)
audio = input_data.to(torch.float32).to(self.device)
else:
raise ValueError("Input must be path to audio signal or an audio array")

if len(input_data.shape) == 1:
input_data = input_data.unsqueeze(0)

if len(input_data.shape) == 3:
if input_data.shape[0] != 1:
raise ValueError("Batching is not supported. Please provide a single audio signal.")
input_data = input_data.squeeze(0)


if len(audio.shape) == 1:
audio = audio.unsqueeze(0)  # Add a channel dimension for 1-D mono input

if len(audio.shape) == 3:
if audio.shape[0] != 1:
raise ValueError(
"Batching is not supported. Please provide a single audio signal."
)
audio = audio.squeeze(0) # Remove batch size 1

# resample audio
if input_sr != self.sample_rate:
logger.warning(
@@ -146,38 +153,67 @@
)
audio = torchaudio.transforms.Resample(
orig_freq=input_sr, new_freq=self.sample_rate
)(input_data)
)(audio)

# downmix to mono
if audio.shape[0] == 2:
audio = audio.mean(dim=0, keepdim=True)
logger.info(
"Downmixing to mono... your audio is stereo, "
"and the model is trained on mono audio."
)

# audio has shape B, 1, N
if normalize_input:
audio = audio / audio.max()
initial_length = audio.shape[-1]
audio = audio.reshape(-1)
predictions = []
pad_length = self.chunk_size - (audio.shape[-1] % self.chunk_size)
pad_length = (
self.chunk_size - (audio.shape[-1] % self.chunk_size)
) % self.chunk_size
audio = torch.nn.functional.pad(audio, (0, pad_length))

for i in range(0, audio.shape[-1], self.chunk_size):
audio_chunk = audio[i : i + self.chunk_size].reshape(
1, 1, -1
) # TODO Batching
predictions.append(self.forward(audio_chunk))
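# Derive an even chunk size so the padded signal divides into whole, near-equal chunks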
chunk_size = audio.shape[-1] // (
(audio.shape[-1] + self.chunk_size - 1) // self.chunk_size
)
hop_size = int(chunk_size * (1 - self.overlap))
num_chunks = (audio.shape[-1] - chunk_size) // hop_size + 1

window = torch.hann_window(chunk_size)
out = torch.zeros((2, audio.shape[-1])) # (Channels=2, Time)
weight_sum = torch.zeros(
audio.shape[-1]
) # Weight accumulation for normalization

result = torch.cat(predictions, dim=-1)
result = result[:, :, :-pad_length]
# Process chunks
for i in range(num_chunks):
start = i * hop_size
end = start + chunk_size

# Extract chunk (reshape for model input)
audio_chunk = audio[start:end].reshape(1, 1, -1)

# Apply model separation (assumes 2-channel output)
separated_chunk = self.forward(audio_chunk).reshape(
2, -1
) # (2, chunk_size)

# Apply windowing
separated_chunk *= window # Smooth transition

# Overlap-Add to output
out[:, start:end] += separated_chunk
weight_sum[start:end] += window # Accumulate weights

out /= weight_sum.unsqueeze(0).clamp(min=1e-8) # Avoid division by zero
out = out[..., :initial_length].unsqueeze(0) # (1, 2, N)

vocal_separation = torchaudio.transforms.Resample(
orig_freq=self.sample_rate, new_freq=input_sr
)(result[:, 0, :])
)(out[:, 0, :])
violin_separation = torchaudio.transforms.Resample(
orig_freq=self.sample_rate, new_freq=input_sr
)(result[:, 1, :])
)(out[:, 1, :])

vocal_separation = vocal_separation.detach().cpu().numpy().reshape(-1)
violin_separation = violin_separation.detach().cpu().numpy().reshape(-1)
return (vocal_separation, violin_separation)
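An end-to-end sketch of the revised overlap-add inference; the registry key below is illustrative (check `compiam/data.py` for the exact entry), and `mix.wav`, `soundfile`, and the 44.1 kHz output rate are assumptions:

```python
import soundfile as sf

import compiam

model = compiam.load_model("separation:mixer-model")  # key name illustrative

# separate() now peak-normalizes by default and stitches 25%-overlapping,
# Hann-windowed chunks back together before resampling to the input rate.
vocals, violin = model.separate("mix.wav", normalize_input=True)
sf.write("vocals.wav", vocals, 44100)  # assumes a 44.1 kHz input file
sf.write("violin.wav", violin, 44100)
```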
24 changes: 12 additions & 12 deletions compiam/separation/music_source_separation/mixer_model/models.py
@@ -7,18 +7,18 @@
class ConvTDFNet(nn.Module):
def __init__(
self,
hop_length,
num_blocks,
dim_t,
n_fft,
dim_c,
dim_f,
g,
k,
l,
bn,
bias,
scale,
hop_length=558,
dim_t=256,
n_fft=6144,
dim_c=2,
dim_f=2048,
num_blocks=11,
g=32,
k=3,
l=3,
bn=4,
bias=False,
scale=2,
):
super(ConvTDFNet, self).__init__()
self.hop_length = hop_length
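With every hyperparameter now defaulted, a minimal construction sketch (the values in the comment are the new defaults from the diff above):

```python
from compiam.separation.music_source_separation.mixer_model.models import ConvTDFNet

# A bare call is now equivalent to spelling out hop_length=558, dim_t=256,
# n_fft=6144, dim_c=2, dim_f=2048, num_blocks=11, g=32, k=3, l=3, bn=4,
# bias=False, scale=2.
net = ConvTDFNet()
```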
3 changes: 3 additions & 0 deletions compiam/separation/singing_voice_extraction/__init__.py
@@ -7,6 +7,9 @@
from compiam.separation.singing_voice_extraction.cold_diff_sep import (
ColdDiffSep,
)
from compiam.separation.singing_voice_extraction.convtdf_vocal_finetune import (
ConvTDFVocalFineTune,
)


# Show user the available tools