diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index fcd8bf8c849..112e256ee07 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -172,7 +172,8 @@ jobs:
           gensim \
           opt-einsum \
           nltk \
-          fvcore
+          fvcore \
+          scikit-optimize 
         kill $KA 
         cd src/main/python
         python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 8471cc7c356..2fee7cbf5a3 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -45,11 +45,6 @@ def __init__(
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
-        # if not self.load_data_from_file:
-        #     self.metadata[file] = self.modality_type.create_metadata(
-        #         30, 10, 100, 100, 3
-        #     )
-        # else:
         cap = cv2.VideoCapture(file)
 
         if not cap.isOpened():
@@ -71,13 +66,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             self.fps, length, width, height, num_channels
         )
 
-        num_frames = (length + frame_interval - 1) // frame_interval
-
-        stacked_frames = np.zeros(
-            (num_frames, height, width, num_channels), dtype=self._data_type
-        )
-
-        frame_idx = 0
+        frames = []
         idx = 0
         while cap.isOpened():
             ret, frame = cap.read()
@@ -87,11 +76,15 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             if idx % frame_interval == 0:
                 frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 frame = frame.astype(self._data_type) / 255.0
-                stacked_frames[frame_idx] = frame
-                frame_idx += 1
+                frames.append(frame)
             idx += 1
 
-        if frame_idx < num_frames:
-            stacked_frames = stacked_frames[:frame_idx]
-
-        self.data.append(stacked_frames)
+        if frames:
+            stacked_frames = np.stack(frames)
+        else:
+            # np.stack raises ValueError on an empty list; keep the empty
+            # (0, height, width, channels) result the preallocated version gave.
+            stacked_frames = np.zeros(
+                (0, height, width, num_channels), dtype=self._data_type
+            )
+        self.data.append(stacked_frames)
diff --git a/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py b/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py
index 2a5f8262834..1605b7b87d9 100644
--- a/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py
+++ b/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py
@@ -19,8 +19,9 @@
 #
 # -------------------------------------------------------------
 from typing import Dict, List, Tuple, Any, Optional
-import numpy as np
-from sklearn.model_selection import ParameterGrid
+from skopt import gp_minimize
+from skopt.space import Real, Integer, Categorical
+from skopt.utils import use_named_args
 import json
 import logging
 from dataclasses import dataclass
@@ -28,7 +29,6 @@
 import copy
 
 from systemds.scuro.modality.modality import Modality
-from systemds.scuro.drsearch.task import Task
 
 
 @dataclass
@@ -163,18 +163,74 @@ def visit_node(node_id):
         start_time = time.time()
         rep_name = "_".join([rep.__name__ for rep in reps])
 
-        param_grid = list(ParameterGrid(hyperparams))
-        if max_evals and len(param_grid) > max_evals:
-            np.random.shuffle(param_grid)
-            param_grid = param_grid[:max_evals]
+        def is_number(value):
+            # bool is a subclass of int; flags must not become ranges.
+            return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+        def is_integer(value):
+            return isinstance(value, int) and not isinstance(value, bool)
+
+        search_space = []
+        param_names = []
+        for param_name, param_values in hyperparams.items():
+            param_names.append(param_name)
+            is_range = isinstance(param_values, tuple) and len(param_values) == 2
+            if isinstance(param_values, list) or is_range:
+                candidates = list(param_values)
+                if all(is_number(v) for v in candidates):
+                    # Numeric values define a [min, max] range, not a grid.
+                    low, high = min(candidates), max(candidates)
+                    if low == high:
+                        # skopt's Integer/Real require low < high, so a single
+                        # distinct value is modelled as a fixed Categorical.
+                        dimension = Categorical([low], name=param_name)
+                    elif all(is_integer(v) for v in candidates):
+                        dimension = Integer(low, high, name=param_name)
+                    else:
+                        dimension = Real(low, high, name=param_name)
+                else:
+                    dimension = Categorical(candidates, name=param_name)
+            else:
+                # Scalars and non-(low, high) tuples are fixed settings.
+                dimension = Categorical([param_values], name=param_name)
+            search_space.append(dimension)
+
+        n_calls = max_evals if max_evals else 50
 
         all_results = []
-        for params in param_grid:
+
+        @use_named_args(search_space)
+        def objective(**params):
             result = self.evaluate_dag_config(
                 dag, params, node_order, modality_ids, task
             )
             all_results.append(result)
 
+            score = result[1].average_scores[self.scoring_metric]
+            # gp_minimize minimizes, so maximized metrics are negated.
+            return -score if self.maximize_metric else score
+
+        has_tunable_dimension = any(
+            not isinstance(dim, Categorical) or len(dim.categories) > 1
+            for dim in search_space
+        )
+        if has_tunable_dimension:
+            result = gp_minimize(
+                objective,
+                search_space,
+                n_calls=n_calls,
+                random_state=42,
+                verbose=self.debug,
+                # Without x0, gp_minimize needs at least one initial point
+                # (n_calls == 1 would otherwise yield 0).
+                n_initial_points=max(1, min(10, n_calls // 2)),
+            )
+        else:
+            # No hyperparameters, or only fixed values: evaluate that single
+            # configuration once instead of handing skopt an empty/constant space.
+            result = None
+            objective([dim.categories[0] for dim in search_space])
+
         if self.maximize_metric:
             best_params, best_score = max(
                 all_results, key=lambda x: x[1].average_scores[self.scoring_metric]
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index f7b7394e0fd..e4ed85cce38 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -156,26 +156,28 @@ def apply_representation(self, representation):
                 if current_length < target_length:
                     padding_needed = target_length - current_length
                     if pad_dim_one:
-                        padding = np.zeros((embeddings.shape[0], padding_needed))
-                        padded_embeddings.append(
-                            np.concatenate((embeddings, padding), axis=1)
+                        padded = np.pad(
+                            embeddings,
+                            ((0, 0), (0, padding_needed)),
+                            mode="constant",
+                            constant_values=0,
                         )
+                        padded_embeddings.append(padded)
                     else:
                         if len(embeddings.shape) == 1:
-                            padded = np.zeros(
-                                embeddings.shape[0] + padding_needed,
-                                dtype=embeddings.dtype,
+                            padded = np.pad(
+                                embeddings,
+                                (0, padding_needed),
+                                mode="constant",
+                                constant_values=0,
                             )
-                            padded[: embeddings.shape[0]] = embeddings
                         else:
-                            padded = np.zeros(
-                                (
-                                    embeddings.shape[0] + padding_needed,
-                                    embeddings.shape[1],
-                                ),
-                                dtype=embeddings.dtype,
+                            padded = np.pad(
+                                embeddings,
+                                ((0, padding_needed), (0, 0)),
+                                mode="constant",
+                                constant_values=0,
                             )
-                            padded[: embeddings.shape[0], :] = embeddings
                         padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index 2b338d30ee6..9d1d82a6be8 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -32,7 +32,7 @@
 @register_representation(ModalityType.TEXT)
 class BoW(UnimodalRepresentation):
     def __init__(self, ngram_range=2, min_df=2, output_file=None):
-        parameters = {"ngram_range": [ngram_range], "min_df": [min_df]}
+        parameters = {"ngram_range": [2, 3, 5, 10], "min_df": [1, 2, 4, 8]}
         super().__init__("BoW", ModalityType.EMBEDDING, parameters)
         self.ngram_range = int(ngram_range)
         self.min_df = int(min_df)
diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
index 1d458aeb7d0..504681f2537 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -34,7 +34,7 @@
 from systemds.scuro.utils.torch_dataset import CustomDataset
 
 
-@register_representation(ModalityType.VIDEO)
+@register_representation([ModalityType.VIDEO, ModalityType.IMAGE])
 class CLIPVisual(UnimodalRepresentation):
     def __init__(self, output_file=None):
         parameters = {}
@@ -46,8 +46,10 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality, self)
-        self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        self.data_type = torch.float32
         if next(self.model.parameters()).dtype != self.data_type:
             self.model = self.model.to(self.data_type)
 
@@ -60,14 +62,20 @@ def transform(self, modality):
         return transformed_modality
 
     def create_visual_embeddings(self, modality):
-        tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
+
+        clip_transform = transforms.Compose(
+            [
+                transforms.ToPILImage(),
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                transforms.ConvertImageDtype(dtype=self.data_type),
+            ]
+        )
         dataset = CustomDataset(
-            modality.data,
-            self.data_type,
-            get_device(),
-            (modality.metadata[0]["width"], modality.metadata[0]["height"]),
-            tf=tf,
+            modality.data, self.data_type, get_device(), tf=clip_transform
         )
+
         embeddings = {}
         for instance in torch.utils.data.DataLoader(dataset):
             id = int(instance["id"][0])
@@ -94,7 +102,7 @@ def create_visual_embeddings(self, modality):
                     .cpu()
                     .float()
                     .numpy()
-                    .astype(modality.data_type)
+                    .astype(np.float32)
                 )
 
             embeddings[id] = np.array(embeddings[id])
@@ -113,7 +121,9 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality, self)
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
 
         embeddings = self.create_text_embeddings(modality.data, self.model)
 
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index addccadade7..7ac0200819c 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -91,8 +91,8 @@ def transform_with_training(self, modalities: List[Modality], task):
         transformed_data = np.zeros(
             (len(modalities[0].data), transformed_train.shape[1])
         )
-        transformed_data[task.train_indices] = transformed_train
-        transformed_data[task.test_indices] = transformed_other
+        transformed_data[fusion_train_indices] = transformed_train
+        transformed_data[all_other_indices] = transformed_other
 
         return transformed_data
 
diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
index efc31272743..58b878820e6 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -188,11 +188,11 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
             criterion = nn.CrossEntropyLoss()
         optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
 
-        X_tensor = torch.FloatTensor(X).to(device)
+        X_tensor = torch.FloatTensor(X)
         if self.is_multilabel:
-            y_tensor = torch.FloatTensor(y).to(device)
+            y_tensor = torch.FloatTensor(y)
         else:
-            y_tensor = torch.LongTensor(y).to(device)
+            y_tensor = torch.LongTensor(y)
 
         dataset = TensorDataset(X_tensor, y_tensor)
         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
@@ -201,6 +201,8 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
         for epoch in range(self.epochs):
             total_loss = 0
             for batch_X, batch_y in dataloader:
+                batch_X = batch_X.to(device)
+                batch_y = batch_y.to(device)
                 optimizer.zero_grad()
 
                 features, predictions = self.model(batch_X)
@@ -230,6 +232,7 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
                 TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False
             )
             for (batch_X,) in inference_dataloader:
+                batch_X = batch_X.to(device)
                 features, _ = self.model(batch_X)
                 all_features.append(features.cpu())
 
@@ -244,7 +247,7 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
         device = get_device()
         self.model.to(device)
 
-        X_tensor = torch.FloatTensor(X).to(device)
+        X_tensor = torch.FloatTensor(X)
         all_features = []
         self.model.eval()
         with torch.no_grad():
@@ -252,6 +255,7 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
                 TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False
             )
             for (batch_X,) in inference_dataloader:
+                batch_X = batch_X.to(device)
                 features, _ = self.model(batch_X)
                 all_features.append(features.cpu())
 
diff --git a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
index 3f86610550e..a295eaa267a 100644
--- a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
+++ b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
@@ -162,12 +162,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
         )
 
-        for modality_name in inputs:
-            inputs[modality_name] = inputs[modality_name].to(device)
-
         if self.is_multilabel:
-            labels_tensor = torch.from_numpy(y).float().to(device)
+            labels_tensor = torch.from_numpy(y).float()
         else:
-            labels_tensor = torch.from_numpy(y).long().to(device)
+            labels_tensor = torch.from_numpy(y).long()
 
         dataset_inputs = []
         for i in range(len(y)):
@@ -199,9 +196,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
                 for modality_name in batch_inputs:
                     batch_inputs[modality_name] = torch.stack(
                         batch_inputs[modality_name]
-                    )
+                    ).to(device)
 
-                batch_labels = torch.stack(batch_labels)
+                batch_labels = torch.stack(batch_labels).to(device)
 
                 optimizer.zero_grad()
 
@@ -250,7 +247,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
 
                 batch_inputs = {}
                 for modality_name, tensor in inputs.items():
-                    batch_inputs[modality_name] = tensor[batch_start:batch_end]
+                    batch_inputs[modality_name] = tensor[batch_start:batch_end].to(
+                        device
+                    )
 
                 encoder_output = self.encoder(batch_inputs)
                 all_features.append(encoder_output["fused"].cpu())
@@ -266,9 +265,6 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
         device = get_device()
         self.encoder.to(device)
 
-        for modality_name in inputs:
-            inputs[modality_name] = inputs[modality_name].to(device)
-
         self.encoder.eval()
         all_features = []
 
@@ -281,7 +277,9 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
 
                 batch_inputs = {}
                 for modality_name, tensor in inputs.items():
-                    batch_inputs[modality_name] = tensor[batch_start:batch_end]
+                    batch_inputs[modality_name] = tensor[batch_start:batch_end].to(
+                        device
+                    )
 
                 encoder_output = self.encoder(batch_inputs)
                 all_features.append(encoder_output["fused"].cpu())
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 55e7d369481..50fe084b9f5 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -163,7 +163,7 @@ def hook(
                     .cpu()
                     .float()
                     .numpy()
-                    .astype(modality.data_type)
+                    .astype(np.float32)
                 )
 
             embeddings[video_id] = np.array(embeddings[video_id])
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index c82961949fe..95fa6c111f7 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -32,7 +32,7 @@
 @register_representation(ModalityType.TEXT)
 class TfIdf(UnimodalRepresentation):
     def __init__(self, min_df=2, output_file=None):
-        parameters = {"min_df": [min_df]}
+        parameters = {"min_df": [min_df, 4, 8]}
         super().__init__("TF-IDF", ModalityType.EMBEDDING, parameters)
         self.min_df = int(min_df)
         self.output_file = output_file
diff --git a/src/main/python/systemds/scuro/representations/timeseries_representations.py b/src/main/python/systemds/scuro/representations/timeseries_representations.py
index 631294809ee..3270992a97c 100644
--- a/src/main/python/systemds/scuro/representations/timeseries_representations.py
+++ b/src/main/python/systemds/scuro/representations/timeseries_representations.py
@@ -184,7 +184,7 @@ def compute_feature(self, signal):
 @register_representation([ModalityType.TIMESERIES])
 class SpectralCentroid(TimeSeriesRepresentation):
     def __init__(self, fs=1.0):
-        super().__init__("SpectralCentroid", parameters={"fs": [1.0]})
+        super().__init__("SpectralCentroid", parameters={"fs": [0.5, 1.0, 2.0]})
         self.fs = fs
 
     def compute_feature(self, signal):
@@ -199,7 +199,8 @@ def compute_feature(self, signal):
 class BandpowerFFT(TimeSeriesRepresentation):
     def __init__(self, fs=1.0, f1=0.0, f2=0.5):
         super().__init__(
-            "BandpowerFFT", parameters={"fs": [1.0], "f1": [0.0], "f2": [0.5]}
+            "BandpowerFFT",
+            parameters={"fs": [0.5, 1.0], "f1": [0.0, 1.0], "f2": [0.5, 1.0]},
         )
         self.fs = fs
         self.f1 = f1
diff --git a/src/main/python/systemds/scuro/representations/vgg.py b/src/main/python/systemds/scuro/representations/vgg.py
index 4f4324a372d..8bc4a15b951 100644
--- a/src/main/python/systemds/scuro/representations/vgg.py
+++ b/src/main/python/systemds/scuro/representations/vgg.py
@@ -53,14 +53,14 @@ def forward(self, input_: torch.Tensor) -> torch.Tensor:
         self.model.fc = Identity()
 
     def _get_parameters(self):
-        parameters = {"layer_name": []}
-
-        parameters["layer_name"] = [
-            "features.35",
-            "classifier.0",
-            "classifier.3",
-            "classifier.6",
-        ]
+        parameters = {
+            "layer_name": [
+                "features.35",
+                "classifier.0",
+                "classifier.3",
+                "classifier.6",
+            ]
+        }
 
         return parameters
 
@@ -120,7 +120,7 @@ def hook(
                     .cpu()
                     .float()
                     .numpy()
-                    .astype(modality.data_type)
+                    .astype(np.float32)
                 )
 
             embeddings[video_id] = np.array(embeddings[video_id])
diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py
index adb92ceb530..f40b28ea871 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -171,7 +171,7 @@ def window_aggregate_nested_level(self, instance, new_length):
 class StaticWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("StaticWindow", aggregation_function)
-        self.parameters["num_windows"] = [num_windows]
+        self.parameters["num_windows"] = [10, num_windows]
         self.num_windows = int(num_windows)
 
     def execute(self, modality):
@@ -202,7 +202,7 @@ def execute(self, modality):
 class DynamicWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("DynamicWindow", aggregation_function)
-        self.parameters["num_windows"] = [num_windows]
+        self.parameters["num_windows"] = [10, num_windows]
         self.num_windows = int(num_windows)
 
     def execute(self, modality):
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index 837811935cd..737d72b8b0c 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -43,8 +43,8 @@ def get_embedding(sentence, model):
 class W2V(UnimodalRepresentation):
     def __init__(self, vector_size=150, min_count=1, output_file=None):
         parameters = {
-            "vector_size": [vector_size],
-            "min_count": [min_count],
+            "vector_size": [50, 100, 150, 200],
+            "min_count": [1, 2, 4, 8],
         }
         super().__init__("Word2Vec", ModalityType.EMBEDDING, parameters)
         self.vector_size = vector_size
diff --git a/src/main/python/tests/README.md b/src/main/python/tests/README.md
index 24e0f018634..bea078ca28d 100644
--- a/src/main/python/tests/README.md
+++ b/src/main/python/tests/README.md
@@ -46,3 +46,5 @@ To execute the Federated Tests, use:
 Federated experiments are a little different from the rest, since they require some setup in form of federated workers.
 
 See more details in the [script](federated/runFedTest.sh)
+
+Pretrained BYOL-A weights (AudioNTT2020-BYOLA-64x96d512): https://github.com/nttcslab/byol-a/blob/master/pretrained_weights/AudioNTT2020-BYOLA-64x96d512.pth
diff --git a/src/main/python/tests/scuro/test_hp_tuner.py b/src/main/python/tests/scuro/test_hp_tuner.py
index 802f737b0a5..8484a352e44 100644
--- a/src/main/python/tests/scuro/test_hp_tuner.py
+++ b/src/main/python/tests/scuro/test_hp_tuner.py
@@ -239,10 +239,11 @@ def run_hp_for_modality(
                     fusion_results,
                     k=1,
                     optimize_unimodal=tune_unimodal_representations,
+                    max_eval_per_rep=20,
                 )
 
             else:
-                hp.tune_unimodal_representations()
+                hp.tune_unimodal_representations(max_eval_per_rep=10)
 
             assert len(hp.results) == len(self.tasks)
             assert len(hp.results[self.tasks[0].model.name]) == 2