apache · christinadionysio · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -172,7 +172,8 @@ jobs:
           gensim \
           opt-einsum \
           nltk \
-          fvcore
+          fvcore \
+          scikit-optimize 
         kill $KA 
         cd src/main/python
         python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -45,11 +45,6 @@ def __init__(
 
     def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
         self.file_sanity_check(file)
-        # if not self.load_data_from_file:
-        #     self.metadata[file] = self.modality_type.create_metadata(
-        #         30, 10, 100, 100, 3
-        #     )
-        # else:
         cap = cv2.VideoCapture(file)
 
         if not cap.isOpened():
@@ -71,13 +66,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             self.fps, length, width, height, num_channels
         )
 
-        num_frames = (length + frame_interval - 1) // frame_interval
-
-        stacked_frames = np.zeros(
-            (num_frames, height, width, num_channels), dtype=self._data_type
-        )
-
-        frame_idx = 0
+        frames = []
         idx = 0
         while cap.isOpened():
             ret, frame = cap.read()
@@ -87,11 +76,15 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
             if idx % frame_interval == 0:
                 frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 frame = frame.astype(self._data_type) / 255.0
-                stacked_frames[frame_idx] = frame
-                frame_idx += 1
+                frames.append(frame)
             idx += 1
 
-        if frame_idx < num_frames:
-            stacked_frames = stacked_frames[:frame_idx]
-
+        if frames:
+            stacked_frames = np.stack(frames)
+        else:
+            # np.stack raises on an empty list; keep the empty
+            # (0, height, width, channels) result the preallocated buffer gave.
+            stacked_frames = np.zeros(
+                (0, height, width, num_channels), dtype=self._data_type
+            )
         self.data.append(stacked_frames)
diff --git a/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py b/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py
@@ -19,16 +19,16 @@
 #
 # -------------------------------------------------------------
 from typing import Dict, List, Tuple, Any, Optional
-import numpy as np
-from sklearn.model_selection import ParameterGrid
+from skopt import gp_minimize
+from skopt.space import Real, Integer, Categorical
+from skopt.utils import use_named_args
 import json
 import logging
 from dataclasses import dataclass
 import time
 import copy
 
 from systemds.scuro.modality.modality import Modality
-from systemds.scuro.drsearch.task import Task
 
 
 @dataclass
@@ -163,18 +163,68 @@ def visit_node(node_id):
         start_time = time.time()
         rep_name = "_".join([rep.__name__ for rep in reps])
 
-        param_grid = list(ParameterGrid(hyperparams))
-        if max_evals and len(param_grid) > max_evals:
-            np.random.shuffle(param_grid)
-            param_grid = param_grid[:max_evals]
+        search_space = []
+        param_names = []
+        for param_name, param_values in hyperparams.items():
+            param_names.append(param_name)
+            if isinstance(param_values, list):
+                # skopt rejects Integer/Real dimensions whose bounds coincide,
+                # so a numeric list only becomes a range when it spans one.
+                is_numeric = all(isinstance(v, (int, float)) for v in param_values)
+                if is_numeric and min(param_values) < max(param_values):
+                    if all(isinstance(v, int) for v in param_values):
+                        search_space.append(
+                            Integer(
+                                min(param_values), max(param_values), name=param_name
+                            )
+                        )
+                    else:
+                        search_space.append(
+                            Real(min(param_values), max(param_values), name=param_name)
+                        )
+                else:
+                    search_space.append(Categorical(param_values, name=param_name))
+            elif isinstance(param_values, tuple) and len(param_values) == 2:
+                if isinstance(param_values[0], int) and isinstance(
+                    param_values[1], int
+                ):
+                    search_space.append(
+                        Integer(param_values[0], param_values[1], name=param_name)
+                    )
+                else:
+                    search_space.append(
+                        Real(param_values[0], param_values[1], name=param_name)
+                    )
+            else:
+                search_space.append(Categorical([param_values], name=param_name))
+
+        n_calls = max_evals if max_evals else 50
 
         all_results = []
-        for params in param_grid:
+
+        @use_named_args(search_space)
+        def objective(**params):
             result = self.evaluate_dag_config(
                 dag, params, node_order, modality_ids, task
             )
             all_results.append(result)
 
+            score = result[1].average_scores[self.scoring_metric]
+            if self.maximize_metric:
+                return -score
+            else:
+                return score
+
+        result = gp_minimize(
+            objective,
+            search_space,
+            n_calls=n_calls,
+            random_state=42,
+            verbose=self.debug,
+            # gp_minimize raises if n_initial_points is 0 (e.g. max_evals == 1).
+            n_initial_points=max(1, min(10, n_calls // 2)),
+        )
+
         if self.maximize_metric:
             best_params, best_score = max(
                 all_results, key=lambda x: x[1].average_scores[self.scoring_metric]

diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -156,26 +156,28 @@ def apply_representation(self, representation):
                 if current_length < target_length:
                     padding_needed = target_length - current_length
                     if pad_dim_one:
-                        padding = np.zeros((embeddings.shape[0], padding_needed))
-                        padded_embeddings.append(
-                            np.concatenate((embeddings, padding), axis=1)
+                        padded = np.pad(
+                            embeddings,
+                            ((0, 0), (0, padding_needed)),
+                            mode="constant",
+                            constant_values=0,
                         )
+                        padded_embeddings.append(padded)
                     else:
                         if len(embeddings.shape) == 1:
-                            padded = np.zeros(
-                                embeddings.shape[0] + padding_needed,
-                                dtype=embeddings.dtype,
+                            padded = np.pad(
+                                embeddings,
+                                (0, padding_needed),
+                                mode="constant",
+                                constant_values=0,
                             )
-                            padded[: embeddings.shape[0]] = embeddings
                         else:
-                            padded = np.zeros(
-                                (
-                                    embeddings.shape[0] + padding_needed,
-                                    embeddings.shape[1],
-                                ),
-                                dtype=embeddings.dtype,
+                            padded = np.pad(
+                                embeddings,
+                                ((0, padding_needed), (0, 0)),
+                                mode="constant",
+                                constant_values=0,
                             )
-                            padded[: embeddings.shape[0], :] = embeddings
                         padded_embeddings.append(padded)
                 else:
                     padded_embeddings.append(embeddings)

diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
@@ -32,7 +32,7 @@
 @register_representation(ModalityType.TEXT)
 class BoW(UnimodalRepresentation):
     def __init__(self, ngram_range=2, min_df=2, output_file=None):
-        parameters = {"ngram_range": [ngram_range], "min_df": [min_df]}
+        parameters = {"ngram_range": [2, 3, 5, 10], "min_df": [1, 2, 4, 8]}
         super().__init__("BoW", ModalityType.EMBEDDING, parameters)
         self.ngram_range = int(ngram_range)
         self.min_df = int(min_df)

diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py
@@ -34,7 +34,7 @@
 from systemds.scuro.utils.torch_dataset import CustomDataset
 
 
-@register_representation(ModalityType.VIDEO)
+@register_representation([ModalityType.VIDEO, ModalityType.IMAGE])
 class CLIPVisual(UnimodalRepresentation):
     def __init__(self, output_file=None):
         parameters = {}
@@ -46,8 +46,10 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality, self)
-        self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
+        self.data_type = torch.float32
         if next(self.model.parameters()).dtype != self.data_type:
             self.model = self.model.to(self.data_type)
 
@@ -60,14 +62,20 @@ def transform(self, modality):
         return transformed_modality
 
     def create_visual_embeddings(self, modality):
-        tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
+
+        clip_transform = transforms.Compose(
+            [
+                transforms.ToPILImage(),
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
+                transforms.ToTensor(),
+                transforms.ConvertImageDtype(dtype=self.data_type),
+            ]
+        )
         dataset = CustomDataset(
-            modality.data,
-            self.data_type,
-            get_device(),
-            (modality.metadata[0]["width"], modality.metadata[0]["height"]),
-            tf=tf,
+            modality.data, self.data_type, get_device(), tf=clip_transform
         )
+
         embeddings = {}
         for instance in torch.utils.data.DataLoader(dataset):
             id = int(instance["id"][0])
@@ -94,7 +102,7 @@ def create_visual_embeddings(self, modality):
                     .cpu()
                     .float()
                     .numpy()
-                    .astype(modality.data_type)
+                    .astype(np.float32)
                 )
 
             embeddings[id] = np.array(embeddings[id])
@@ -113,7 +121,9 @@ def __init__(self, output_file=None):
         self.output_file = output_file
 
     def transform(self, modality):
-        transformed_modality = TransformedModality(modality, self)
+        transformed_modality = TransformedModality(
+            modality, self, self.output_modality_type
+        )
 
         embeddings = self.create_text_embeddings(modality.data, self.model)
 

diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
@@ -91,8 +91,8 @@ def transform_with_training(self, modalities: List[Modality], task):
         transformed_data = np.zeros(
             (len(modalities[0].data), transformed_train.shape[1])
         )
-        transformed_data[task.train_indices] = transformed_train
-        transformed_data[task.test_indices] = transformed_other
+        transformed_data[fusion_train_indices] = transformed_train
+        transformed_data[all_other_indices] = transformed_other
 
         return transformed_data
 

diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
@@ -188,11 +188,11 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
             criterion = nn.CrossEntropyLoss()
         optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
 
-        X_tensor = torch.FloatTensor(X).to(device)
+        X_tensor = torch.FloatTensor(X)
         if self.is_multilabel:
-            y_tensor = torch.FloatTensor(y).to(device)
+            y_tensor = torch.FloatTensor(y)
         else:
-            y_tensor = torch.LongTensor(y).to(device)
+            y_tensor = torch.LongTensor(y)
 
         dataset = TensorDataset(X_tensor, y_tensor)
         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
@@ -201,6 +201,8 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
         for epoch in range(self.epochs):
             total_loss = 0
             for batch_X, batch_y in dataloader:
+                batch_X = batch_X.to(device)
+                batch_y = batch_y.to(device)
                 optimizer.zero_grad()
 
                 features, predictions = self.model(batch_X)
@@ -230,6 +232,7 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
                 TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False
             )
             for (batch_X,) in inference_dataloader:
+                batch_X = batch_X.to(device)
                 features, _ = self.model(batch_X)
                 all_features.append(features.cpu())
 
@@ -244,14 +247,15 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
         device = get_device()
         self.model.to(device)
 
-        X_tensor = torch.FloatTensor(X).to(device)
+        X_tensor = torch.FloatTensor(X)
         all_features = []
         self.model.eval()
         with torch.no_grad():
             inference_dataloader = DataLoader(
                 TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False
             )
             for (batch_X,) in inference_dataloader:
+                batch_X = batch_X.to(device)
                 features, _ = self.model(batch_X)
                 all_features.append(features.cpu())
 

diff --git a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py
@@ -162,12 +162,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
         )
 
-        for modality_name in inputs:
-            inputs[modality_name] = inputs[modality_name].to(device)
-
         if self.is_multilabel:
-            labels_tensor = torch.from_numpy(y).float().to(device)
+            labels_tensor = torch.from_numpy(y).float()
         else:
-            labels_tensor = torch.from_numpy(y).long().to(device)
+            labels_tensor = torch.from_numpy(y).long()
 
         dataset_inputs = []
         for i in range(len(y)):
@@ -199,9 +196,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
                 for modality_name in batch_inputs:
                     batch_inputs[modality_name] = torch.stack(
                         batch_inputs[modality_name]
-                    )
+                    ).to(device)
 
-                batch_labels = torch.stack(batch_labels)
+                batch_labels = torch.stack(batch_labels).to(device)
 
                 optimizer.zero_grad()
 
@@ -250,7 +247,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None):
 
                 batch_inputs = {}
                 for modality_name, tensor in inputs.items():
-                    batch_inputs[modality_name] = tensor[batch_start:batch_end]
+                    batch_inputs[modality_name] = tensor[batch_start:batch_end].to(
+                        device
+                    )
 
                 encoder_output = self.encoder(batch_inputs)
                 all_features.append(encoder_output["fused"].cpu())
@@ -266,9 +265,6 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
         device = get_device()
         self.encoder.to(device)
 
-        for modality_name in inputs:
-            inputs[modality_name] = inputs[modality_name].to(device)
-
         self.encoder.eval()
         all_features = []
 
@@ -281,7 +277,9 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray:
 
                 batch_inputs = {}
                 for modality_name, tensor in inputs.items():
-                    batch_inputs[modality_name] = tensor[batch_start:batch_end]
+                    batch_inputs[modality_name] = tensor[batch_start:batch_end].to(
+                        device
+                    )
 
                 encoder_output = self.encoder(batch_inputs)
                 all_features.append(encoder_output["fused"].cpu())

diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
@@ -163,7 +163,7 @@ def hook(
                     .cpu()
                     .float()
                     .numpy()
-                    .astype(modality.data_type)
+                    .astype(np.float32)
                 )
 
             embeddings[video_id] = np.array(embeddings[video_id])

diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -32,7 +32,7 @@
 @register_representation(ModalityType.TEXT)
 class TfIdf(UnimodalRepresentation):
     def __init__(self, min_df=2, output_file=None):
-        parameters = {"min_df": [min_df]}
+        parameters = {"min_df": [min_df, 4, 8]}
         super().__init__("TF-IDF", ModalityType.EMBEDDING, parameters)
         self.min_df = int(min_df)
         self.output_file = output_file