5 changes: 3 additions & 2 deletions .github/workflows/python.yml
@@ -171,7 +171,8 @@ jobs:
             h5py \
             gensim \
             opt-einsum \
-            nltk
+            nltk \
+            fvcore
           kill $KA
           cd src/main/python
-          python -m unittest discover -s tests/scuro -p 'test_*.py' -v
+          python -m unittest discover -s tests/scuro -p 'test_*.py' -v
8 changes: 7 additions & 1 deletion src/main/python/systemds/scuro/__init__.py
@@ -77,6 +77,7 @@
 )
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.x3d import X3D
+from systemds.scuro.representations.color_histogram import ColorHistogram
 from systemds.scuro.models.model import Model
 from systemds.scuro.models.discrete_model import DiscreteModel
 from systemds.scuro.modality.joined import JoinedModality
@@ -97,7 +98,8 @@
 )
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
-
+from systemds.scuro.representations.vgg import VGG19
+from systemds.scuro.representations.clip import CLIPText, CLIPVisual

 __all__ = [
     "BaseLoader",
@@ -120,6 +122,7 @@
     "MFCC",
     "Hadamard",
     "OpticalFlow",
+    "ColorHistogram",
     "Representation",
     "NPY",
     "JSON",
@@ -169,4 +172,7 @@
     "Quantile",
     "BandpowerFFT",
     "ZeroCrossingRate",
+    "VGG19",
+    "CLIPVisual",
+    "CLIPText",
 ]
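A rough usage sketch of the newly exported representations (hypothetical: it assumes an already-loaded scuro video modality, and that apply_representation, shown in unimodal_modality.py below, returns the transformed modality):

# Hedged sketch, not taken from this PR: video_modality stands in for a
# hypothetical, already-loaded scuro video modality.
from systemds.scuro import CLIPVisual, ColorHistogram

clip_features = video_modality.apply_representation(CLIPVisual())
histograms = video_modality.apply_representation(
    ColorHistogram(bins=32, aggregation="mean")
)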
9 changes: 9 additions & 0 deletions src/main/python/systemds/scuro/modality/type.py
@@ -254,7 +254,16 @@ def create_video_metadata(self, frequency, length, width, height, num_channels):
         md["data_layout"]["representation"] = DataLayout.NESTED_LEVEL
         md["data_layout"]["type"] = float
         md["data_layout"]["shape"] = (width, height, num_channels)
         return md

+    def create_image_metadata(self, width, height, num_channels):
+        md = deepcopy(self.get_schema())
+        md["width"] = width
+        md["height"] = height
+        md["num_channels"] = num_channels
+        md["data_layout"]["representation"] = DataLayout.SINGLE_LEVEL
+        md["data_layout"]["type"] = float
+        md["data_layout"]["shape"] = (width, height, num_channels)
+        return md

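A hypothetical illustration of the new helper's output, assuming it is reached through ModalityType the same way create_video_metadata is:

# Hypothetical call; the field names mirror the assignments in the hunk above.
md = ModalityType.IMAGE.create_image_metadata(width=224, height=224, num_channels=3)
assert md["width"] == 224 and md["num_channels"] == 3
assert md["data_layout"]["shape"] == (224, 224, 3)  # single-level float layout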
5 changes: 3 additions & 2 deletions src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -165,8 +165,9 @@ def apply_representation(self, representation):
            padded = np.pad(
                embeddings,
                pad_width=(
-                   (0, padding_needed),
-                   (0, 0),
+                   (0, padding_needed)
+                   if len(embeddings.shape) == 1
+                   else ((0, padding_needed), (0, 0))
                ),
                mode="constant",
                constant_values=0,
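The conditional pad_width matters because np.pad requires one (before, after) pair per array axis; a quick standalone check:

import numpy as np

vec = np.ones(3)       # 1-D embeddings
mat = np.ones((3, 4))  # 2-D embeddings
np.pad(vec, pad_width=(0, 2))            # pads the single axis: shape (5,)
np.pad(mat, pad_width=((0, 2), (0, 0)))  # pads rows only: shape (5, 4)
# np.pad(vec, pad_width=((0, 2), (0, 0))) would raise: pad_width rank mismatch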
133 changes: 133 additions & 0 deletions src/main/python/systemds/scuro/representations/clip.py
@@ -0,0 +1,133 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------
import numpy as np
from torchvision import transforms

from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
from systemds.scuro.representations.utils import save_embeddings
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.operator_registry import register_representation
from transformers import CLIPProcessor, CLIPModel

from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype
from systemds.scuro.utils.static_variables import get_device
from systemds.scuro.utils.torch_dataset import CustomDataset


@register_representation(ModalityType.VIDEO)
class CLIPVisual(UnimodalRepresentation):
    def __init__(self, output_file=None):
        parameters = {}
        super().__init__("CLIPVisual", ModalityType.EMBEDDING, parameters)
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
            get_device()
        )
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.output_file = output_file

    def transform(self, modality):
        transformed_modality = TransformedModality(modality, self)
        # Align the model weights with the modality's dtype before encoding.
        self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
        if next(self.model.parameters()).dtype != self.data_type:
            self.model = self.model.to(self.data_type)

        embeddings = self.create_visual_embeddings(modality)

        if self.output_file is not None:
            save_embeddings(embeddings, self.output_file)

        transformed_modality.data = list(embeddings.values())
        return transformed_modality

    def create_visual_embeddings(self, modality):
        tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
        dataset = CustomDataset(
            modality.data,
            self.data_type,
            get_device(),
            (modality.metadata[0]["width"], modality.metadata[0]["height"]),
            tf=tf,
        )
        embeddings = {}
        for instance in torch.utils.data.DataLoader(dataset):
            id = int(instance["id"][0])
            frames = instance["data"][0]
            embeddings[id] = []
            batch_size = 64

            for start_index in range(0, len(frames), batch_size):
                end_index = min(start_index + batch_size, len(frames))
                frame_ids_range = range(start_index, end_index)
                frame_batch = frames[frame_ids_range]

                # Move the processed batch onto the same device as the model.
                inputs = self.processor(images=frame_batch, return_tensors="pt").to(
                    get_device()
                )
                with torch.no_grad():
                    output = self.model.get_image_features(**inputs)

                if len(output.shape) > 2:
                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))

                embeddings[id].extend(
                    torch.flatten(output, 1)
                    .detach()
                    .cpu()
                    .float()
                    .numpy()
                    .astype(modality.data_type)
                )

            embeddings[id] = np.array(embeddings[id])
        return embeddings


@register_representation(ModalityType.TEXT)
class CLIPText(UnimodalRepresentation):
    def __init__(self, output_file=None):
        parameters = {}
        super().__init__("CLIPText", ModalityType.EMBEDDING, parameters)
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
            get_device()
        )
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.output_file = output_file

    def transform(self, modality):
        transformed_modality = TransformedModality(modality, self)

        embeddings = self.create_text_embeddings(modality.data, self.model)

        if self.output_file is not None:
            save_embeddings(embeddings, self.output_file)

        transformed_modality.data = embeddings
        return transformed_modality

    def create_text_embeddings(self, data, model):
        embeddings = []
        for d in data:
            # Tokenize and move the inputs to the model's device before encoding.
            inputs = self.processor(text=d, return_tensors="pt", padding=True).to(
                get_device()
            )
            with torch.no_grad():
                text_embedding = model.get_text_features(**inputs)
            embeddings.append(text_embedding.squeeze().cpu().numpy().reshape(1, -1))

        return embeddings
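Outside scuro, the text branch boils down to standard transformers calls; a self-contained sketch (get_text_features yields 512-dim features for the base-patch32 checkpoint):

# Standalone sketch of what CLIPText computes per document.
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(text="a dog playing fetch", return_tensors="pt", padding=True)
with torch.no_grad():
    features = model.get_text_features(**inputs)
print(features.shape)  # torch.Size([1, 512])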
111 changes: 111 additions & 0 deletions src/main/python/systemds/scuro/representations/color_histogram.py
@@ -0,0 +1,111 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

import numpy as np
import cv2

from systemds.scuro.modality.type import ModalityType
from systemds.scuro.representations.unimodal import UnimodalRepresentation
from systemds.scuro.modality.transformed import TransformedModality


class ColorHistogram(UnimodalRepresentation):
    def __init__(
        self,
        color_space="RGB",
        bins=32,
        normalize=True,
        aggregation="mean",
        output_file=None,
    ):
        super().__init__(
            "ColorHistogram", ModalityType.EMBEDDING, self._get_parameters()
        )
        self.color_space = color_space
        self.bins = bins
        self.normalize = normalize
        self.aggregation = aggregation
        self.output_file = output_file

    def _get_parameters(self):
        return {
            "color_space": ["RGB", "HSV", "GRAY"],
            "bins": [8, 16, 32, 64, 128, 256, (8, 8, 8), (16, 16, 16)],
            "normalize": [True, False],
            "aggregation": ["mean", "max", "concat"],
        }

    def compute_histogram(self, image):
        if self.color_space == "HSV":
            img = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            channels = [0, 1, 2]
        elif self.color_space == "GRAY":
            img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            channels = [0]
        else:
            img = image
            channels = [0, 1, 2]

        hist = self._region_histogram(img, channels)
        return hist

    def _region_histogram(self, img, channels):
        if isinstance(self.bins, tuple):
            bins = self.bins
        elif len(channels) > 1:
            bins = [self.bins] * len(channels)
        else:
            bins = [self.bins]
        # A [0, 256) value range is assumed per channel; note that OpenCV's hue
        # channel spans [0, 180), so HSV hue bins above 180 stay empty.
        hist = cv2.calcHist([img], channels, None, bins, [0, 256] * len(channels))
        hist = hist.flatten()
        if self.normalize:
            hist_sum = np.sum(hist)
            if hist_sum > 0:
                hist /= hist_sum
        return hist.astype(np.float32)

    def transform(self, modality):
        if modality.modality_type == ModalityType.IMAGE:
            images = modality.data
            hist_list = [self.compute_histogram(img) for img in images]
            transformed_modality = TransformedModality(
                modality, self, ModalityType.EMBEDDING
            )
            transformed_modality.data = hist_list
            return transformed_modality
        elif modality.modality_type == ModalityType.VIDEO:
            embeddings = []
            for vid in modality.data:
                frame_hists = [self.compute_histogram(frame) for frame in vid]
                if self.aggregation == "mean":
                    hist = np.mean(frame_hists, axis=0)
                elif self.aggregation == "max":
                    hist = np.max(frame_hists, axis=0)
                elif self.aggregation == "concat":
                    hist = np.concatenate(frame_hists)
                else:
                    raise ValueError(f"Unknown aggregation: {self.aggregation}")
                embeddings.append(hist)
            transformed_modality = TransformedModality(
                modality, self, ModalityType.EMBEDDING
            )
            transformed_modality.data = embeddings
            return transformed_modality
        else:
            raise ValueError("Unsupported data format for ColorHistogram")
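One subtlety worth noting: with a scalar bins value and three channels, calcHist builds a joint 3-D histogram rather than three 1-D ones; a standalone check with synthetic data:

import cv2
import numpy as np

image = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)  # fake RGB image
hist = cv2.calcHist([image], [0, 1, 2], None, [32, 32, 32], [0, 256] * 3)
hist = hist.flatten()
hist /= hist.sum()   # the normalize=True branch
print(hist.shape)    # (32768,) == 32 * 32 * 32 joint bins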
10 changes: 7 additions & 3 deletions src/main/python/systemds/scuro/representations/resnet.py
@@ -144,17 +144,21 @@ def hook(
             embeddings[video_id] = []
             batch_size = 64

+            if modality.modality_type == ModalityType.IMAGE:
+                frames = frames.unsqueeze(0)
+
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]

                 _ = self.model(frame_batch)
-                values = res5c_output
-                pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
+                output = res5c_output
+                if len(output.shape) > 2:
+                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))

                 embeddings[video_id].extend(
-                    torch.flatten(pooled, 1)
+                    torch.flatten(output, 1)
                     .detach()
                     .cpu()
                     .float()
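For intuition on the new guard: res5c yields a 4-D feature map that needs pooling, while an already-2-D output can be flattened directly (channel count assumes a ResNet-50-style final block):

import torch

x = torch.randn(8, 2048, 7, 7)  # 4-D res5c map for 8 frames at 224x224 input
pooled = torch.nn.functional.adaptive_avg_pool2d(x, (1, 1))
print(torch.flatten(pooled, 1).shape)  # torch.Size([8, 2048])

y = torch.randn(8, 2048)        # already 2-D: len(y.shape) == 2, pooling skipped
print(torch.flatten(y, 1).shape)       # torch.Size([8, 2048])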
src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -34,7 +34,7 @@
 from systemds.scuro.utils.static_variables import get_device


-# @register_representation([ModalityType.VIDEO])
+@register_representation([ModalityType.VIDEO])
 class SwinVideoTransformer(UnimodalRepresentation):
     def __init__(self, layer_name="avgpool"):
         parameters = {
@@ -50,7 +50,7 @@ def __init__(self, layer_name="avgpool"):
             ],
         }
         self.data_type = torch.float
-        super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters)
+        super().__init__("SwinVideoTransformer", ModalityType.EMBEDDING, parameters)
         self.layer_name = layer_name
         self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to(
             get_device()
@@ -95,6 +95,7 @@ def hook(
                 .detach()
                 .cpu()
                 .numpy()
+                .flatten()
                 .astype(modality.data_type)
             )