5 changes: 3 additions & 2 deletions .github/workflows/python.yml
@@ -171,7 +171,8 @@ jobs:
             h5py \
             gensim \
             opt-einsum \
-            nltk
+            nltk \
+            fvcore
           kill $KA
           cd src/main/python
-          python -m unittest discover -s tests/scuro -p 'test_*.py' -v
+          python -m unittest discover -s tests/scuro -p 'test_*.py' -v
8 changes: 7 additions & 1 deletion src/main/python/systemds/scuro/__init__.py
@@ -77,6 +77,7 @@
 )
 from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.x3d import X3D
+from systemds.scuro.representations.color_histogram import ColorHistogram
 from systemds.scuro.models.model import Model
 from systemds.scuro.models.discrete_model import DiscreteModel
 from systemds.scuro.modality.joined import JoinedModality
@@ -97,7 +98,8 @@
 )
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
-
+from systemds.scuro.representations.vgg import VGG19
+from systemds.scuro.representations.clip import CLIPText, CLIPVisual

 __all__ = [
     "BaseLoader",
@@ -120,6 +122,7 @@
     "MFCC",
     "Hadamard",
     "OpticalFlow",
+    "ColorHistogram",
     "Representation",
     "NPY",
     "JSON",
@@ -169,4 +172,7 @@
     "Quantile",
     "BandpowerFFT",
     "ZeroCrossingRate",
+    "VGG19",
+    "CLIPVisual",
+    "CLIPText",
 ]
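A rough usage sketch of the newly exported representations (hypothetical: it assumes an already-loaded scuro video modality, and that apply_representation, shown in unimodal_modality.py below, returns the transformed modality):

# Hedged sketch, not taken from this PR: video_modality stands in for a
# hypothetical, already-loaded scuro video modality.
from systemds.scuro import CLIPVisual, ColorHistogram

clip_features = video_modality.apply_representation(CLIPVisual())
histograms = video_modality.apply_representation(
    ColorHistogram(bins=32, aggregation="mean")
)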
9 changes: 9 additions & 0 deletions src/main/python/systemds/scuro/modality/type.py
@@ -254,7 +254,16 @@ def create_video_metadata(self, frequency, length, width, height, num_channels):
         md["data_layout"]["representation"] = DataLayout.NESTED_LEVEL
         md["data_layout"]["type"] = float
         md["data_layout"]["shape"] = (width, height, num_channels)
         return md

+    def create_image_metadata(self, width, height, num_channels):
+        md = deepcopy(self.get_schema())
+        md["width"] = width
+        md["height"] = height
+        md["num_channels"] = num_channels
+        md["data_layout"]["representation"] = DataLayout.SINGLE_LEVEL
+        md["data_layout"]["type"] = float
+        md["data_layout"]["shape"] = (width, height, num_channels)
+        return md

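A hypothetical illustration of the new helper's output, assuming it is reached through ModalityType the same way create_video_metadata is:

# Hypothetical call; the field names mirror the assignments in the hunk above.
md = ModalityType.IMAGE.create_image_metadata(width=224, height=224, num_channels=3)
assert md["width"] == 224 and md["num_channels"] == 3
assert md["data_layout"]["shape"] == (224, 224, 3)  # single-level float layout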
5 changes: 3 additions & 2 deletions src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -165,8 +165,9 @@ def apply_representation(self, representation):
            padded = np.pad(
                embeddings,
                pad_width=(
-                   (0, padding_needed),
-                   (0, 0),
+                   (0, padding_needed)
+                   if len(embeddings.shape) == 1
+                   else ((0, padding_needed), (0, 0))
                ),
                mode="constant",
                constant_values=0,
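The conditional pad_width matters because np.pad requires one (before, after) pair per array axis; a quick standalone check:

import numpy as np

vec = np.ones(3)       # 1-D embeddings
mat = np.ones((3, 4))  # 2-D embeddings
np.pad(vec, pad_width=(0, 2))            # pads the single axis: shape (5,)
np.pad(mat, pad_width=((0, 2), (0, 0)))  # pads rows only: shape (5, 4)
# np.pad(vec, pad_width=((0, 2), (0, 0))) would raise: pad_width rank mismatch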
133 changes: 133 additions & 0 deletions src/main/python/systemds/scuro/representations/clip.py
@@ -0,0 +1,133 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------
import numpy as np
from torchvision import transforms

from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
from systemds.scuro.representations.utils import save_embeddings
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.operator_registry import register_representation
from transformers import CLIPProcessor, CLIPModel

from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype
from systemds.scuro.utils.static_variables import get_device
from systemds.scuro.utils.torch_dataset import CustomDataset


@register_representation(ModalityType.VIDEO)
class CLIPVisual(UnimodalRepresentation):
    def __init__(self, output_file=None):
        parameters = {}
        super().__init__("CLIPVisual", ModalityType.EMBEDDING, parameters)
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
            get_device()
        )
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.output_file = output_file

    def transform(self, modality):
        transformed_modality = TransformedModality(modality, self)
        # Align the model weights with the modality's dtype before encoding.
        self.data_type = numpy_dtype_to_torch_dtype(modality.data_type)
        if next(self.model.parameters()).dtype != self.data_type:
            self.model = self.model.to(self.data_type)

        embeddings = self.create_visual_embeddings(modality)

        if self.output_file is not None:
            save_embeddings(embeddings, self.output_file)

        transformed_modality.data = list(embeddings.values())
        return transformed_modality

    def create_visual_embeddings(self, modality):
        tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()])
        dataset = CustomDataset(
            modality.data,
            self.data_type,
            get_device(),
            (modality.metadata[0]["width"], modality.metadata[0]["height"]),
            tf=tf,
        )
        embeddings = {}
        for instance in torch.utils.data.DataLoader(dataset):
            id = int(instance["id"][0])
            frames = instance["data"][0]
            embeddings[id] = []
            batch_size = 64

            for start_index in range(0, len(frames), batch_size):
                end_index = min(start_index + batch_size, len(frames))
                frame_ids_range = range(start_index, end_index)
                frame_batch = frames[frame_ids_range]

                # Move the processed batch onto the same device as the model.
                inputs = self.processor(images=frame_batch, return_tensors="pt").to(
                    get_device()
                )
                with torch.no_grad():
                    output = self.model.get_image_features(**inputs)

                if len(output.shape) > 2:
                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))

                embeddings[id].extend(
                    torch.flatten(output, 1)
                    .detach()
                    .cpu()
                    .float()
                    .numpy()
                    .astype(modality.data_type)
                )

            embeddings[id] = np.array(embeddings[id])
        return embeddings


@register_representation(ModalityType.TEXT)
class CLIPText(UnimodalRepresentation):
    def __init__(self, output_file=None):
        parameters = {}
        super().__init__("CLIPText", ModalityType.EMBEDDING, parameters)
        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(
            get_device()
        )
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.output_file = output_file

    def transform(self, modality):
        transformed_modality = TransformedModality(modality, self)

        embeddings = self.create_text_embeddings(modality.data, self.model)

        if self.output_file is not None:
            save_embeddings(embeddings, self.output_file)

        transformed_modality.data = embeddings
        return transformed_modality

    def create_text_embeddings(self, data, model):
        embeddings = []
        for d in data:
            # Tokenize and move the inputs to the model's device before encoding.
            inputs = self.processor(text=d, return_tensors="pt", padding=True).to(
                get_device()
            )
            with torch.no_grad():
                text_embedding = model.get_text_features(**inputs)
            embeddings.append(text_embedding.squeeze().cpu().numpy().reshape(1, -1))

        return embeddings
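Outside scuro, the text branch boils down to standard transformers calls; a self-contained sketch (get_text_features yields 512-dim features for the base-patch32 checkpoint):

# Standalone sketch of what CLIPText computes per document.
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
inputs = processor(text="a dog playing fetch", return_tensors="pt", padding=True)
with torch.no_grad():
    features = model.get_text_features(**inputs)
print(features.shape)  # torch.Size([1, 512])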
111 changes: 111 additions & 0 deletions src/main/python/systemds/scuro/representations/color_histogram.py
@@ -0,0 +1,111 @@
# -------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# -------------------------------------------------------------

import numpy as np
import cv2

from systemds.scuro.modality.type import ModalityType
from systemds.scuro.representations.unimodal import UnimodalRepresentation
from systemds.scuro.modality.transformed import TransformedModality


class ColorHistogram(UnimodalRepresentation):
    def __init__(
        self,
        color_space="RGB",
        bins=32,
        normalize=True,
        aggregation="mean",
        output_file=None,
    ):
        super().__init__(
            "ColorHistogram", ModalityType.EMBEDDING, self._get_parameters()
        )
        self.color_space = color_space
        self.bins = bins
        self.normalize = normalize
        self.aggregation = aggregation
        self.output_file = output_file

    def _get_parameters(self):
        return {
            "color_space": ["RGB", "HSV", "GRAY"],
            "bins": [8, 16, 32, 64, 128, 256, (8, 8, 8), (16, 16, 16)],
            "normalize": [True, False],
            "aggregation": ["mean", "max", "concat"],
        }

    def compute_histogram(self, image):
        if self.color_space == "HSV":
            img = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            channels = [0, 1, 2]
        elif self.color_space == "GRAY":
            img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
            channels = [0]
        else:
            img = image
            channels = [0, 1, 2]

        hist = self._region_histogram(img, channels)
        return hist

    def _region_histogram(self, img, channels):
        if isinstance(self.bins, tuple):
            bins = self.bins
        elif len(channels) > 1:
            bins = [self.bins] * len(channels)
        else:
            bins = [self.bins]
        # A [0, 256) value range is assumed per channel; note that OpenCV's hue
        # channel spans [0, 180), so HSV hue bins above 180 stay empty.
        hist = cv2.calcHist([img], channels, None, bins, [0, 256] * len(channels))
        hist = hist.flatten()
        if self.normalize:
            hist_sum = np.sum(hist)
            if hist_sum > 0:
                hist /= hist_sum
        return hist.astype(np.float32)

    def transform(self, modality):
        if modality.modality_type == ModalityType.IMAGE:
            images = modality.data
            hist_list = [self.compute_histogram(img) for img in images]
            transformed_modality = TransformedModality(
                modality, self, ModalityType.EMBEDDING
            )
            transformed_modality.data = hist_list
            return transformed_modality
        elif modality.modality_type == ModalityType.VIDEO:
            embeddings = []
            for vid in modality.data:
                frame_hists = [self.compute_histogram(frame) for frame in vid]
                if self.aggregation == "mean":
                    hist = np.mean(frame_hists, axis=0)
                elif self.aggregation == "max":
                    hist = np.max(frame_hists, axis=0)
                elif self.aggregation == "concat":
                    hist = np.concatenate(frame_hists)
                else:
                    raise ValueError(f"Unknown aggregation: {self.aggregation}")
                embeddings.append(hist)
            transformed_modality = TransformedModality(
                modality, self, ModalityType.EMBEDDING
            )
            transformed_modality.data = embeddings
            return transformed_modality
        else:
            raise ValueError("Unsupported data format for ColorHistogram")
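One subtlety worth noting: with a scalar bins value and three channels, calcHist builds a joint 3-D histogram rather than three 1-D ones; a standalone check with synthetic data:

import cv2
import numpy as np

image = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)  # fake RGB image
hist = cv2.calcHist([image], [0, 1, 2], None, [32, 32, 32], [0, 256] * 3)
hist = hist.flatten()
hist /= hist.sum()   # the normalize=True branch
print(hist.shape)    # (32768,) == 32 * 32 * 32 joint bins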
10 changes: 7 additions & 3 deletions src/main/python/systemds/scuro/representations/resnet.py
@@ -144,17 +144,21 @@ def hook(
             embeddings[video_id] = []
             batch_size = 64

+            if modality.modality_type == ModalityType.IMAGE:
+                frames = frames.unsqueeze(0)
+
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]

                 _ = self.model(frame_batch)
-                values = res5c_output
-                pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
+                output = res5c_output
+                if len(output.shape) > 2:
+                    output = torch.nn.functional.adaptive_avg_pool2d(output, (1, 1))

                 embeddings[video_id].extend(
-                    torch.flatten(pooled, 1)
+                    torch.flatten(output, 1)
                     .detach()
                     .cpu()
                     .float()
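For intuition on the new guard: res5c yields a 4-D feature map that needs pooling, while an already-2-D output can be flattened directly (channel count assumes a ResNet-50-style final block):

import torch

x = torch.randn(8, 2048, 7, 7)  # 4-D res5c map for 8 frames at 224x224 input
pooled = torch.nn.functional.adaptive_avg_pool2d(x, (1, 1))
print(torch.flatten(pooled, 1).shape)  # torch.Size([8, 2048])

y = torch.randn(8, 2048)        # already 2-D: len(y.shape) == 2, pooling skipped
print(torch.flatten(y, 1).shape)       # torch.Size([8, 2048])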
src/main/python/systemds/scuro/representations/swin_video_transformer.py
@@ -34,7 +34,7 @@
 from systemds.scuro.utils.static_variables import get_device


-# @register_representation([ModalityType.VIDEO])
+@register_representation([ModalityType.VIDEO])
 class SwinVideoTransformer(UnimodalRepresentation):
     def __init__(self, layer_name="avgpool"):
         parameters = {
@@ -50,7 +50,7 @@ def __init__(self, layer_name="avgpool"):
             ],
         }
         self.data_type = torch.float
-        super().__init__("SwinVideoTransformer", ModalityType.TIMESERIES, parameters)
+        super().__init__("SwinVideoTransformer", ModalityType.EMBEDDING, parameters)
         self.layer_name = layer_name
         self.model = swin3d_t(weights=models.video.Swin3D_T_Weights.KINETICS400_V1).to(
             get_device()
@@ -95,6 +95,7 @@ def hook(
                 .detach()
                 .cpu()
                 .numpy()
+                .flatten()
                 .astype(modality.data_type)
             )