diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fcd8bf8c849..112e256ee07 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -172,7 +172,8 @@ jobs: gensim \ opt-einsum \ nltk \ - fvcore + fvcore \ + scikit-optimize kill $KA cd src/main/python python -m unittest discover -s tests/scuro -p 'test_*.py' -v diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index 8471cc7c356..2fee7cbf5a3 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -45,11 +45,6 @@ def __init__( def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - # if not self.load_data_from_file: - # self.metadata[file] = self.modality_type.create_metadata( - # 30, 10, 100, 100, 3 - # ) - # else: cap = cv2.VideoCapture(file) if not cap.isOpened(): @@ -71,13 +66,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.fps, length, width, height, num_channels ) - num_frames = (length + frame_interval - 1) // frame_interval - - stacked_frames = np.zeros( - (num_frames, height, width, num_channels), dtype=self._data_type - ) - - frame_idx = 0 + frames = [] idx = 0 while cap.isOpened(): ret, frame = cap.read() @@ -87,11 +76,7 @@ def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): if idx % frame_interval == 0: frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) frame = frame.astype(self._data_type) / 255.0 - stacked_frames[frame_idx] = frame - frame_idx += 1 + frames.append(frame) idx += 1 - if frame_idx < num_frames: - stacked_frames = stacked_frames[:frame_idx] - - self.data.append(stacked_frames) + self.data.append(np.stack(frames)) diff --git a/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py b/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py index 2a5f8262834..1605b7b87d9 100644 --- a/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py +++ b/src/main/python/systemds/scuro/drsearch/hyperparameter_tuner.py @@ -19,8 +19,9 @@ # # ------------------------------------------------------------- from typing import Dict, List, Tuple, Any, Optional -import numpy as np -from sklearn.model_selection import ParameterGrid +from skopt import gp_minimize +from skopt.space import Real, Integer, Categorical +from skopt.utils import use_named_args import json import logging from dataclasses import dataclass @@ -28,7 +29,6 @@ import copy from systemds.scuro.modality.modality import Modality -from systemds.scuro.drsearch.task import Task @dataclass @@ -163,18 +163,64 @@ def visit_node(node_id): start_time = time.time() rep_name = "_".join([rep.__name__ for rep in reps]) - param_grid = list(ParameterGrid(hyperparams)) - if max_evals and len(param_grid) > max_evals: - np.random.shuffle(param_grid) - param_grid = param_grid[:max_evals] + search_space = [] + param_names = [] + for param_name, param_values in hyperparams.items(): + param_names.append(param_name) + if isinstance(param_values, list): + if all(isinstance(v, (int, float)) for v in param_values): + if all(isinstance(v, int) for v in param_values): + search_space.append( + Integer( + min(param_values), max(param_values), name=param_name + ) + ) + else: + search_space.append( + Real(min(param_values), max(param_values), name=param_name) + ) + else: + search_space.append(Categorical(param_values, name=param_name)) + elif isinstance(param_values, tuple) and len(param_values) == 2: + if isinstance(param_values[0], int) and isinstance( + param_values[1], int + ): + search_space.append( + Integer(param_values[0], param_values[1], name=param_name) + ) + else: + search_space.append( + Real(param_values[0], param_values[1], name=param_name) + ) + else: + search_space.append(Categorical([param_values], name=param_name)) + + n_calls = max_evals if max_evals else 50 all_results = [] - for params in param_grid: + + @use_named_args(search_space) + def objective(**params): result = self.evaluate_dag_config( dag, params, node_order, modality_ids, task ) all_results.append(result) + score = result[1].average_scores[self.scoring_metric] + if self.maximize_metric: + return -score + else: + return score + + result = gp_minimize( + objective, + search_space, + n_calls=n_calls, + random_state=42, + verbose=self.debug, + n_initial_points=min(10, n_calls // 2), + ) + if self.maximize_metric: best_params, best_score = max( all_results, key=lambda x: x[1].average_scores[self.scoring_metric] diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index f7b7394e0fd..e4ed85cce38 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -156,26 +156,28 @@ def apply_representation(self, representation): if current_length < target_length: padding_needed = target_length - current_length if pad_dim_one: - padding = np.zeros((embeddings.shape[0], padding_needed)) - padded_embeddings.append( - np.concatenate((embeddings, padding), axis=1) + padded = np.pad( + embeddings, + ((0, 0), (0, padding_needed)), + mode="constant", + constant_values=0, ) + padded_embeddings.append(padded) else: if len(embeddings.shape) == 1: - padded = np.zeros( - embeddings.shape[0] + padding_needed, - dtype=embeddings.dtype, + padded = np.pad( + embeddings, + (0, padding_needed), + mode="constant", + constant_values=0, ) - padded[: embeddings.shape[0]] = embeddings else: - padded = np.zeros( - ( - embeddings.shape[0] + padding_needed, - embeddings.shape[1], - ), - dtype=embeddings.dtype, + padded = np.pad( + embeddings, + ((0, padding_needed), (0, 0)), + mode="constant", + constant_values=0, ) - padded[: embeddings.shape[0], :] = embeddings padded_embeddings.append(padded) else: padded_embeddings.append(embeddings) diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index 2b338d30ee6..9d1d82a6be8 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -32,7 +32,7 @@ @register_representation(ModalityType.TEXT) class BoW(UnimodalRepresentation): def __init__(self, ngram_range=2, min_df=2, output_file=None): - parameters = {"ngram_range": [ngram_range], "min_df": [min_df]} + parameters = {"ngram_range": [2, 3, 5, 10], "min_df": [1, 2, 4, 8]} super().__init__("BoW", ModalityType.EMBEDDING, parameters) self.ngram_range = int(ngram_range) self.min_df = int(min_df) diff --git a/src/main/python/systemds/scuro/representations/clip.py b/src/main/python/systemds/scuro/representations/clip.py index 1d458aeb7d0..504681f2537 100644 --- a/src/main/python/systemds/scuro/representations/clip.py +++ b/src/main/python/systemds/scuro/representations/clip.py @@ -34,7 +34,7 @@ from systemds.scuro.utils.torch_dataset import CustomDataset -@register_representation(ModalityType.VIDEO) +@register_representation([ModalityType.VIDEO, ModalityType.IMAGE]) class CLIPVisual(UnimodalRepresentation): def __init__(self, output_file=None): parameters = {} @@ -46,8 +46,10 @@ def __init__(self, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality(modality, self) - self.data_type = numpy_dtype_to_torch_dtype(modality.data_type) + transformed_modality = TransformedModality( + modality, self, self.output_modality_type + ) + self.data_type = torch.float32 if next(self.model.parameters()).dtype != self.data_type: self.model = self.model.to(self.data_type) @@ -60,14 +62,20 @@ def transform(self, modality): return transformed_modality def create_visual_embeddings(self, modality): - tf = transforms.Compose([transforms.ToPILImage(), transforms.ToTensor()]) + + clip_transform = transforms.Compose( + [ + transforms.ToPILImage(), + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.ConvertImageDtype(dtype=self.data_type), + ] + ) dataset = CustomDataset( - modality.data, - self.data_type, - get_device(), - (modality.metadata[0]["width"], modality.metadata[0]["height"]), - tf=tf, + modality.data, self.data_type, get_device(), tf=clip_transform ) + embeddings = {} for instance in torch.utils.data.DataLoader(dataset): id = int(instance["id"][0]) @@ -94,7 +102,7 @@ def create_visual_embeddings(self, modality): .cpu() .float() .numpy() - .astype(modality.data_type) + .astype(np.float32) ) embeddings[id] = np.array(embeddings[id]) @@ -113,7 +121,9 @@ def __init__(self, output_file=None): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality(modality, self) + transformed_modality = TransformedModality( + modality, self, self.output_modality_type + ) embeddings = self.create_text_embeddings(modality.data, self.model) diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index addccadade7..7ac0200819c 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -91,8 +91,8 @@ def transform_with_training(self, modalities: List[Modality], task): transformed_data = np.zeros( (len(modalities[0].data), transformed_train.shape[1]) ) - transformed_data[task.train_indices] = transformed_train - transformed_data[task.test_indices] = transformed_other + transformed_data[fusion_train_indices] = transformed_train + transformed_data[all_other_indices] = transformed_other return transformed_data diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index efc31272743..58b878820e6 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -188,11 +188,11 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate) - X_tensor = torch.FloatTensor(X).to(device) + X_tensor = torch.FloatTensor(X) if self.is_multilabel: - y_tensor = torch.FloatTensor(y).to(device) + y_tensor = torch.FloatTensor(y) else: - y_tensor = torch.LongTensor(y).to(device) + y_tensor = torch.LongTensor(y) dataset = TensorDataset(X_tensor, y_tensor) dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True) @@ -201,6 +201,8 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): for epoch in range(self.epochs): total_loss = 0 for batch_X, batch_y in dataloader: + batch_X = batch_X.to(device) + batch_y = batch_y.to(device) optimizer.zero_grad() features, predictions = self.model(batch_X) @@ -230,6 +232,7 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False ) for (batch_X,) in inference_dataloader: + batch_X = batch_X.to(device) features, _ = self.model(batch_X) all_features.append(features.cpu()) @@ -244,7 +247,7 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray: device = get_device() self.model.to(device) - X_tensor = torch.FloatTensor(X).to(device) + X_tensor = torch.FloatTensor(X) all_features = [] self.model.eval() with torch.no_grad(): @@ -252,6 +255,7 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray: TensorDataset(X_tensor), batch_size=self.batch_size, shuffle=False ) for (batch_X,) in inference_dataloader: + batch_X = batch_X.to(device) features, _ = self.model(batch_X) all_features.append(features.cpu()) diff --git a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py index 3f86610550e..a295eaa267a 100644 --- a/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py +++ b/src/main/python/systemds/scuro/representations/multimodal_attention_fusion.py @@ -162,12 +162,12 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): ) for modality_name in inputs: - inputs[modality_name] = inputs[modality_name].to(device) + inputs[modality_name] = inputs[modality_name] if self.is_multilabel: - labels_tensor = torch.from_numpy(y).float().to(device) + labels_tensor = torch.from_numpy(y).float() else: - labels_tensor = torch.from_numpy(y).long().to(device) + labels_tensor = torch.from_numpy(y).long() dataset_inputs = [] for i in range(len(y)): @@ -199,9 +199,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): for modality_name in batch_inputs: batch_inputs[modality_name] = torch.stack( batch_inputs[modality_name] - ) + ).to(device) - batch_labels = torch.stack(batch_labels) + batch_labels = torch.stack(batch_labels).to(device) optimizer.zero_grad() @@ -250,7 +250,9 @@ def execute(self, modalities: List[Modality], labels: np.ndarray = None): batch_inputs = {} for modality_name, tensor in inputs.items(): - batch_inputs[modality_name] = tensor[batch_start:batch_end] + batch_inputs[modality_name] = tensor[batch_start:batch_end].to( + device + ) encoder_output = self.encoder(batch_inputs) all_features.append(encoder_output["fused"].cpu()) @@ -266,9 +268,6 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray: device = get_device() self.encoder.to(device) - for modality_name in inputs: - inputs[modality_name] = inputs[modality_name].to(device) - self.encoder.eval() all_features = [] @@ -281,7 +280,9 @@ def apply_representation(self, modalities: List[Modality]) -> np.ndarray: batch_inputs = {} for modality_name, tensor in inputs.items(): - batch_inputs[modality_name] = tensor[batch_start:batch_end] + batch_inputs[modality_name] = tensor[batch_start:batch_end].to( + device + ) encoder_output = self.encoder(batch_inputs) all_features.append(encoder_output["fused"].cpu()) diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 55e7d369481..50fe084b9f5 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -163,7 +163,7 @@ def hook( .cpu() .float() .numpy() - .astype(modality.data_type) + .astype(np.float32) ) embeddings[video_id] = np.array(embeddings[video_id]) diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index c82961949fe..95fa6c111f7 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -32,7 +32,7 @@ @register_representation(ModalityType.TEXT) class TfIdf(UnimodalRepresentation): def __init__(self, min_df=2, output_file=None): - parameters = {"min_df": [min_df]} + parameters = {"min_df": [min_df, 4, 8]} super().__init__("TF-IDF", ModalityType.EMBEDDING, parameters) self.min_df = int(min_df) self.output_file = output_file diff --git a/src/main/python/systemds/scuro/representations/timeseries_representations.py b/src/main/python/systemds/scuro/representations/timeseries_representations.py index 631294809ee..3270992a97c 100644 --- a/src/main/python/systemds/scuro/representations/timeseries_representations.py +++ b/src/main/python/systemds/scuro/representations/timeseries_representations.py @@ -184,7 +184,7 @@ def compute_feature(self, signal): @register_representation([ModalityType.TIMESERIES]) class SpectralCentroid(TimeSeriesRepresentation): def __init__(self, fs=1.0): - super().__init__("SpectralCentroid", parameters={"fs": [1.0]}) + super().__init__("SpectralCentroid", parameters={"fs": [0.5, 1.0, 2.0]}) self.fs = fs def compute_feature(self, signal): @@ -199,7 +199,8 @@ def compute_feature(self, signal): class BandpowerFFT(TimeSeriesRepresentation): def __init__(self, fs=1.0, f1=0.0, f2=0.5): super().__init__( - "BandpowerFFT", parameters={"fs": [1.0], "f1": [0.0], "f2": [0.5]} + "BandpowerFFT", + parameters={"fs": [0.5, 1.0], "f1": [0.0, 1.0], "f2": [0.5, 1.0]}, ) self.fs = fs self.f1 = f1 diff --git a/src/main/python/systemds/scuro/representations/vgg.py b/src/main/python/systemds/scuro/representations/vgg.py index 4f4324a372d..8bc4a15b951 100644 --- a/src/main/python/systemds/scuro/representations/vgg.py +++ b/src/main/python/systemds/scuro/representations/vgg.py @@ -53,14 +53,14 @@ def forward(self, input_: torch.Tensor) -> torch.Tensor: self.model.fc = Identity() def _get_parameters(self): - parameters = {"layer_name": []} - - parameters["layer_name"] = [ - "features.35", - "classifier.0", - "classifier.3", - "classifier.6", - ] + parameters = { + "layer_name": [ + "features.35", + "classifier.0", + "classifier.3", + "classifier.6", + ] + } return parameters @@ -120,7 +120,7 @@ def hook( .cpu() .float() .numpy() - .astype(modality.data_type) + .astype(np.float32) ) embeddings[video_id] = np.array(embeddings[video_id]) diff --git a/src/main/python/systemds/scuro/representations/window_aggregation.py b/src/main/python/systemds/scuro/representations/window_aggregation.py index adb92ceb530..f40b28ea871 100644 --- a/src/main/python/systemds/scuro/representations/window_aggregation.py +++ b/src/main/python/systemds/scuro/representations/window_aggregation.py @@ -171,7 +171,7 @@ def window_aggregate_nested_level(self, instance, new_length): class StaticWindow(Window): def __init__(self, aggregation_function="mean", num_windows=100): super().__init__("StaticWindow", aggregation_function) - self.parameters["num_windows"] = [num_windows] + self.parameters["num_windows"] = [10, num_windows] self.num_windows = int(num_windows) def execute(self, modality): @@ -202,7 +202,7 @@ def execute(self, modality): class DynamicWindow(Window): def __init__(self, aggregation_function="mean", num_windows=100): super().__init__("DynamicWindow", aggregation_function) - self.parameters["num_windows"] = [num_windows] + self.parameters["num_windows"] = [10, num_windows] self.num_windows = int(num_windows) def execute(self, modality): diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index 837811935cd..737d72b8b0c 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -43,8 +43,8 @@ def get_embedding(sentence, model): class W2V(UnimodalRepresentation): def __init__(self, vector_size=150, min_count=1, output_file=None): parameters = { - "vector_size": [vector_size], - "min_count": [min_count], + "vector_size": [50, 100, 150, 200], + "min_count": [1, 2, 4, 8], } super().__init__("Word2Vec", ModalityType.EMBEDDING, parameters) self.vector_size = vector_size diff --git a/src/main/python/tests/README.md b/src/main/python/tests/README.md index 24e0f018634..bea078ca28d 100644 --- a/src/main/python/tests/README.md +++ b/src/main/python/tests/README.md @@ -46,3 +46,5 @@ To execute the Federated Tests, use: Federated experiments are a little different from the rest, since they require some setup in form of federated workers. See more details in the [script](federated/runFedTest.sh) + +https://github.com/nttcslab/byol-a/blob/master/pretrained_weights/AudioNTT2020-BYOLA-64x96d512.pth \ No newline at end of file diff --git a/src/main/python/tests/scuro/test_hp_tuner.py b/src/main/python/tests/scuro/test_hp_tuner.py index 802f737b0a5..8484a352e44 100644 --- a/src/main/python/tests/scuro/test_hp_tuner.py +++ b/src/main/python/tests/scuro/test_hp_tuner.py @@ -239,10 +239,11 @@ def run_hp_for_modality( fusion_results, k=1, optimize_unimodal=tune_unimodal_representations, + max_eval_per_rep=20, ) else: - hp.tune_unimodal_representations() + hp.tune_unimodal_representations(max_eval_per_rep=10) assert len(hp.results) == len(self.tasks) assert len(hp.results[self.tasks[0].model.name]) == 2