This is an automated email from the ASF dual-hosted git repository. cdionysio pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new dc1f363565 [SYSTEMDS-3835] Add Modality Data Type dc1f363565 is described below commit dc1f36356516f677780b8dd82da3ec55e75ca1be Author: Christina Dionysio <diony...@tu-berlin.de> AuthorDate: Mon Jul 28 12:19:26 2025 +0200 [SYSTEMDS-3835] Add Modality Data Type This patch adds a data type for all modalities. Closes #2270. --- src/main/python/systemds/scuro/__init__.py | 6 +- .../systemds/scuro/dataloader/audio_loader.py | 18 ++- .../systemds/scuro/dataloader/base_loader.py | 63 ++++++++-- .../systemds/scuro/dataloader/json_loader.py | 5 +- .../systemds/scuro/dataloader/text_loader.py | 3 +- .../systemds/scuro/dataloader/video_loader.py | 16 ++- .../systemds/scuro/drsearch/operator_registry.py | 1 + .../scuro/drsearch/representation_cache.py | 3 +- .../systemds/scuro/modality/joined_transformed.py | 4 +- .../python/systemds/scuro/modality/modality.py | 10 +- .../python/systemds/scuro/modality/transformed.py | 31 +++-- src/main/python/systemds/scuro/modality/type.py | 8 ++ .../systemds/scuro/modality/unimodal_modality.py | 17 +-- .../representations/aggregated_representation.py | 4 +- .../systemds/scuro/representations/average.py | 2 + .../python/systemds/scuro/representations/bert.py | 38 ++++-- .../python/systemds/scuro/representations/bow.py | 4 +- .../scuro/representations/concatenation.py | 4 +- .../systemds/scuro/representations/fusion.py | 3 + .../python/systemds/scuro/representations/glove.py | 10 +- .../{multiplication.py => hadamard.py} | 24 ++-- .../systemds/scuro/representations/image_bind.py | 100 +++++++++++++++ .../python/systemds/scuro/representations/lstm.py | 18 +++ .../python/systemds/scuro/representations/max.py | 48 ++------ .../scuro/representations/mel_spectrogram.py | 2 +- .../python/systemds/scuro/representations/mfcc.py | 2 +- .../systemds/scuro/representations/optical_flow.py | 5 +- .../systemds/scuro/representations/resnet.py | 50 ++++++-- .../systemds/scuro/representations/spectrogram.py | 2 +- .../python/systemds/scuro/representations/tfidf.py | 4 +- .../systemds/scuro/representations/wav2vec.py | 2 +- .../{window.py => window_aggregation.py} | 0 .../systemds/scuro/representations/word2vec.py | 4 +- .../python/systemds/scuro/representations/x3d.py | 74 ++++++++++- .../converter.py} | 38 ++++-- .../python/systemds/scuro/utils/torch_dataset.py | 40 +++--- src/main/python/systemds/utils/helpers.py | 2 +- src/main/python/tests/scuro/data_generator.py | 137 +++++++++++++++++++-- src/main/python/tests/scuro/test_dr_search.py | 52 +++----- src/main/python/tests/scuro/test_fusion_orders.py | 95 ++++++++++++++ .../python/tests/scuro/test_multimodal_fusion.py | 65 ++++++---- .../python/tests/scuro/test_multimodal_join.py | 52 ++++---- .../python/tests/scuro/test_operator_registry.py | 4 +- .../python/tests/scuro/test_unimodal_optimizer.py | 57 +++++---- .../tests/scuro/test_unimodal_representations.py | 3 +- .../python/tests/scuro/test_window_operations.py | 2 +- 46 files changed, 823 insertions(+), 309 deletions(-) diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index 4b2185316a..1c3cfe9223 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -39,7 +39,7 @@ from systemds.scuro.representations.lstm import LSTM from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.mfcc import MFCC -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.optical_flow import OpticalFlow from systemds.scuro.representations.representation import Representation from systemds.scuro.representations.representation_dataloader import NPY @@ -52,7 +52,7 @@ from systemds.scuro.representations.swin_video_transformer import SwinVideoTrans from systemds.scuro.representations.tfidf import TfIdf from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.x3d import X3D from systemds.scuro.models.model import Model @@ -94,7 +94,7 @@ __all__ = [ "RowMax", "MelSpectrogram", "MFCC", - "Multiplication", + "Hadamard", "OpticalFlow", "Representation", "NPY", diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py index a008962680..a1dad304e5 100644 --- a/src/main/python/systemds/scuro/dataloader/audio_loader.py +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -21,6 +21,8 @@ from typing import List, Optional, Union import librosa +import numpy as np + from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.modality.type import ModalityType @@ -30,15 +32,27 @@ class AudioLoader(BaseLoader): self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str] = np.float32, chunk_size: Optional[int] = None, normalize: bool = True, + load=True, ): - super().__init__(source_path, indices, chunk_size, ModalityType.AUDIO) + super().__init__( + source_path, indices, data_type, chunk_size, ModalityType.AUDIO + ) self.normalize = normalize + self.load_data_from_file = load def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) - audio, sr = librosa.load(file) + # if not self.load_data_from_file: + # import numpy as np + # + # self.metadata[file] = self.modality_type.create_audio_metadata( + # 1000, np.array([0]) + # ) + # else: + audio, sr = librosa.load(file, dtype=self._data_type) if self.normalize: audio = librosa.util.normalize(audio) diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py index ea2b25bbb4..f21f212e7a 100644 --- a/src/main/python/systemds/scuro/dataloader/base_loader.py +++ b/src/main/python/systemds/scuro/dataloader/base_loader.py @@ -21,6 +21,9 @@ import os from abc import ABC, abstractmethod from typing import List, Optional, Union +import math + +import numpy as np class BaseLoader(ABC): @@ -28,6 +31,7 @@ class BaseLoader(ABC): self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str], chunk_size: Optional[int] = None, modality_type=None, ): @@ -48,6 +52,7 @@ class BaseLoader(ABC): self._next_chunk = 0 self._num_chunks = 1 self._chunk_size = None + self._data_type = data_type if chunk_size: self.chunk_size = chunk_size @@ -59,7 +64,7 @@ class BaseLoader(ABC): @chunk_size.setter def chunk_size(self, value): self._chunk_size = value - self._num_chunks = int(len(self.indices) / self._chunk_size) + self._num_chunks = int(math.ceil(len(self.indices) / self._chunk_size)) @property def num_chunks(self): @@ -69,6 +74,14 @@ class BaseLoader(ABC): def next_chunk(self): return self._next_chunk + @property + def data_type(self): + return self._data_type + + @data_type.setter + def data_type(self, data_type): + self._data_type = self.resolve_data_type(data_type) + def reset(self): self._next_chunk = 0 self.data = [] @@ -110,16 +123,25 @@ class BaseLoader(ABC): return self._load(next_chunk_indices) def _load(self, indices: List[str]): - is_dir = True if os.path.isdir(self.source_path) else False + file_names = self.get_file_names(indices) + if isinstance(file_names, str): + self.extract(file_names, indices) + else: + for file_name in file_names: + self.extract(file_name) + + return self.data, self.metadata + def get_file_names(self, indices=None): + is_dir = True if os.path.isdir(self.source_path) else False + file_names = [] if is_dir: _, ext = os.path.splitext(os.listdir(self.source_path)[0]) - for index in indices: - self.extract(self.source_path + index + ext) + for index in self.indices if indices is None else indices: + file_names.append(self.source_path + index + ext) + return file_names else: - self.extract(self.source_path, indices) - - return self.data, self.metadata + return self.source_path @abstractmethod def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): @@ -137,3 +159,30 @@ class BaseLoader(ABC): if file_size == 0: raise ("File {0} is empty".format(file)) + + @staticmethod + def resolve_data_type(data_type): + if isinstance(data_type, str): + if data_type.lower() in [ + "float16", + "float32", + "float64", + "int16", + "int32", + "int64", + ]: + return np.dtype(data_type) + else: + raise ValueError(f"Unsupported data_type string: {data_type}") + elif data_type in [ + np.float16, + np.float32, + np.float64, + np.int16, + np.int32, + np.int64, + str, + ]: + return data_type + else: + raise ValueError(f"Unsupported data_type: {data_type}") diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py index edef7f205b..a355edded8 100644 --- a/src/main/python/systemds/scuro/dataloader/json_loader.py +++ b/src/main/python/systemds/scuro/dataloader/json_loader.py @@ -20,6 +20,8 @@ # ------------------------------------------------------------- import json +import numpy as np + from systemds.scuro.modality.type import ModalityType from systemds.scuro.dataloader.base_loader import BaseLoader from typing import Optional, List, Union @@ -31,9 +33,10 @@ class JSONLoader(BaseLoader): source_path: str, indices: List[str], field: str, + data_type: Union[np.dtype, str] = str, chunk_size: Optional[int] = None, ): - super().__init__(source_path, indices, chunk_size, ModalityType.TEXT) + super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT) self.field = field def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py index 3f87155147..6689fb6d92 100644 --- a/src/main/python/systemds/scuro/dataloader/text_loader.py +++ b/src/main/python/systemds/scuro/dataloader/text_loader.py @@ -29,10 +29,11 @@ class TextLoader(BaseLoader): self, source_path: str, indices: List[str], + data_type: str = str, chunk_size: Optional[int] = None, prefix: Optional[Pattern[str]] = None, ): - super().__init__(source_path, indices, chunk_size, ModalityType.TEXT) + super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT) self.prefix = prefix def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py index 333960e698..96ea5f11f6 100644 --- a/src/main/python/systemds/scuro/dataloader/video_loader.py +++ b/src/main/python/systemds/scuro/dataloader/video_loader.py @@ -32,12 +32,22 @@ class VideoLoader(BaseLoader): self, source_path: str, indices: List[str], + data_type: Union[np.dtype, str] = np.float16, chunk_size: Optional[int] = None, + load=True, ): - super().__init__(source_path, indices, chunk_size, ModalityType.VIDEO) + super().__init__( + source_path, indices, data_type, chunk_size, ModalityType.VIDEO + ) + self.load_data_from_file = load def extract(self, file: str, index: Optional[Union[str, List[str]]] = None): self.file_sanity_check(file) + # if not self.load_data_from_file: + # self.metadata[file] = self.modality_type.create_video_metadata( + # 30, 10, 100, 100, 3 + # ) + # else: cap = cv2.VideoCapture(file) if not cap.isOpened(): @@ -60,8 +70,8 @@ class VideoLoader(BaseLoader): if not ret: break frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - frame = frame.astype(np.float32) / 255.0 + frame = frame.astype(self._data_type) / 255.0 frames.append(frame) - self.data.append(frames) + self.data.append(np.stack(frames)) diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py b/src/main/python/systemds/scuro/drsearch/operator_registry.py index 942e5bb80e..cfd313eb56 100644 --- a/src/main/python/systemds/scuro/drsearch/operator_registry.py +++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py @@ -58,6 +58,7 @@ class Registry: return self._representations[modality] def get_context_operators(self): + # TODO: return modality specific context operations return self._context_operators def get_fusion_operators(self): diff --git a/src/main/python/systemds/scuro/drsearch/representation_cache.py b/src/main/python/systemds/scuro/drsearch/representation_cache.py index fc78167f2e..4df478272d 100644 --- a/src/main/python/systemds/scuro/drsearch/representation_cache.py +++ b/src/main/python/systemds/scuro/drsearch/representation_cache.py @@ -112,7 +112,8 @@ class RepresentationCache: metadata = pickle.load(f) transformed_modality = TransformedModality( - modality.modality_type, op_names, modality.modality_id, metadata + modality, + op_names, ) data = None with open(f"{filename}.pkl", "rb") as f: diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py index a0ab8c4ce9..6c6190e03c 100644 --- a/src/main/python/systemds/scuro/modality/joined_transformed.py +++ b/src/main/python/systemds/scuro/modality/joined_transformed.py @@ -25,7 +25,7 @@ import numpy as np from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.utils import pad_sequences -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation class JoinedTransformedModality(Modality): @@ -70,7 +70,7 @@ class JoinedTransformedModality(Modality): self.data = pad_sequences(self.data) return self - def window(self, window_size, aggregation): + def window_aggregation(self, window_size, aggregation): w = WindowAggregation(window_size, aggregation) self.left_modality.data = w.execute(self.left_modality) self.right_modality.data = w.execute(self.right_modality) diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index c16db00172..87d5b5ee4e 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -29,7 +29,9 @@ from systemds.scuro.representations import utils class Modality: - def __init__(self, modalityType: ModalityType, modality_id=-1, metadata={}): + def __init__( + self, modalityType: ModalityType, modality_id=-1, metadata={}, data_type=None + ): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the modality @@ -38,7 +40,7 @@ class Modality: self.schema = modalityType.get_schema() self.metadata = metadata self.data = [] - self.data_type = None + self.data_type = data_type self.cost = None self.shape = None self.modality_id = modality_id @@ -67,7 +69,9 @@ class Modality: """ Create a copy of the modality instance """ - return type(self)(self.modality_type, self.metadata) + return type(self)( + self.modality_type, self.modality_id, self.metadata, self.data_type + ) def update_metadata(self): """ diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py index aba59c1efb..362764d21e 100644 --- a/src/main/python/systemds/scuro/modality/transformed.py +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -24,24 +24,28 @@ from operator import or_ from systemds.scuro.modality.type import ModalityType from systemds.scuro.modality.joined import JoinedModality from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation class TransformedModality(Modality): - def __init__(self, modality_type, transformation, modality_id, metadata): + def __init__(self, modality, transformation, new_modality_type=None): """ Parent class of the different Modalities (unimodal & multimodal) :param modality_type: Type of the original modality(ies) :param transformation: Representation to be applied on the modality """ - super().__init__(modality_type, modality_id, metadata) + if new_modality_type is None: + new_modality_type = modality.modality_type + + metadata = modality.metadata.copy() if modality.metadata is not None else None + super().__init__( + new_modality_type, modality.modality_id, metadata, modality.data_type + ) self.transformation = transformation def copy_from_instance(self): - return type(self)( - self.modality_type, self.transformation, self.modality_id, self.metadata - ) + return type(self)(self, self.transformation) def join(self, right, join_condition): chunked_execution = False @@ -65,19 +69,15 @@ class TransformedModality(Modality): return joined_modality - def window(self, windowSize, aggregation): - transformed_modality = TransformedModality( - self.modality_type, "window", self.modality_id, self.metadata - ) + def window_aggregation(self, windowSize, aggregation): w = WindowAggregation(windowSize, aggregation) + transformed_modality = TransformedModality(self, w) transformed_modality.data = w.execute(self) return transformed_modality def context(self, context_operator): - transformed_modality = TransformedModality( - self.modality_type, context_operator.name, self.modality_id, self.metadata - ) + transformed_modality = TransformedModality(self, context_operator) transformed_modality.data = context_operator.execute(self) return transformed_modality @@ -94,10 +94,7 @@ class TransformedModality(Modality): :param fusion_method: The fusion method to be used to combine modalities """ fused_modality = TransformedModality( - ModalityType.EMBEDDING, - fusion_method, - self.modality_id, - self.metadata, + self, fusion_method, ModalityType.EMBEDDING ) modalities = [self] if isinstance(other, list): diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py index 4b59c263d6..a479e07085 100644 --- a/src/main/python/systemds/scuro/modality/type.py +++ b/src/main/python/systemds/scuro/modality/type.py @@ -191,6 +191,14 @@ class ModalityType(Flag): def update_metadata(self, md, data): return ModalitySchemas.update_metadata(self.name, md, data) + def add_alignment(self, md, alignment_timestamps): + md["alignment_timestamps"] = alignment_timestamps + return md + + def add_field(self, md, field, data): + md[field] = data + return md + def create_audio_metadata(self, sampling_rate, data): md = deepcopy(self.get_schema()) md = ModalitySchemas.update_base_metadata(md, data, True) diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py index 714fe42c33..c0ee70557c 100644 --- a/src/main/python/systemds/scuro/modality/unimodal_modality.py +++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py @@ -37,7 +37,12 @@ class UnimodalModality(Modality): :param data_loader: Defines how the raw data should be loaded :param modality_type: Type of the modality """ - super().__init__(data_loader.modality_type, ModalityIdentifier().new_id(), None) + super().__init__( + data_loader.modality_type, + ModalityIdentifier().new_id(), + {}, + data_loader.data_type, + ) self.data_loader = data_loader def copy_from_instance(self): @@ -84,9 +89,7 @@ class UnimodalModality(Modality): if not self.has_data(): self.extract_raw_data() - transformed_modality = TransformedModality( - self.modality_type, context_operator.name, self.modality_id, self.metadata - ) + transformed_modality = TransformedModality(self, context_operator) transformed_modality.data = context_operator.execute(self) return transformed_modality @@ -101,10 +104,8 @@ class UnimodalModality(Modality): def apply_representation(self, representation): new_modality = TransformedModality( - self.modality_type, - representation.name, - self.modality_id, - self.data_loader.metadata.copy(), + self, + representation, ) new_modality.data = [] diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/representations/aggregated_representation.py index 46e6b8bed2..9412c5be00 100644 --- a/src/main/python/systemds/scuro/representations/aggregated_representation.py +++ b/src/main/python/systemds/scuro/representations/aggregated_representation.py @@ -28,8 +28,6 @@ class AggregatedRepresentation(Representation): self.aggregation = aggregation def transform(self, modality): - aggregated_modality = TransformedModality( - modality.modality_type, self.name, modality.modality_id, modality.metadata - ) + aggregated_modality = TransformedModality(modality, self) aggregated_modality.data = self.aggregation.execute(modality) return aggregated_modality diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py index 4c6b0e1787..8a7e6b9ec8 100644 --- a/src/main/python/systemds/scuro/representations/average.py +++ b/src/main/python/systemds/scuro/representations/average.py @@ -37,6 +37,8 @@ class Average(Fusion): Combines modalities using averaging """ super().__init__("Average") + self.associative = True + self.commutative = True def transform(self, modalities: List[Modality]): for modality in modalities: diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 802d7e3d0b..8d8d40f4fd 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -22,11 +22,15 @@ from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch -from transformers import BertTokenizer, BertModel +from transformers import BertTokenizerFast, BertModel from systemds.scuro.representations.utils import save_embeddings from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation +import os + +os.environ["TOKENIZERS_PARALLELISM"] = "false" + @register_representation(ModalityType.TEXT) class Bert(UnimodalRepresentation): @@ -38,17 +42,15 @@ class Bert(UnimodalRepresentation): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata - ) + transformed_modality = TransformedModality(modality, self) model_name = "bert-base-uncased" - tokenizer = BertTokenizer.from_pretrained( + tokenizer = BertTokenizerFast.from_pretrained( model_name, clean_up_tokenization_spaces=True ) model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(modality.data, model, tokenizer) + embeddings = self.create_embeddings(modality, model, tokenizer) if self.output_file is not None: save_embeddings(embeddings, self.output_file) @@ -56,15 +58,29 @@ class Bert(UnimodalRepresentation): transformed_modality.data = embeddings return transformed_modality - def create_embeddings(self, data, model, tokenizer): + def create_embeddings(self, modality, model, tokenizer): embeddings = [] - for d in data: - inputs = tokenizer(d, return_tensors="pt", padding=True, truncation=True) + for i, d in enumerate(modality.data): + inputs = tokenizer( + d, + return_offsets_mapping=True, + return_tensors="pt", + padding=True, + truncation=True, + ) + + ModalityType.TEXT.add_field( + list(modality.metadata.values())[i], + "token_to_character_mapping", + inputs.data["offset_mapping"][0].tolist(), + ) + + del inputs.data["offset_mapping"] with torch.no_grad(): outputs = model(**inputs) - cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() - embeddings.append(cls_embedding.reshape(1, -1)) + cls_embedding = outputs.last_hidden_state[0].numpy() + embeddings.append(cls_embedding) return embeddings diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index e2bc94041f..6778811c49 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -39,9 +39,7 @@ class BoW(UnimodalRepresentation): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata - ) + transformed_modality = TransformedModality(modality, self) vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py index 1265563b6c..c7ce33ab5c 100644 --- a/src/main/python/systemds/scuro/representations/concatenation.py +++ b/src/main/python/systemds/scuro/representations/concatenation.py @@ -58,7 +58,9 @@ class Concatenation(Fusion): [ data, pad_sequences( - modality.data, maxlen=max_emb_size, dtype="float32" + modality.data, + maxlen=max_emb_size, + dtype=modality.data.dtype, ), ], axis=-1, diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index 773452371b..cbbb5606e6 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -33,6 +33,9 @@ class Fusion(Representation): :param name: Name of the fusion type """ super().__init__(name, parameters) + self.associative = False + self.commutative = False + self.needs_alignment = False def transform(self, modalities: List[Modality]): """ diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py index 66a6847a94..d948567f3f 100644 --- a/src/main/python/systemds/scuro/representations/glove.py +++ b/src/main/python/systemds/scuro/representations/glove.py @@ -21,7 +21,7 @@ import numpy as np from gensim.utils import tokenize - +from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.utils import save_embeddings from systemds.scuro.modality.type import ModalityType @@ -46,11 +46,12 @@ class GloVe(UnimodalRepresentation): self.glove_path = glove_path self.output_file = output_file - def transform(self, data): + def transform(self, modality): + transformed_modality = TransformedModality(modality, self) glove_embeddings = load_glove_embeddings(self.glove_path) embeddings = [] - for sentences in data: + for sentences in modality.data: tokens = list(tokenize(sentences.lower())) embeddings.append( np.mean( @@ -66,4 +67,5 @@ class GloVe(UnimodalRepresentation): if self.output_file is not None: save_embeddings(np.array(embeddings), self.output_file) - return np.array(embeddings) + transformed_modality.data = np.array(embeddings) + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/multiplication.py b/src/main/python/systemds/scuro/representations/hadamard.py similarity index 67% rename from src/main/python/systemds/scuro/representations/multiplication.py rename to src/main/python/systemds/scuro/representations/hadamard.py index 8d1e7f8c90..138003b874 100644 --- a/src/main/python/systemds/scuro/representations/multiplication.py +++ b/src/main/python/systemds/scuro/representations/hadamard.py @@ -24,7 +24,6 @@ from typing import List import numpy as np from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.utils import pad_sequences from systemds.scuro.representations.fusion import Fusion @@ -32,23 +31,18 @@ from systemds.scuro.drsearch.operator_registry import register_fusion_operator @register_fusion_operator() -class Multiplication(Fusion): +class Hadamard(Fusion): def __init__(self): """ - Combines modalities using elementwise multiply + Combines modalities using elementwise multiply (Hadamard product) """ - super().__init__("Multiplication") + super().__init__("Hadamard") + self.needs_alignment = True # zero padding falsifies the result + self.commutative = True + self.associative = True def transform(self, modalities: List[Modality], train_indices=None): - max_emb_size = self.get_max_embedding_size(modalities) + # TODO: check for alignment in the metadata + fused_data = np.prod([m.data for m in modalities], axis=0) - data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32") - - for m in range(1, len(modalities)): - # scaled = self.scale_data(modalities[m].data, train_indices) - data = np.multiply( - data, - pad_sequences(modalities[m].data, maxlen=max_emb_size, dtype="float32"), - ) - - return data + return fused_data diff --git a/src/main/python/systemds/scuro/representations/image_bind.py b/src/main/python/systemds/scuro/representations/image_bind.py new file mode 100644 index 0000000000..e934d521af --- /dev/null +++ b/src/main/python/systemds/scuro/representations/image_bind.py @@ -0,0 +1,100 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- +import torch +import imagebind.data as data + +from imagebind.models.imagebind_model import ModalityType as IBModalityType + +from imagebind.models import imagebind_model +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.representations.unimodal import UnimodalRepresentation +from systemds.scuro.representations.utils import save_embeddings + +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.drsearch.operator_registry import register_representation + +if torch.backends.mps.is_available(): + DEVICE = torch.device("mps") +# elif torch.cuda.is_available(): +# DEVICE = torch.device("cuda") +else: + DEVICE = torch.device("cpu") + + +# @register_representation([ModalityType.TEXT, ModalityType.AUDIO, ModalityType.VIDEO]) +class ImageBind(UnimodalRepresentation): + def __init__(self): + parameters = {} + super().__init__("ImageBind", ModalityType.EMBEDDING, parameters) + self.model = imagebind_model.imagebind_huge(pretrained=True) + for param in self.model.parameters(): + param.requires_grad = False + self.model.eval() + self.model.to(DEVICE) + + def transform(self, modality): + transformed_modality = TransformedModality( + modality, self, ModalityType.EMBEDDING + ) + + result = [] + if modality.modality_type == ModalityType.TEXT: + for i, instance in enumerate(modality.data): + text_inputs = data.load_and_transform_text(instance, DEVICE) + text_embeddings = self.model({IBModalityType.TEXT: text_inputs})[ + IBModalityType.TEXT + ] + result.append(text_embeddings.mean(axis=0).cpu().detach().numpy()) + if modality.modality_type == ModalityType.AUDIO: + audio_inputs = data.load_and_transform_audio_data( + list(modality.metadata)[ + (modality.data_loader.next_chunk - 1) + * (modality.data_loader.chunk_size) : ( + modality.data_loader.next_chunk - 1 + ) + * (modality.data_loader.chunk_size) + + (modality.data_loader.chunk_size) + ], + DEVICE, + ) + audio_embeddings = self.model({IBModalityType.AUDIO: audio_inputs})[ + IBModalityType.AUDIO + ] + result.extend(audio_embeddings.cpu().detach().numpy()) + if modality.modality_type == ModalityType.VIDEO: + video_inputs = data.load_and_transform_video_data( + list(modality.metadata)[ + (modality.data_loader.next_chunk - 1) + * (modality.data_loader.chunk_size) : ( + modality.data_loader.next_chunk - 1 + ) + * (modality.data_loader.chunk_size) + + (modality.data_loader.chunk_size) + ], + DEVICE, + ) + video_embeddings = self.model({IBModalityType.VISION: video_inputs})[ + IBModalityType.VISION + ] + result.extend(video_embeddings.cpu().detach().numpy()) + + transformed_modality.data = result + return transformed_modality diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index a82a1e2500..cbab0f6897 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -18,6 +18,9 @@ # under the License. # # ------------------------------------------------------------- +import os +import random + import torch from torch import nn @@ -31,6 +34,8 @@ from systemds.scuro.representations.fusion import Fusion from systemds.scuro.drsearch.operator_registry import register_fusion_operator +# TODO: concatenate before embedding +# Make this a hyperparameter @register_fusion_operator() class LSTM(Fusion): def __init__(self, width=128, depth=1, dropout_rate=0.1): @@ -42,8 +47,18 @@ class LSTM(Fusion): self.width = width self.dropout_rate = dropout_rate self.unimodal_embeddings = {} + seed = 42 + + os.environ["PYTHONHASHSEED"] = str(seed) + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False def transform(self, modalities: List[Modality]): + self.unimodal_embeddings = {} size = len(modalities[0].data) result = np.zeros((size, 0)) @@ -60,6 +75,9 @@ class LSTM(Fusion): return result def run_lstm(self, data): + if isinstance(data, list): + data = np.array(data) + d = data.astype(np.float32) dim = d.shape[-1] d = torch.from_numpy(d) diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py index 5a787dcf0c..6ecf5fd52f 100644 --- a/src/main/python/systemds/scuro/representations/max.py +++ b/src/main/python/systemds/scuro/representations/max.py @@ -18,14 +18,11 @@ # under the License. # # ------------------------------------------------------------- -import itertools from typing import List import numpy as np from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.utils import pad_sequences - from systemds.scuro.representations.fusion import Fusion from systemds.scuro.drsearch.operator_registry import register_fusion_operator @@ -33,52 +30,21 @@ from systemds.scuro.drsearch.operator_registry import register_fusion_operator @register_fusion_operator() class RowMax(Fusion): - def __init__(self, split=4): + def __init__(self): """ Combines modalities by computing the outer product of a modality combination and taking the row max """ super().__init__("RowMax") - self.split = split + self.needs_alignment = True + self.associative = True + self.commutative = True def transform( self, modalities: List[Modality], ): - if len(modalities) < 2: - return np.array(modalities[0].data) - - max_emb_size = self.get_max_embedding_size(modalities) - - padded_modalities = [] - for modality in modalities: - d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32") - padded_modalities.append(d) - - split_rows = int(len(modalities[0].data) / self.split) - - data = [] - - for combination in itertools.combinations(padded_modalities, 2): - combined = None - for i in range(0, self.split): - start = split_rows * i - end = ( - split_rows * (i + 1) - if i < (self.split - 1) - else len(modalities[0].data) - ) - m = np.einsum( - "bi,bo->bio", combination[0][start:end], combination[1][start:end] - ) - m = m.max(axis=2) - if combined is None: - combined = m - else: - combined = np.concatenate((combined, m), axis=0) - data.append(combined) - - data = np.stack(data) - data = data.max(axis=0) + # TODO: need to check if data is aligned - same number of dimension + fused_data = np.maximum.reduce([m.data for m in modalities]) - return np.array(data) + return fused_data diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 4095ceead0..8c14c03ac6 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -43,7 +43,7 @@ class MelSpectrogram(UnimodalRepresentation): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/mfcc.py b/src/main/python/systemds/scuro/representations/mfcc.py index 75cc00d62d..234e93246f 100644 --- a/src/main/python/systemds/scuro/representations/mfcc.py +++ b/src/main/python/systemds/scuro/representations/mfcc.py @@ -45,7 +45,7 @@ class MFCC(UnimodalRepresentation): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/optical_flow.py b/src/main/python/systemds/scuro/representations/optical_flow.py index 1fb922d7a3..27817302d4 100644 --- a/src/main/python/systemds/scuro/representations/optical_flow.py +++ b/src/main/python/systemds/scuro/representations/optical_flow.py @@ -48,10 +48,7 @@ class OpticalFlow(UnimodalRepresentation): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, - "opticalFlow", - modality.modality_id, - modality.metadata, + modality, self, self.output_modality_type ) for video_id, instance in enumerate(modality.data): diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 68771eccdd..bdfbfb17fc 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -18,10 +18,11 @@ # under the License. # # ------------------------------------------------------------- +from systemds.scuro.utils.converter import numpy_dtype_to_torch_dtype from systemds.scuro.utils.torch_dataset import CustomDataset from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.representations.unimodal import UnimodalRepresentation -from typing import Callable, Dict, Tuple, Any +from typing import Tuple, Any from systemds.scuro.drsearch.operator_registry import register_representation import torch.utils.data import torch @@ -42,6 +43,7 @@ else: ) class ResNet(UnimodalRepresentation): def __init__(self, layer="avgpool", model_name="ResNet18", output_file=None): + self.data_type = torch.bfloat16 self.model_name = model_name parameters = self._get_parameters() super().__init__( @@ -68,25 +70,38 @@ class ResNet(UnimodalRepresentation): def model_name(self, model_name): self._model_name = model_name if model_name == "ResNet18": - self.model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet18(weights=models.ResNet18_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet34": self.model = models.resnet34(weights=models.ResNet34_Weights.DEFAULT).to( DEVICE ) + self.model = self.model.to(self.data_type) elif model_name == "ResNet50": - self.model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet50(weights=models.ResNet50_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet101": - self.model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet101(weights=models.ResNet101_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + elif model_name == "ResNet152": - self.model = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to( - DEVICE + self.model = ( + models.resnet152(weights=models.ResNet152_Weights.DEFAULT) + .to(DEVICE) + .to(self.data_type) ) + else: raise NotImplementedError @@ -110,7 +125,11 @@ class ResNet(UnimodalRepresentation): return parameters def transform(self, modality): - dataset = CustomDataset(modality.data) + self.data_type = numpy_dtype_to_torch_dtype(modality.data_type) + if next(self.model.parameters()).dtype != self.data_type: + self.model = self.model.to(self.data_type) + + dataset = CustomDataset(modality.data, self.data_type, DEVICE) embeddings = {} res5c_output = None @@ -132,7 +151,7 @@ class ResNet(UnimodalRepresentation): for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] - frames = instance["data"][0].to(DEVICE) + frames = instance["data"][0] embeddings[video_id] = [] batch_size = 64 @@ -146,13 +165,18 @@ class ResNet(UnimodalRepresentation): pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1)) embeddings[video_id].extend( - torch.flatten(pooled, 1).detach().cpu().numpy() + torch.flatten(pooled, 1) + .detach() + .cpu() + .float() + .numpy() + .astype(modality.data_type) ) embeddings[video_id] = np.array(embeddings[video_id]) transformed_modality = TransformedModality( - self.output_modality_type, "resnet", modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) transformed_modality.data = list(embeddings.values()) diff --git a/src/main/python/systemds/scuro/representations/spectrogram.py b/src/main/python/systemds/scuro/representations/spectrogram.py index b5558b1b26..6a713a3d21 100644 --- a/src/main/python/systemds/scuro/representations/spectrogram.py +++ b/src/main/python/systemds/scuro/representations/spectrogram.py @@ -38,7 +38,7 @@ class Spectrogram(UnimodalRepresentation): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] max_length = 0 diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index c17527b476..1df5a1fde0 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -38,9 +38,7 @@ class TfIdf(UnimodalRepresentation): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata - ) + transformed_modality = TransformedModality(modality, self) vectorizer = TfidfVectorizer(min_df=self.min_df) diff --git a/src/main/python/systemds/scuro/representations/wav2vec.py b/src/main/python/systemds/scuro/representations/wav2vec.py index bf251b101c..29f5bcbea0 100644 --- a/src/main/python/systemds/scuro/representations/wav2vec.py +++ b/src/main/python/systemds/scuro/representations/wav2vec.py @@ -46,7 +46,7 @@ class Wav2Vec(UnimodalRepresentation): def transform(self, modality): transformed_modality = TransformedModality( - self.output_modality_type, self, modality.modality_id, modality.metadata + modality, self, self.output_modality_type ) result = [] diff --git a/src/main/python/systemds/scuro/representations/window.py b/src/main/python/systemds/scuro/representations/window_aggregation.py similarity index 100% rename from src/main/python/systemds/scuro/representations/window.py rename to src/main/python/systemds/scuro/representations/window_aggregation.py diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index e1d1669d9b..0210207a01 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -54,9 +54,7 @@ class W2V(UnimodalRepresentation): self.output_file = output_file def transform(self, modality): - transformed_modality = TransformedModality( - modality.modality_type, self, modality.modality_id, modality.metadata - ) + transformed_modality = TransformedModality(modality, self) t = [list(tokenize(s.lower())) for s in modality.data] model = Word2Vec( sentences=t, diff --git a/src/main/python/systemds/scuro/representations/x3d.py b/src/main/python/systemds/scuro/representations/x3d.py index bb5d1ec5ed..1629ac6f30 100644 --- a/src/main/python/systemds/scuro/representations/x3d.py +++ b/src/main/python/systemds/scuro/representations/x3d.py @@ -30,11 +30,12 @@ import torchvision.transforms as transforms import numpy as np from systemds.scuro.modality.type import ModalityType from systemds.scuro.drsearch.operator_registry import register_representation +import math if torch.backends.mps.is_available(): DEVICE = torch.device("mps") -# elif torch.cuda.is_available(): -# DEVICE = torch.device("cuda") +elif torch.cuda.is_available(): + DEVICE = torch.device("cuda") else: DEVICE = torch.device("cpu") @@ -127,7 +128,74 @@ class X3D(UnimodalRepresentation): embeddings[video_id] = np.array(embeddings[video_id]) transformed_modality = TransformedModality( - self.output_modality_type, "x3d", modality.modality_id, modality.metadata + modality, self, self.output_modality_type + ) + + transformed_modality.data = list(embeddings.values()) + + return transformed_modality + + +class I3D(UnimodalRepresentation): + def __init__(self, layer="avgpool", model_name="i3d", output_file=None): + self.model_name = model_name + parameters = self._get_parameters() + self.model = torch.hub.load( + "facebookresearch/pytorchvideo", "i3d_r50", pretrained=True + ).to(DEVICE) + super().__init__("I3D", ModalityType.TIMESERIES, parameters) + + self.output_file = output_file + self.layer_name = layer + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def _get_parameters(self, high_level=True): + parameters = {"model_name": [], "layer_name": []} + for m in ["r3d", "s3d"]: + parameters["model_name"].append(m) + + if high_level: + parameters["layer_name"] = [ + "conv1", + "layer1", + "layer2", + "layer3", + "layer4", + "avgpool", + ] + else: + for name, layer in self.model.named_modules(): + parameters["layer_name"].append(name) + return parameters + + def transform(self, modality): + dataset = CustomDataset(modality.data, torch.float32, DEVICE) + embeddings = {} + + features = None + + def hook(module, input, output): + pooled = torch.nn.functional.adaptive_avg_pool3d(output, 1).squeeze() + nonlocal features + features = pooled.detach().cpu().numpy() + + handle = self.model.blocks[6].dropout.register_forward_hook(hook) + + for instance in dataset: + video_id = instance["id"] + frames = instance["data"].to(DEVICE) + embeddings[video_id] = [] + + batch = torch.transpose(frames, 1, 0) + batch = batch.unsqueeze(0) + _ = self.model(batch) + + embeddings[video_id] = features + + transformed_modality = TransformedModality( + modality, self, self.output_modality_type ) transformed_modality.data = list(embeddings.values()) diff --git a/src/main/python/systemds/scuro/representations/aggregated_representation.py b/src/main/python/systemds/scuro/utils/converter.py similarity index 54% copy from src/main/python/systemds/scuro/representations/aggregated_representation.py copy to src/main/python/systemds/scuro/utils/converter.py index 46e6b8bed2..030fc4ae29 100644 --- a/src/main/python/systemds/scuro/representations/aggregated_representation.py +++ b/src/main/python/systemds/scuro/utils/converter.py @@ -18,18 +18,32 @@ # under the License. # # ------------------------------------------------------------- -from systemds.scuro.modality.transformed import TransformedModality -from systemds.scuro.representations.representation import Representation +import numpy as np +import torch -class AggregatedRepresentation(Representation): - def __init__(self, aggregation): - super().__init__("AggregatedRepresentation", aggregation.parameters) - self.aggregation = aggregation - def transform(self, modality): - aggregated_modality = TransformedModality( - modality.modality_type, self.name, modality.modality_id, modality.metadata - ) - aggregated_modality.data = self.aggregation.execute(modality) - return aggregated_modality +def numpy_dtype_to_torch_dtype(dtype): + """ + Convert a NumPy dtype (or dtype string) to the corresponding PyTorch dtype. + Raises ValueError if the dtype is not supported. + """ + if isinstance(dtype, torch.dtype): + return dtype + + mapping = { + np.float32: torch.float32, + np.float64: torch.float64, + np.float16: torch.bfloat16, + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + } + + np_dtype = np.dtype(dtype) + if np_dtype.type in mapping: + return mapping[np_dtype.type] + else: + raise ValueError(f"No corresponding torch dtype for NumPy dtype {np_dtype}") diff --git a/src/main/python/systemds/scuro/utils/torch_dataset.py b/src/main/python/systemds/scuro/utils/torch_dataset.py index a0f3d88b6a..c04be0ec7b 100644 --- a/src/main/python/systemds/scuro/utils/torch_dataset.py +++ b/src/main/python/systemds/scuro/utils/torch_dataset.py @@ -20,20 +20,26 @@ # ------------------------------------------------------------- from typing import Dict -import numpy as np import torch import torchvision.transforms as transforms class CustomDataset(torch.utils.data.Dataset): - def __init__(self, data): + def __init__(self, data, data_type, device, size=None): self.data = data + self.data_type = data_type + self.device = device + self.size = size + if size is None: + self.size = (256, 224) + self.tf = transforms.Compose( [ transforms.ToPILImage(), - transforms.Resize(256), - transforms.CenterCrop(224), + transforms.Resize(self.size[0]), + transforms.CenterCrop(self.size[1]), transforms.ToTensor(), + transforms.ConvertImageDtype(dtype=self.data_type), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ), @@ -42,20 +48,18 @@ class CustomDataset(torch.utils.data.Dataset): def __getitem__(self, index) -> Dict[str, object]: data = self.data[index] - if type(data) is np.ndarray: - output = torch.empty((1, 3, 224, 224)) - d = torch.tensor(data) - d = d.repeat(3, 1, 1) - output[0] = self.tf(d) - else: - output = torch.empty((len(data), 3, 224, 224)) - - for i, d in enumerate(data): - if data[0].ndim < 3: - d = torch.tensor(d) - d = d.repeat(3, 1, 1) - - output[i] = self.tf(d) + output = torch.empty( + (len(data), 3, self.size[1], self.size[1]), + dtype=self.data_type, + device=self.device, + ) + + for i, d in enumerate(data): + if data[0].ndim < 3: + d = torch.tensor(d) + d = d.repeat(3, 1, 1) + + output[i] = self.tf(d) return {"id": index, "data": output} diff --git a/src/main/python/systemds/utils/helpers.py b/src/main/python/systemds/utils/helpers.py index 05c9bf0647..887b3140eb 100644 --- a/src/main/python/systemds/utils/helpers.py +++ b/src/main/python/systemds/utils/helpers.py @@ -23,7 +23,7 @@ import os from importlib.util import find_spec from itertools import chain from typing import Dict, Iterable - +import torch from systemds.utils.consts import MODULE_NAME diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index e31887ff83..fbb50ac180 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -26,6 +26,7 @@ from scipy.io.wavfile import write import random import os +from systemds.scuro.dataloader.base_loader import BaseLoader from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.text_loader import TextLoader @@ -34,10 +35,31 @@ from systemds.scuro.modality.transformed import TransformedModality from systemds.scuro.modality.type import ModalityType +class TestDataLoader(BaseLoader): + def __init__(self, indices, chunk_size, modality_type, data, data_type, metadata): + super().__init__("", indices, data_type, chunk_size, modality_type) + + self.metadata = metadata + self.test_data = data + + def reset(self): + self._next_chunk = 0 + self.data = [] + + def extract(self, file, indices): + if isinstance(self.test_data, list): + self.data = [self.test_data[i] for i in indices] + else: + self.data = self.test_data[indices] + + class ModalityRandomDataGenerator: def __init__(self): - self._modality_id = 0 + self.modality_id = 0 + self.modality_type = None + self.metadata = {} + self.data_type = np.float32 def create1DModality( self, @@ -45,32 +67,125 @@ class ModalityRandomDataGenerator: num_features, modality_type, ): - data = np.random.rand(num_instances, num_features) + data = np.random.rand(num_instances, num_features).astype(self.data_type) + data.dtype = self.data_type + # TODO: write a dummy method to create the same metadata for all instances to avoid the for loop - metadata = {} + self.modality_type = modality_type for i in range(num_instances): if modality_type == ModalityType.AUDIO: - metadata[i] = modality_type.create_audio_metadata( + self.metadata[i] = modality_type.create_audio_metadata( num_features / 10, data[i] ) elif modality_type == ModalityType.TEXT: - metadata[i] = modality_type.create_text_metadata( + self.metadata[i] = modality_type.create_text_metadata( num_features / 10, data[i] ) elif modality_type == ModalityType.VIDEO: - metadata[i] = modality_type.create_video_metadata( + self.metadata[i] = modality_type.create_video_metadata( num_features / 30, 10, 0, 0, 1 ) else: raise NotImplementedError - tf_modality = TransformedModality( - modality_type, "test_transformation", self._modality_id, metadata - ) + tf_modality = TransformedModality(self, "test_transformation") tf_modality.data = data - self._modality_id += 1 + self.modality_id += 1 return tf_modality + def create_audio_data(self, num_instances, num_features): + data = np.random.rand(num_instances, num_features).astype(np.float32) + metadata = { + i: ModalityType.AUDIO.create_audio_metadata(16000, data[i]) + for i in range(num_instances) + } + + return data, metadata + + def create_text_data(self, num_instances): + subjects = [ + "The cat", + "A dog", + "The student", + "The teacher", + "The bird", + "The child", + "The programmer", + "The scientist", + "A researcher", + ] + verbs = [ + "reads", + "writes", + "studies", + "analyzes", + "creates", + "develops", + "designs", + "implements", + "examines", + ] + objects = [ + "the document", + "the code", + "the data", + "the problem", + "the solution", + "the project", + "the research", + "the paper", + ] + adverbs = [ + "carefully", + "quickly", + "efficiently", + "thoroughly", + "diligently", + "precisely", + "methodically", + ] + + sentences = [] + for _ in range(num_instances): + include_adverb = np.random.random() < 0.7 + + subject = np.random.choice(subjects) + verb = np.random.choice(verbs) + obj = np.random.choice(objects) + adverb = np.random.choice(adverbs) if include_adverb else "" + + sentence = f"{subject} {adverb} {verb} {obj}" + + sentences.append(sentence) + + metadata = { + i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i]) + for i in range(num_instances) + } + + return sentences, metadata + + def create_visual_modality(self, num_instances, num_frames=1, height=28, width=28): + if num_frames == 1: + print(f"TODO: create image metadata") + else: + metadata = { + i: ModalityType.VIDEO.create_video_metadata( + 30, num_frames, width, height, 1 + ) + for i in range(num_instances) + } + + return ( + np.random.randint( + 0, + 256, + (num_instances, num_frames, height, width), + # ).astype(np.float16).tolist(), + ).astype(np.float16), + metadata, + ) + def setup_data(modalities, num_instances, path): if os.path.isdir(path): @@ -202,7 +317,7 @@ class TestDataGenerator: def __create_audio_data(self, idx, duration, speed_factor): path = f"{self.path}/AUDIO/{idx}.wav" - sample_rate = 44100 + sample_rate = 16000 t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False) frequency_variation = random.uniform(200.0, 500.0) diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index 521ff3f468..50f57eebb2 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -38,10 +38,10 @@ from systemds.scuro.representations.concatenation import Concatenation from systemds.scuro.representations.lstm import LSTM from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ModalityRandomDataGenerator import warnings @@ -91,36 +91,27 @@ class TestDataLoaders(unittest.TestCase): @classmethod def setUpClass(cls): - cls.test_file_path = "test_data_dr_search" cls.num_instances = 20 - modalities = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] - - cls.data_generator = setup_data( - modalities, cls.num_instances, cls.test_file_path - ) - os.makedirs(f"{cls.test_file_path}/embeddings") + cls.data_generator = ModalityRandomDataGenerator() + cls.labels = np.random.choice([0, 1], size=cls.num_instances) # TODO: adapt the representation so they return non aggregated values. Apply windowing operation instead - cls.bert = cls.data_generator.modalities_by_type[ - ModalityType.TEXT - ].apply_representation(Bert()) - cls.mel_spe = ( - cls.data_generator.modalities_by_type[ModalityType.AUDIO] - .apply_representation(MelSpectrogram()) - .flatten() + cls.video = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.VIDEO + ) + cls.text = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.TEXT ) - cls.resnet = ( - cls.data_generator.modalities_by_type[ModalityType.VIDEO] - .apply_representation(ResNet()) - .window(10, "mean") - .flatten() + cls.audio = cls.data_generator.create1DModality( + cls.num_instances, 100, ModalityType.AUDIO ) - cls.mods = [cls.bert, cls.mel_spe, cls.resnet] + + cls.mods = [cls.video, cls.audio, cls.text] split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, + np.array(range(cls.num_instances)), + cls.labels, test_size=0.2, random_state=42, ) @@ -134,22 +125,17 @@ class TestDataLoaders(unittest.TestCase): cls.representations = [ Concatenation(), Average(), - RowMax(100), - Multiplication(), + RowMax(), + Hadamard(), Sum(), LSTM(width=256, depth=3), ] - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) - def test_enumerate_all(self): task = Task( "TestTask", TestSVM(), - self.data_generator.labels, + self.labels, self.train_indizes, self.val_indizes, ) @@ -164,7 +150,7 @@ class TestDataLoaders(unittest.TestCase): task = Task( "TestTask", TestSVM(), - self.data_generator.labels, + self.labels, self.train_indizes, self.val_indizes, ) diff --git a/src/main/python/tests/scuro/test_fusion_orders.py b/src/main/python/tests/scuro/test_fusion_orders.py new file mode 100644 index 0000000000..eb01d18ffe --- /dev/null +++ b/src/main/python/tests/scuro/test_fusion_orders.py @@ -0,0 +1,95 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest +import numpy as np + +from systemds.scuro import Concatenation, RowMax, Hadamard +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.representations.bert import Bert +from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.average import Average +from tests.scuro.data_generator import ModalityRandomDataGenerator +from systemds.scuro.modality.type import ModalityType + + +class TestFusionOrders(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.num_instances = 40 + cls.data_generator = ModalityRandomDataGenerator() + cls.r_1 = cls.data_generator.create1DModality(40, 100, ModalityType.AUDIO) + cls.r_2 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) + cls.r_3 = cls.data_generator.create1DModality(40, 100, ModalityType.TEXT) + + def test_fusion_order_avg(self): + r_1_r_2 = self.r_1.combine(self.r_2, Average()) + r_2_r_1 = self.r_2.combine(self.r_1, Average()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Average()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Average()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Average()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + def test_fusion_order_concat(self): + r_1_r_2 = self.r_1.combine(self.r_2, Concatenation()) + r_2_r_1 = self.r_2.combine(self.r_1, Concatenation()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Concatenation()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Concatenation()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Concatenation()) + + self.assertFalse(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertFalse(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + def test_fusion_order_max(self): + r_1_r_2 = self.r_1.combine(self.r_2, RowMax()) + r_2_r_1 = self.r_2.combine(self.r_1, RowMax()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, RowMax()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, RowMax()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], RowMax()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) + + def test_fusion_order_hadamard(self): + r_1_r_2 = self.r_1.combine(self.r_2, Hadamard()) + r_2_r_1 = self.r_2.combine(self.r_1, Hadamard()) + r_1_r_2_r_3 = r_1_r_2.combine(self.r_3, Hadamard()) + r_2_r_1_r_3 = r_2_r_1.combine(self.r_3, Hadamard()) + + r1_r2_r3 = self.r_1.combine([self.r_2, self.r_3], Hadamard()) + + self.assertTrue(np.array_equal(r_1_r_2.data, r_2_r_1.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r_2_r_1_r_3.data)) + self.assertTrue(np.array_equal(r_1_r_2_r_3.data, r1_r2_r3.data)) + self.assertFalse(np.array_equal(r_1_r_2.data, r1_r2_r3.data)) diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py b/src/main/python/tests/scuro/test_multimodal_fusion.py index 8456279c3d..77f03054eb 100644 --- a/src/main/python/tests/scuro/test_multimodal_fusion.py +++ b/src/main/python/tests/scuro/test_multimodal_fusion.py @@ -42,7 +42,11 @@ from systemds.scuro.representations.spectrogram import Spectrogram from systemds.scuro.representations.word2vec import W2V from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ( + setup_data, + TestDataLoader, + ModalityRandomDataGenerator, +) from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -109,15 +113,14 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase): @classmethod def setUpClass(cls): - cls.test_file_path = "fusion_optimizer_test_data" - cls.num_instances = 10 cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.labels = np.random.choice([0, 1], size=cls.num_instances) + cls.indices = np.array(range(cls.num_instances)) - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, + cls.indices, + cls.labels, test_size=0.2, random_state=42, ) @@ -129,48 +132,52 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase): Task( "UnimodalRepresentationTask1", TestSVM(), - cls.data_generator.labels, + cls.labels, cls.train_indizes, cls.val_indizes, ), Task( "UnimodalRepresentationTask2", TestCNN(), - cls.data_generator.labels, + cls.labels, cls.train_indizes, cls.val_indizes, ), ] - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.test_file_path) - def test_multimodal_fusion(self): task = Task( "UnimodalRepresentationTask1", TestSVM(), - self.data_generator.labels, + self.labels, self.train_indizes, self.val_indizes, ) - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - ) - audio = UnimodalModality(audio_data_loader) - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, + audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( + self.num_instances, 100 ) - text = UnimodalModality(text_data_loader) - - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, + text_data, text_md = ModalityRandomDataGenerator().create_text_data( + self.num_instances + ) + video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 60 + ) + audio = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md + ) + ) + video = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md + ) + ) + text = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.TEXT, text_data, str, text_md + ) ) - video = UnimodalModality(video_data_loader) with patch.object( Registry, @@ -200,3 +207,7 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase): debug=False, ) multimodal_optimizer.optimize() + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_multimodal_join.py b/src/main/python/tests/scuro/test_multimodal_join.py index a5e3a7caf9..9e3a16ffca 100644 --- a/src/main/python/tests/scuro/test_multimodal_join.py +++ b/src/main/python/tests/scuro/test_multimodal_join.py @@ -23,11 +23,13 @@ import shutil import unittest +import numpy as np +import copy from systemds.scuro.modality.joined import JoinCondition from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import TestDataLoader, ModalityRandomDataGenerator from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -46,16 +48,15 @@ class TestMultimodalJoin(unittest.TestCase): @classmethod def setUpClass(cls): - cls.test_file_path = "join_test_data" cls.num_instances = 4 - cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO] - - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) + cls.indices = np.array(range(cls.num_instances)) + cls.audio_data, cls.audio_md = ModalityRandomDataGenerator().create_audio_data( + cls.num_instances, 32000 + ) - @classmethod - def tearDownClass(cls): - print("Cleaning up test data") - shutil.rmtree(cls.test_file_path) + cls.video_data, cls.video_md = ( + ModalityRandomDataGenerator().create_visual_modality(cls.num_instances, 60) + ) def test_video_audio_join(self): self._execute_va_join() @@ -91,19 +92,26 @@ class TestMultimodalJoin(unittest.TestCase): self._join(audio, video, 2) def _prepare_data(self, l_chunk_size=None, r_chunk_size=None): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, - chunk_size=l_chunk_size, + audio = UnimodalModality( + TestDataLoader( + self.indices, + r_chunk_size, + ModalityType.AUDIO, + copy.deepcopy(self.audio_data), + np.float32, + copy.deepcopy(self.audio_md), + ) ) - video = UnimodalModality(video_data_loader) - - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, - r_chunk_size, + video = UnimodalModality( + TestDataLoader( + self.indices, + l_chunk_size, + ModalityType.VIDEO, + copy.deepcopy(self.video_data), + np.float32, + copy.deepcopy(self.video_md), + ) ) - audio = UnimodalModality(audio_data_loader) mel_audio = audio.apply_representation(MelSpectrogram()) @@ -114,8 +122,8 @@ class TestMultimodalJoin(unittest.TestCase): left_modality.join( right_modality, JoinCondition("timestamp", "timestamp", "<") ) - .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet50")) - .window(window_size, "mean") + .apply_representation(ResNet(layer="layer1.0.conv2", model_name="ResNet18")) + .window_aggregation(window_size, "mean") .combine("concat") ) diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py index aaecde2991..7f2a752722 100644 --- a/src/main/python/tests/scuro/test_operator_registry.py +++ b/src/main/python/tests/scuro/test_operator_registry.py @@ -23,7 +23,7 @@ import unittest from systemds.scuro.representations.mfcc import MFCC from systemds.scuro.representations.wav2vec import Wav2Vec -from systemds.scuro.representations.window import WindowAggregation +from systemds.scuro.representations.window_aggregation import WindowAggregation from systemds.scuro.representations.bow import BoW from systemds.scuro.representations.word2vec import W2V from systemds.scuro.representations.tfidf import TfIdf @@ -36,7 +36,7 @@ from systemds.scuro.representations.lstm import LSTM from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.spectrogram import Spectrogram -from systemds.scuro.representations.multiplication import Multiplication +from systemds.scuro.representations.hadamard import Hadamard from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.sum import Sum diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py index bfc52f0103..9ed034e5fe 100644 --- a/src/main/python/tests/scuro/test_unimodal_optimizer.py +++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py @@ -39,7 +39,7 @@ from systemds.scuro.representations.spectrogram import Spectrogram from systemds.scuro.representations.word2vec import W2V from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.resnet import ResNet -from tests.scuro.data_generator import setup_data +from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader from systemds.scuro.dataloader.audio_loader import AudioLoader from systemds.scuro.dataloader.video_loader import VideoLoader @@ -101,21 +101,19 @@ from unittest.mock import patch class TestUnimodalRepresentationOptimizer(unittest.TestCase): - test_file_path = None data_generator = None num_instances = 0 @classmethod def setUpClass(cls): - cls.test_file_path = "unimodal_optimizer_test_data" - cls.num_instances = 10 cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT] + cls.labels = np.random.choice([0, 1], size=cls.num_instances) + cls.indices = np.array(range(cls.num_instances)) - cls.data_generator = setup_data(cls.mods, cls.num_instances, cls.test_file_path) split = train_test_split( - cls.data_generator.indices, - cls.data_generator.labels, + cls.indices, + cls.labels, test_size=0.2, random_state=42, ) @@ -127,46 +125,51 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase): Task( "UnimodalRepresentationTask1", TestSVM(), - cls.data_generator.labels, + cls.labels, cls.train_indizes, cls.val_indizes, ), Task( "UnimodalRepresentationTask2", TestCNN(), - cls.data_generator.labels, + cls.labels, cls.train_indizes, cls.val_indizes, ), ] - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.test_file_path) - def test_unimodal_optimizer_for_audio_modality(self): - audio_data_loader = AudioLoader( - self.data_generator.get_modality_path(ModalityType.AUDIO), - self.data_generator.indices, + audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data( + self.num_instances, 100 + ) + audio = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.AUDIO, audio_data, np.float32, audio_md + ) ) - audio = UnimodalModality(audio_data_loader) self.optimize_unimodal_representation_for_modality(audio) def test_unimodal_optimizer_for_text_modality(self): - text_data_loader = TextLoader( - self.data_generator.get_modality_path(ModalityType.TEXT), - self.data_generator.indices, + text_data, text_md = ModalityRandomDataGenerator().create_text_data( + self.num_instances + ) + text = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.TEXT, text_data, str, text_md + ) ) - text = UnimodalModality(text_data_loader) self.optimize_unimodal_representation_for_modality(text) def test_unimodal_optimizer_for_video_modality(self): - video_data_loader = VideoLoader( - self.data_generator.get_modality_path(ModalityType.VIDEO), - self.data_generator.indices, + video_data, video_md = ModalityRandomDataGenerator().create_visual_modality( + self.num_instances, 60 + ) + video = UnimodalModality( + TestDataLoader( + self.indices, None, ModalityType.VIDEO, video_data, np.float32, video_md + ) ) - video = UnimodalModality(video_data_loader) self.optimize_unimodal_representation_for_modality(video) def optimize_unimodal_representation_for_modality(self, modality): @@ -201,3 +204,7 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase): ) >= 1 ) + + +if __name__ == "__main__": + unittest.main() diff --git a/src/main/python/tests/scuro/test_unimodal_representations.py b/src/main/python/tests/scuro/test_unimodal_representations.py index ac167e8fbf..2f2e64efd7 100644 --- a/src/main/python/tests/scuro/test_unimodal_representations.py +++ b/src/main/python/tests/scuro/test_unimodal_representations.py @@ -29,6 +29,7 @@ from systemds.scuro.representations.tfidf import TfIdf from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.bert import Bert from systemds.scuro.representations.mel_spectrogram import MelSpectrogram +from systemds.scuro.representations.mfcc import MFCC from systemds.scuro.representations.resnet import ResNet from tests.scuro.data_generator import setup_data @@ -63,7 +64,7 @@ class TestUnimodalRepresentations(unittest.TestCase): shutil.rmtree(cls.test_file_path) def test_audio_representations(self): - audio_representations = [MelSpectrogram()] # TODO: add FFT, TFN, 1DCNN + audio_representations = [MFCC()] # TODO: add FFT, TFN, 1DCNN audio_data_loader = AudioLoader( self.data_generator.get_modality_path(ModalityType.AUDIO), self.data_generator.indices, diff --git a/src/main/python/tests/scuro/test_window_operations.py b/src/main/python/tests/scuro/test_window_operations.py index d7210ddb6d..ea1b0f46f2 100644 --- a/src/main/python/tests/scuro/test_window_operations.py +++ b/src/main/python/tests/scuro/test_window_operations.py @@ -51,7 +51,7 @@ class TestWindowOperations(unittest.TestCase): def run_window_operations_for_modality(self, modality_type, window_size): r = self.data_generator.create1DModality(40, 100, modality_type) for aggregation in self.aggregations: - windowed_modality = r.window(window_size, aggregation) + windowed_modality = r.window_aggregation(window_size, aggregation) self.verify_window_operation(aggregation, r, windowed_modality, window_size)