This is an automated email from the ASF dual-hosted git repository.
cdionysio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6658c8fdb4 [SYSTEMDS-3936] Refactor metadata generation and add additional dataloader
6658c8fdb4 is described below
commit 6658c8fdb441ca84b15d4953099c0a9b0ef84f07
Author: Christina Dionysio <[email protected]>
AuthorDate: Thu Nov 27 15:26:25 2025 +0100
[SYSTEMDS-3936] Refactor metadata generation and add additional dataloader
This patch introduces a new dataloader for the image modality.
Additionally, it refines how the metadata for each modality is created.
---
src/main/python/systemds/scuro/__init__.py | 2 ++
.../systemds/scuro/dataloader/audio_loader.py | 4 +--
.../systemds/scuro/dataloader/base_loader.py | 11 ++++---
.../{audio_loader.py => image_loader.py} | 34 ++++++++++++----------
.../systemds/scuro/dataloader/json_loader.py | 22 +++++++++-----
.../systemds/scuro/dataloader/text_loader.py | 2 +-
.../systemds/scuro/dataloader/timeseries_loader.py | 4 +--
.../systemds/scuro/dataloader/video_loader.py | 4 +--
.../systemds/scuro/modality/joined_transformed.py | 3 +-
.../python/systemds/scuro/modality/transformed.py | 12 ++++++--
src/main/python/systemds/scuro/modality/type.py | 29 ++++++++++++++++++
.../systemds/scuro/modality/unimodal_modality.py | 28 +++++++++++-------
src/main/python/tests/scuro/data_generator.py | 21 ++++++-------
13 files changed, 118 insertions(+), 58 deletions(-)
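At the call sites, the metadata refactor boils down to replacing the type-specific factory calls with the generic entry point on ModalityType; a minimal before/after sketch (argument values are placeholders):

    # before: each loader names the factory for its own modality
    self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)

    # after: every loader uses the same call and ModalityType dispatches internally
    self.metadata[file] = self.modality_type.create_metadata(sr, audio)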
diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index e74ae53f36..8b5a8621d1 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -20,6 +20,7 @@
# -------------------------------------------------------------
from systemds.scuro.dataloader.base_loader import BaseLoader
from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.image_loader import ImageLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.dataloader.text_loader import TextLoader
from systemds.scuro.dataloader.json_loader import JSONLoader
@@ -103,6 +104,7 @@ from systemds.scuro.representations.clip import CLIPText, CLIPVisual
__all__ = [
"BaseLoader",
+ "ImageLoader",
"AudioLoader",
"VideoLoader",
"TextLoader",
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
index d8080c607d..8b224e6f25 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -47,7 +47,7 @@ class AudioLoader(BaseLoader):
if not self.load_data_from_file:
import numpy as np
- self.metadata[file] = self.modality_type.create_audio_metadata(
+ self.metadata[file] = self.modality_type.create_metadata(
1000, np.array([0])
)
else:
@@ -56,6 +56,6 @@ class AudioLoader(BaseLoader):
if self.normalize:
audio = librosa.util.normalize(audio)
- self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+ self.metadata[file] = self.modality_type.create_metadata(sr, audio)
self.data.append(audio)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
index 33b418efb3..777c72d5d5 100644
--- a/src/main/python/systemds/scuro/dataloader/base_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -34,6 +34,7 @@ class BaseLoader(ABC):
data_type: Union[np.dtype, str],
chunk_size: Optional[int] = None,
modality_type=None,
+ ext=None,
):
"""
Base class to load raw data for a given list of indices and stores them in the data object
@@ -53,6 +54,7 @@ class BaseLoader(ABC):
self._num_chunks = 1
self._chunk_size = None
self._data_type = data_type
+ self._ext = ext
if chunk_size:
self.chunk_size = chunk_size
@@ -136,9 +138,10 @@ class BaseLoader(ABC):
is_dir = True if os.path.isdir(self.source_path) else False
file_names = []
if is_dir:
- _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+ if self._ext is None:
+ _, self._ext = os.path.splitext(os.listdir(self.source_path)[0])
for index in self.indices if indices is None else indices:
- file_names.append(self.source_path + index + ext)
+ file_names.append(self.source_path + index + self._ext)
return file_names
else:
return self.source_path
@@ -155,10 +158,10 @@ class BaseLoader(ABC):
try:
file_size = os.path.getsize(file)
except:
- raise (f"Error: File {0} not found!".format(file))
+ raise ValueError(f"Error: File {0} not found!".format(file))
if file_size == 0:
- raise ("File {0} is empty".format(file))
+ raise ValueError("File {0} is empty".format(file))
@staticmethod
def resolve_data_type(data_type):
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/image_loader.py
similarity index 70%
copy from src/main/python/systemds/scuro/dataloader/audio_loader.py
copy to src/main/python/systemds/scuro/dataloader/image_loader.py
index d8080c607d..0667e703b1 100644
--- a/src/main/python/systemds/scuro/dataloader/audio_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/image_loader.py
@@ -19,43 +19,45 @@
#
# -------------------------------------------------------------
from typing import List, Optional, Union
-import librosa
+
import numpy as np
from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
from systemds.scuro.modality.type import ModalityType
-class AudioLoader(BaseLoader):
+class ImageLoader(BaseLoader):
def __init__(
self,
source_path: str,
indices: List[str],
- data_type: Union[np.dtype, str] = np.float32,
+ data_type: Union[np.dtype, str] = np.float16,
chunk_size: Optional[int] = None,
- normalize: bool = True,
load=True,
+ ext=".jpg",
):
super().__init__(
- source_path, indices, data_type, chunk_size, ModalityType.AUDIO
+ source_path, indices, data_type, chunk_size, ModalityType.IMAGE, ext
)
- self.normalize = normalize
self.load_data_from_file = load
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)
- if not self.load_data_from_file:
- import numpy as np
- self.metadata[file] = self.modality_type.create_audio_metadata(
- 1000, np.array([0])
- )
+ image = cv2.imread(file, cv2.IMREAD_COLOR)
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+ if image.ndim == 2:
+ height, width = image.shape
+ channels = 1
else:
- audio, sr = librosa.load(file, dtype=self._data_type)
+ height, width, channels = image.shape
- if self.normalize:
- audio = librosa.util.normalize(audio)
+ image = image.astype(np.float32) / 255.0
- self.metadata[file] = self.modality_type.create_audio_metadata(sr, audio)
+ self.metadata[file] = self.modality_type.create_metadata(
+ width, height, channels
+ )
- self.data.append(audio)
+ self.data.append(image)
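A minimal usage sketch for the new loader; the directory layout, paths, and indices below are placeholders, not part of this commit:

    from systemds.scuro.dataloader.image_loader import ImageLoader

    # Images are expected at <source_path><index><ext>, e.g. data/images/0.jpg
    loader = ImageLoader(source_path="data/images/", indices=["0", "1"], ext=".jpg")

    # extract() reads one file, converts BGR to RGB, scales it to float32 in [0, 1],
    # appends it to loader.data, and records width/height/channels under the file key.
    loader.extract("data/images/0.jpg")
    print(loader.metadata["data/images/0.jpg"])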
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
index a355edded8..89ba6b43d5 100644
--- a/src/main/python/systemds/scuro/dataloader/json_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -32,20 +32,28 @@ class JSONLoader(BaseLoader):
self,
source_path: str,
indices: List[str],
- field: str,
+ field: str, # TODO: make this a list so it is easier to get multiple fields from a json file. (i.e. Mustard: context + sentence)
data_type: Union[np.dtype, str] = str,
chunk_size: Optional[int] = None,
+ ext: str = ".json",
):
- super().__init__(source_path, indices, data_type, chunk_size, ModalityType.TEXT)
+ super().__init__(
+ source_path, indices, data_type, chunk_size, ModalityType.TEXT, ext
+ )
self.field = field
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)
with open(file) as f:
json_file = json.load(f)
+
+ if isinstance(index, str):
+ index = [index]
for idx in index:
- sentence = json_file[idx][self.field]
- self.data.append(sentence)
- self.metadata[idx] = self.modality_type.create_text_metadata(
- len(sentence), sentence
- )
+ try:
+ text = json_file[idx][self.field]
+ except:
+ text = json_file[self.field]
+
+ self.data.append(text)
+ self.metadata[idx] = self.modality_type.create_metadata(len(text), text)
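A sketch of the two layouts the revised extract() now handles; the file contents and paths are illustrative:

    from systemds.scuro.dataloader.json_loader import JSONLoader

    # Per-index layout: {"0": {"sentence": "..."}, "1": {"sentence": "..."}}
    # Flat layout:      {"sentence": "..."}
    loader = JSONLoader("data/json/", indices=["0", "1"], field="sentence")

    # index may now be a single string (wrapped into a list internally); if the
    # per-index lookup fails, the loader falls back to json_file[field].
    loader.extract("data/json/0.json", index="0")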
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
index 6689fb6d92..ff75a4f318 100644
--- a/src/main/python/systemds/scuro/dataloader/text_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -43,7 +43,7 @@ class TextLoader(BaseLoader):
if self.prefix:
line = re.sub(self.prefix, "", line)
line = line.replace("\n", "")
- self.metadata[file] = self.modality_type.create_text_metadata(
+ self.metadata[file] = self.modality_type.create_metadata(
len(line.split()), line
)
self.data.append(line)
diff --git a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
index 6887d6974f..1e6bfa8fc2 100644
--- a/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/timeseries_loader.py
@@ -70,12 +70,12 @@ class TimeseriesLoader(BaseLoader):
data = self._normalize_signals(data)
if file:
- self.metadata[index] = self.modality_type.create_ts_metadata(
+ self.metadata[index] = self.modality_type.create_metadata(
self.signal_names, data, self.sampling_rate
)
else:
for i, index in enumerate(self.indices):
- self.metadata[str(index)] = self.modality_type.create_ts_metadata(
+ self.metadata[str(index)] = self.modality_type.create_metadata(
self.signal_names, data[i], self.sampling_rate
)
self.data.append(data)
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
index 0e77d5dc57..2c154ecbaf 100644
--- a/src/main/python/systemds/scuro/dataloader/video_loader.py
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -46,7 +46,7 @@ class VideoLoader(BaseLoader):
def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
self.file_sanity_check(file)
# if not self.load_data_from_file:
- # self.metadata[file] = self.modality_type.create_video_metadata(
+ # self.metadata[file] = self.modality_type.create_metadata(
# 30, 10, 100, 100, 3
# )
# else:
@@ -67,7 +67,7 @@ class VideoLoader(BaseLoader):
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
num_channels = 3
- self.metadata[file] = self.modality_type.create_video_metadata(
+ self.metadata[file] = self.modality_type.create_metadata(
self.fps, length, width, height, num_channels
)
diff --git a/src/main/python/systemds/scuro/modality/joined_transformed.py b/src/main/python/systemds/scuro/modality/joined_transformed.py
index 078ce86f7a..442d72fe76 100644
--- a/src/main/python/systemds/scuro/modality/joined_transformed.py
+++ b/src/main/python/systemds/scuro/modality/joined_transformed.py
@@ -36,7 +36,8 @@ class JoinedTransformedModality(Modality):
:param transformation: Representation to be applied on the modality
"""
super().__init__(
- reduce(or_, [left_modality.modality_type], right_modality.modality_type),
+ left_modality.modality_type,
+ # reduce(or_, [left_modality.modality_type], right_modality.modality_type),
data_type=left_modality.data_type,
)
self.transformation = transformation
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
index 7e8e54eff3..f7739f03df 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -87,7 +87,8 @@ class TransformedModality(Modality):
right.extract_raw_data()
joined_modality = JoinedModality(
- reduce(or_, [right.modality_type], self.modality_type),
+ self.modality_type,
+ # reduce(or_, [right.modality_type], self.modality_type), #TODO
self,
right,
join_condition,
@@ -136,8 +137,10 @@ class TransformedModality(Modality):
fused_modality = TransformedModality(
self, fusion_method, ModalityType.EMBEDDING
)
+ start_time = time.time()
fused_modality.data = fusion_method.transform(self.create_modality_list(other))
-
+ end_time = time.time()
+ fused_modality.transform_time = end_time - start_time
return fused_modality
def combine_with_training(
@@ -147,7 +150,12 @@ class TransformedModality(Modality):
self, fusion_method, ModalityType.EMBEDDING
)
modalities = self.create_modality_list(other)
+ start_time = time.time()
fused_modality.data = fusion_method.transform_with_training(modalities, task)
+ end_time = time.time()
+ fused_modality.transform_time = (
+ end_time - start_time
+ ) # Note: this includes the training time
return fused_modality
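The timing added here is exposed on the returned modality, so callers can read it off the fusion result (attribute access only; the surrounding call site is not shown in this hunk):

    # after a fusion step, elapsed wall-clock time in seconds; for the
    # transform_with_training path this includes the training time
    elapsed = fused_modality.transform_time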
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
index 382e2631ad..c6f713df24 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -195,6 +195,14 @@ class ModalityType(Flag):
EMBEDDING = auto()
PHYSIOLOGICAL = auto()
+ _metadata_factory_methods = {
+ "TEXT": "create_text_metadata",
+ "AUDIO": "create_audio_metadata",
+ "VIDEO": "create_video_metadata",
+ "IMAGE": "create_image_metadata",
+ "TIMESERIES": "create_ts_metadata",
+ }
+
def get_schema(self):
return ModalitySchemas.get(self.name)
@@ -215,6 +223,27 @@ class ModalityType(Flag):
return md
+ def create_metadata(self, *args, **kwargs):
+ if self.name is None or "|" in self.name:
+ raise ValueError(
+ f"Composite modality types are not supported for metadata
creation: {self}"
+ )
+
+ factory_methods = type(self)._metadata_factory_methods
+ method_name = factory_methods.value.get(self.name)
+ if method_name is None:
+ raise NotImplementedError(
+ f"Metadata creation not implemented for modality type:
{self.name}"
+ )
+
+ method = getattr(type(self), method_name, None)
+ if method is None:
+ raise NotImplementedError(
+ f"Metadata creation method '{method_name}' not found for
{self.name}"
+ )
+
+ return method(self, *args, **kwargs)
+
def create_audio_metadata(self, sampling_rate, data, is_single_instance=True):
md = deepcopy(self.get_schema())
md = ModalitySchemas.update_base_metadata(md, data, is_single_instance)
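The dispatch makes the generic call interchangeable with the specific factories and fails loudly for unsupported types; a small sketch (the waveform is a placeholder):

    import numpy as np
    from systemds.scuro.modality.type import ModalityType

    samples = np.zeros(16000, dtype=np.float32)  # placeholder waveform

    # Both calls run create_audio_metadata; create_metadata only resolves the
    # method name via _metadata_factory_methods and forwards the arguments.
    md_direct = ModalityType.AUDIO.create_audio_metadata(16000, samples)
    md_generic = ModalityType.AUDIO.create_metadata(16000, samples)

    # Composite flags raise ValueError, unmapped types raise NotImplementedError:
    #   (ModalityType.AUDIO | ModalityType.TEXT).create_metadata(...)
    #   ModalityType.EMBEDDING.create_metadata(...)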
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
index 4ae1067c62..5898ea98c1 100644
--- a/src/main/python/systemds/scuro/modality/unimodal_modality.py
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -72,7 +72,8 @@ class UnimodalModality(Modality):
self.data_loader.update_chunk_sizes(other.data_loader)
joined_modality = JoinedModality(
- reduce(or_, [other.modality_type], self.modality_type),
+ self.modality_type,
+ # reduce(or_, [other.modality_type], self.modality_type), # TODO
self,
other,
join_condition,
@@ -162,16 +163,21 @@ class UnimodalModality(Modality):
np.concatenate((embeddings, padding), axis=1)
)
else:
- padded = np.pad(
- embeddings,
- pad_width=(
- (0, padding_needed)
- if len(embeddings.shape) == 1
- else ((0, padding_needed), (0, 0))
- ),
- mode="constant",
- constant_values=0,
- )
+ if len(embeddings.shape) == 1:
+ padded = np.zeros(
+ embeddings.shape[0] + padding_needed,
+ dtype=embeddings.dtype,
+ )
+ padded[: embeddings.shape[0]] = embeddings
+ else:
+ padded = np.zeros(
+ (
+ embeddings.shape[0] + padding_needed,
+ embeddings.shape[1],
+ ),
+ dtype=embeddings.dtype,
+ )
+ padded[: embeddings.shape[0], :] = embeddings
padded_embeddings.append(padded)
else:
padded_embeddings.append(embeddings)
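The preallocate-and-copy padding above is equivalent to the removed np.pad call; a quick self-contained check with arbitrary shapes:

    import numpy as np

    embeddings = np.arange(6, dtype=np.float32).reshape(2, 3)
    padding_needed = 2

    padded_old = np.pad(embeddings, ((0, padding_needed), (0, 0)),
                        mode="constant", constant_values=0)

    padded_new = np.zeros((embeddings.shape[0] + padding_needed, embeddings.shape[1]),
                          dtype=embeddings.dtype)
    padded_new[: embeddings.shape[0], :] = embeddings

    assert np.array_equal(padded_old, padded_new)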
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 7676906505..3c43cabb3e 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -74,19 +74,19 @@ class ModalityRandomDataGenerator:
self.modality_type = modality_type
for i in range(num_instances):
if modality_type == ModalityType.AUDIO:
- self.metadata[i] = modality_type.create_audio_metadata(
+ self.metadata[i] = modality_type.create_metadata(
num_features / 10, data[i]
)
elif modality_type == ModalityType.TEXT:
- self.metadata[i] = modality_type.create_text_metadata(
+ self.metadata[i] = modality_type.create_metadata(
num_features / 10, data[i]
)
elif modality_type == ModalityType.VIDEO:
- self.metadata[i] = modality_type.create_video_metadata(
+ self.metadata[i] = modality_type.create_metadata(
num_features / 30, 10, 0, 0, 1
)
elif modality_type == ModalityType.TIMESERIES:
- self.metadata[i] = modality_type.create_ts_metadata(["test"], data[i])
+ self.metadata[i] = modality_type.create_metadata(["test"], data[i])
else:
raise NotImplementedError
@@ -96,6 +96,7 @@ class ModalityRandomDataGenerator:
return tf_modality
def create_audio_data(self, num_instances, max_audio_length):
+ modality_type = ModalityType.AUDIO
data = [
[
random.random()
@@ -108,7 +109,7 @@ class ModalityRandomDataGenerator:
data[i] = np.array(data[i]).astype(self.data_type)
metadata = {
- i: ModalityType.AUDIO.create_audio_metadata(16000, np.array(data[i]))
+ i: modality_type.create_metadata(16000, np.array(data[i]))
for i in range(num_instances)
}
@@ -122,7 +123,7 @@ class ModalityRandomDataGenerator:
if num_features == 1:
data = [d.squeeze(-1) for d in data]
metadata = {
- i: ModalityType.TIMESERIES.create_ts_metadata(
+ i: ModalityType.TIMESERIES.create_metadata(
[f"feature_{j}" for j in range(num_features)], data[i]
)
for i in range(num_instances)
@@ -186,7 +187,7 @@ class ModalityRandomDataGenerator:
sentences.append(sentence)
metadata = {
- i: ModalityType.TEXT.create_text_metadata(len(sentences[i]), sentences[i])
+ i: ModalityType.TEXT.create_metadata(len(sentences[i]), sentences[i])
for i in range(num_instances)
}
@@ -200,13 +201,13 @@ class ModalityRandomDataGenerator:
np.random.randint(
0,
256,
- (np.random.randint(1, max_num_frames + 1), height, width, 3),
+ (np.random.randint(10, max_num_frames + 1), height, width, 3),
dtype=np.uint8,
)
for _ in range(num_instances)
]
metadata = {
- i: ModalityType.VIDEO.create_video_metadata(
+ i: ModalityType.VIDEO.create_metadata(
30, data[i].shape[0], width, height, 3
)
for i in range(num_instances)
@@ -222,7 +223,7 @@ class ModalityRandomDataGenerator:
for _ in range(num_instances)
]
metadata = {
- i: ModalityType.IMAGE.create_image_metadata(width, height, 3)
+ i: ModalityType.IMAGE.create_metadata(width, height, 3)
for i in range(num_instances)
}