This is an automated email from the ASF dual-hosted git repository.
cdionysio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 6c163b2ee8 [SYSTEMDS-3835] Add additional text and context operations
6c163b2ee8 is described below
commit 6c163b2ee8f5a3f2c83291ec649d93325a7131d2
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 9 10:34:24 2026 +0100
[SYSTEMDS-3835] Add additional text and context operations
This patch adds several context operations specific to the text modality,
as well as new text representations from the BERT family and ELMo.
---
.github/workflows/python.yml | 3 +-
src/main/python/systemds/scuro/__init__.py | 27 +-
.../systemds/scuro/drsearch/operator_registry.py | 24 +-
.../systemds/scuro/drsearch/unimodal_optimizer.py | 41 ++-
.../python/systemds/scuro/modality/transformed.py | 2 -
src/main/python/systemds/scuro/modality/type.py | 18 +-
.../systemds/scuro/representations/aggregate.py | 2 +-
.../python/systemds/scuro/representations/bert.py | 221 ++++++++++++---
.../python/systemds/scuro/representations/clip.py | 9 +-
.../python/systemds/scuro/representations/elmo.py | 154 +++++++++++
.../python/systemds/scuro/representations/glove.py | 15 +-
.../systemds/scuro/representations/text_context.py | 221 +++++++++++++++
.../representations/text_context_with_indices.py | 300 +++++++++++++++++++++
.../systemds/scuro/representations/unimodal.py | 2 +
.../scuro/representations/window_aggregation.py | 8 +-
src/main/python/tests/scuro/data_generator.py | 86 +++++-
src/main/python/tests/scuro/test_hp_tuner.py | 100 +------
.../python/tests/scuro/test_multimodal_fusion.py | 92 +------
.../python/tests/scuro/test_operator_registry.py | 17 +-
.../tests/scuro/test_text_context_operators.py | 113 ++++++++
.../python/tests/scuro/test_unimodal_optimizer.py | 101 +------
21 files changed, 1200 insertions(+), 356 deletions(-)
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 006e20d488..26a2e35ac4 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -173,7 +173,8 @@ jobs:
opt-einsum \
nltk \
fvcore \
- scikit-optimize
+ scikit-optimize \
+ flair
kill $KA
cd src/main/python
python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/__init__.py
b/src/main/python/systemds/scuro/__init__.py
index 8b5a8621d1..7849c03816 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -30,7 +30,13 @@ from
systemds.scuro.representations.aggregated_representation import (
AggregatedRepresentation,
)
from systemds.scuro.representations.average import Average
-from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.bert import (
+ Bert,
+ RoBERTa,
+ DistillBERT,
+ ALBERT,
+ ELECTRA,
+)
from systemds.scuro.representations.bow import BoW
from systemds.scuro.representations.concatenation import Concatenation
from systemds.scuro.representations.context import Context
@@ -101,6 +107,16 @@ from systemds.scuro.drsearch.multimodal_optimizer import
MultimodalOptimizer
from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
from systemds.scuro.representations.vgg import VGG19
from systemds.scuro.representations.clip import CLIPText, CLIPVisual
+from systemds.scuro.representations.text_context import (
+ SentenceBoundarySplit,
+ OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+ SentenceBoundarySplitIndices,
+ OverlappingSplitIndices,
+)
+from systemds.scuro.representations.elmo import ELMoRepresentation
+
__all__ = [
"BaseLoader",
@@ -113,6 +129,10 @@ __all__ = [
"AggregatedRepresentation",
"Average",
"Bert",
+ "RoBERTa",
+ "DistillBERT",
+ "ALBERT",
+ "ELECTRA",
"BoW",
"Concatenation",
"Context",
@@ -177,4 +197,9 @@ __all__ = [
"VGG19",
"CLIPVisual",
"CLIPText",
+ "SentenceBoundarySplit",
+ "OverlappingSplit",
+ "ELMoRepresentation",
+ "SentenceBoundarySplitIndices",
+ "OverlappingSplitIndices",
]
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py
b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 3b20245956..dc62e9b65b 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -33,8 +33,10 @@ class Registry:
_instance = None
_representations = {}
- _context_operators = []
+ _context_operators = {}
_fusion_operators = []
+ _text_context_operators = []
+ _video_context_operators = []
def __new__(cls):
if not cls._instance:
@@ -60,8 +62,13 @@ class Registry:
):
self._representations[modality].append(representation)
- def add_context_operator(self, context_operator):
- self._context_operators.append(context_operator)
+ def add_context_operator(self, context_operator, modality_type):
+ if not isinstance(modality_type, list):
+ modality_type = [modality_type]
+ for m_type in modality_type:
+ if not m_type in self._context_operators.keys():
+ self._context_operators[m_type] = []
+ self._context_operators[m_type].append(context_operator)
def add_fusion_operator(self, fusion_operator):
self._fusion_operators.append(fusion_operator)
@@ -76,9 +83,8 @@ class Registry:
reps.append(rep)
return reps
- def get_context_operators(self):
- # TODO: return modality specific context operations
- return self._context_operators
+ def get_context_operators(self, modality_type):
+ return self._context_operators[modality_type]
def get_fusion_operators(self):
return self._fusion_operators
@@ -121,13 +127,15 @@ def register_representation(modalities:
Union[ModalityType, List[ModalityType]])
return decorator
-def register_context_operator():
+def register_context_operator(modality_type):
"""
Decorator to register a context operator.
+
+ @param modality_type: The modality type for which the context operator is
to be registered
"""
def decorator(cls):
- Registry().add_context_operator(cls)
+ Registry().add_context_operator(cls, modality_type)
return cls
return decorator
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 1a348a91df..4cde294b17 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -87,8 +87,8 @@ class UnimodalOptimizer:
)
@lru_cache(maxsize=32)
- def _get_context_operators(self):
- return self.operator_registry.get_context_operators()
+ def _get_context_operators(self, modality_type):
+ return self.operator_registry.get_context_operators(modality_type)
def store_results(self, file_name=None):
if file_name is None:
@@ -302,6 +302,39 @@ class UnimodalOptimizer:
current_node_id = rep_node_id
dags.append(builder.build(current_node_id))
+ if operator.needs_context:
+ context_operators =
self._get_context_operators(modality.modality_type)
+ for context_op in context_operators:
+ if operator.initial_context_length is not None:
+ context_length = operator.initial_context_length
+
+ context_node_id = builder.create_operation_node(
+ context_op,
+ [leaf_id],
+ context_op(context_length).get_current_parameters(),
+ )
+ else:
+ context_node_id = builder.create_operation_node(
+ context_op,
+ [leaf_id],
+ context_op().get_current_parameters(),
+ )
+
+ context_rep_node_id = builder.create_operation_node(
+ operator.__class__,
+ [context_node_id],
+ operator.get_current_parameters(),
+ )
+
+ agg_operator = AggregatedRepresentation()
+ context_agg_node_id = builder.create_operation_node(
+ agg_operator.__class__,
+ [context_rep_node_id],
+ agg_operator.get_current_parameters(),
+ )
+
+ dags.append(builder.build(context_agg_node_id))
+
if not operator.self_contained:
not_self_contained_reps = self._get_not_self_contained_reps(
modality.modality_type
@@ -344,7 +377,7 @@ class UnimodalOptimizer:
def default_context_operators(self, modality, builder, leaf_id,
current_node_id):
dags = []
- context_operators = self._get_context_operators()
+ context_operators = self._get_context_operators(modality.modality_type)
for context_op in context_operators:
if (
modality.modality_type != ModalityType.TEXT
@@ -368,7 +401,7 @@ class UnimodalOptimizer:
def temporal_context_operators(self, modality, builder, leaf_id,
current_node_id):
aggregators =
self.operator_registry.get_representations(modality.modality_type)
- context_operators = self._get_context_operators()
+ context_operators = self._get_context_operators(modality.modality_type)
dags = []
for agg in aggregators:
diff --git a/src/main/python/systemds/scuro/modality/transformed.py
b/src/main/python/systemds/scuro/modality/transformed.py
index f7739f03df..3b01465302 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -18,8 +18,6 @@
# under the License.
#
# -------------------------------------------------------------
-from functools import reduce
-from operator import or_
from typing import Union, List
from systemds.scuro.modality.type import ModalityType
diff --git a/src/main/python/systemds/scuro/modality/type.py
b/src/main/python/systemds/scuro/modality/type.py
index c2fe38176f..23d97e869b 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -108,8 +108,12 @@ class ModalitySchemas:
shape = data.shape
elif data_layout is DataLayout.NESTED_LEVEL:
if data_is_single_instance:
- dtype = data.dtype
- shape = data.shape
+ if isinstance(data, list):
+ dtype = type(data[0])
+ shape = (len(data), len(data[0]))
+ else:
+ dtype = data.dtype
+ shape = data.shape
else:
shape = data[0].shape
dtype = data[0].dtype
@@ -306,13 +310,15 @@ class DataLayout(Enum):
return None
if data_is_single_instance:
- if (
+ if (isinstance(data, list) and not isinstance(data[0], str)) or (
+ isinstance(data, np.ndarray) and data.ndim == 1
+ ):
+ return DataLayout.SINGLE_LEVEL
+ elif (
isinstance(data, list)
or isinstance(data, np.ndarray)
- and data.ndim == 1
+ or isinstance(data, torch.Tensor)
):
- return DataLayout.SINGLE_LEVEL
- elif isinstance(data, np.ndarray) or isinstance(data,
torch.Tensor):
return DataLayout.NESTED_LEVEL
if isinstance(data[0], list):
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py
b/src/main/python/systemds/scuro/representations/aggregate.py
index 0a8438e684..9503a48587 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -71,7 +71,7 @@ class Aggregation:
max_len = 0
for i, instance in enumerate(modality.data):
data.append([])
- if isinstance(instance, np.ndarray):
+ if isinstance(instance, np.ndarray) or isinstance(instance, list):
if (
modality.modality_type == ModalityType.IMAGE
or modality.modality_type == ModalityType.VIDEO
diff --git a/src/main/python/systemds/scuro/representations/bert.py
b/src/main/python/systemds/scuro/representations/bert.py
index 4d486bff59..be579c0dd6 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -22,7 +22,7 @@ import numpy as np
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
import torch
-from transformers import BertTokenizerFast, BertModel
+from transformers import AutoTokenizer, AutoModel
from systemds.scuro.representations.utils import save_embeddings
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.operator_registry import register_representation
@@ -37,15 +37,18 @@ class TextDataset(Dataset):
def __init__(self, texts):
self.texts = []
- for text in texts:
- if text is None:
- self.texts.append("")
- elif isinstance(text, np.ndarray):
- self.texts.append(str(text.item()) if text.size == 1 else
str(text))
- elif not isinstance(text, str):
- self.texts.append(str(text))
- else:
- self.texts.append(text)
+ if isinstance(texts, list):
+ self.texts = texts
+ else:
+ for text in texts:
+ if text is None:
+ self.texts.append("")
+ elif isinstance(text, np.ndarray):
+ self.texts.append(str(text.item()) if text.size == 1 else
str(text))
+ elif not isinstance(text, str):
+ self.texts.append(str(text))
+ else:
+ self.texts.append(text)
def __len__(self):
return len(self.texts)
@@ -54,36 +57,61 @@ class TextDataset(Dataset):
return self.texts[idx]
-@register_representation(ModalityType.TEXT)
-class Bert(UnimodalRepresentation):
- def __init__(self, model_name="bert", output_file=None,
max_seq_length=512):
- parameters = {"model_name": "bert"}
+class BertFamily(UnimodalRepresentation):
+ def __init__(
+ self,
+ representation_name,
+ model_name,
+ layer,
+ parameters={},
+ output_file=None,
+ max_seq_length=512,
+ ):
self.model_name = model_name
- super().__init__("Bert", ModalityType.EMBEDDING, parameters)
+ super().__init__(representation_name, ModalityType.EMBEDDING,
parameters)
+ self.layer_name = layer
self.output_file = output_file
self.max_seq_length = max_seq_length
+ self.needs_context = True
+ self.initial_context_length = 350
def transform(self, modality):
transformed_modality = TransformedModality(modality, self)
- model_name = "bert-base-uncased"
- tokenizer = BertTokenizerFast.from_pretrained(
- model_name, clean_up_tokenization_spaces=True
+ tokenizer = AutoTokenizer.from_pretrained(
+ self.model_name, clean_up_tokenization_spaces=True
)
+ self.model =
AutoModel.from_pretrained(self.model_name).to(get_device())
+ self.bert_output = None
+
+ def get_activation(name):
+ def hook(model, input, output):
+ self.bert_output = output.detach().cpu().numpy()
- model = BertModel.from_pretrained(model_name).to(get_device())
+ return hook
- embeddings = self.create_embeddings(modality, model, tokenizer)
+ if self.layer_name != "cls":
+ for name, layer in self.model.named_modules():
+ if name == self.layer_name:
+ layer.register_forward_hook(get_activation(name))
+ break
+
+ if isinstance(modality.data[0], list):
+ embeddings = []
+ for d in modality.data:
+ embeddings.append(self.create_embeddings(d, self.model,
tokenizer))
+ else:
+ embeddings = self.create_embeddings(modality.data, self.model,
tokenizer)
if self.output_file is not None:
save_embeddings(embeddings, self.output_file)
transformed_modality.data_type = np.float32
- transformed_modality.data = np.array(embeddings)
+ transformed_modality.data = embeddings
return transformed_modality
- def create_embeddings(self, modality, model, tokenizer):
- dataset = TextDataset(modality.data)
+ def create_embeddings(self, data, model, tokenizer):
+ dataset = TextDataset(data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False,
collate_fn=None)
cls_embeddings = []
for batch in dataloader:
@@ -94,27 +122,146 @@ class Bert(UnimodalRepresentation):
padding="max_length",
return_attention_mask=True,
truncation=True,
- max_length=512, # TODO: make this dynamic
+ max_length=512, # TODO: make this dynamic with parameter to
tune
)
inputs.to(get_device())
- ModalityType.TEXT.add_field_for_instances(
- modality.metadata,
- "token_to_character_mapping",
- inputs.data["offset_mapping"].tolist(),
- )
-
- ModalityType.TEXT.add_field_for_instances(
- modality.metadata,
- "attention_masks",
- inputs.data["attention_mask"].tolist(),
- )
+ # ModalityType.TEXT.add_field_for_instances(
+ # modality.metadata,
+ # "token_to_character_mapping",
+ # inputs.data["offset_mapping"].tolist(),
+ # )
+ #
+ # ModalityType.TEXT.add_field_for_instances(
+ # modality.metadata,
+ # "attention_masks",
+ # inputs.data["attention_mask"].tolist(),
+ # )
del inputs.data["offset_mapping"]
with torch.no_grad():
outputs = model(**inputs)
-
- cls_embedding =
outputs.last_hidden_state.detach().cpu().numpy()
+ if self.layer_name == "cls":
+ cls_embedding =
outputs.last_hidden_state.detach().cpu().numpy()
+ else:
+ cls_embedding = self.bert_output
cls_embeddings.extend(cls_embedding)
return np.array(cls_embeddings)
+
+
+@register_representation(ModalityType.TEXT)
+class Bert(BertFamily):
+ def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+ parameters = {
+ "layer_name": [
+ "cls",
+ "encoder.layer.0",
+ "encoder.layer.1",
+ "encoder.layer.2",
+ "encoder.layer.3",
+ "encoder.layer.4",
+ "encoder.layer.5",
+ "encoder.layer.6",
+ "encoder.layer.7",
+ "encoder.layer.8",
+ "encoder.layer.9",
+ "encoder.layer.10",
+ "encoder.layer.11",
+ "pooler",
+ "pooler.activation",
+ ]
+ }
+ super().__init__(
+ "Bert", "bert-base-uncased", layer, parameters, output_file,
max_seq_length
+ )
+
+
+@register_representation(ModalityType.TEXT)
+class RoBERTa(BertFamily):
+ def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+ parameters = {
+ "layer_name": [
+ "cls",
+ "encoder.layer.0",
+ "encoder.layer.1",
+ "encoder.layer.2",
+ "encoder.layer.3",
+ "encoder.layer.4",
+ "encoder.layer.5",
+ "encoder.layer.6",
+ "encoder.layer.7",
+ "encoder.layer.8",
+ "encoder.layer.9",
+ "encoder.layer.10",
+ "encoder.layer.11",
+ "pooler",
+ "pooler.activation",
+ ]
+ }
+ super().__init__(
+ "RoBERTa", "roberta-base", layer, parameters, output_file,
max_seq_length
+ )
+
+
+@register_representation(ModalityType.TEXT)
+class DistillBERT(BertFamily):
+ def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+ parameters = {
+ "layer_name": [
+ "cls",
+ "transformer.layer.0",
+ "transformer.layer.1",
+ "transformer.layer.2",
+ "transformer.layer.3",
+ "transformer.layer.4",
+ "transformer.layer.5",
+ ]
+ }
+ super().__init__(
+ "DistillBERT",
+ "distilbert-base-uncased",
+ layer,
+ parameters,
+ output_file,
+ max_seq_length,
+ )
+
+
+@register_representation(ModalityType.TEXT)
+class ALBERT(BertFamily):
+ def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+ parameters = {"layer_name": ["cls", "encoder.albert_layer_groups.0",
"pooler"]}
+ super().__init__(
+ "ALBERT", "albert-base-v2", layer, parameters, output_file,
max_seq_length
+ )
+
+
+@register_representation(ModalityType.TEXT)
+class ELECTRA(BertFamily):
+ def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+ parameters = {
+ "layer_name": [
+ "cls",
+ "encoder.layer.0",
+ "encoder.layer.1",
+ "encoder.layer.2",
+ "encoder.layer.3",
+ "encoder.layer.4",
+ "encoder.layer.5",
+ "encoder.layer.6",
+ "encoder.layer.7",
+ "encoder.layer.8",
+ "encoder.layer.9",
+ "encoder.layer.10",
+ "encoder.layer.11",
+ ]
+ }
+ super().__init__(
+ "ELECTRA",
+ "google/electra-base-discriminator",
+ layer,
+ parameters,
+ output_file,
+ max_seq_length,
+ )
diff --git a/src/main/python/systemds/scuro/representations/clip.py
b/src/main/python/systemds/scuro/representations/clip.py
index 504681f253..a431e52761 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -119,13 +119,20 @@ class CLIPText(UnimodalRepresentation):
)
self.processor =
CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
self.output_file = output_file
+ self.needs_context = True
+ self.initial_context_length = 55
def transform(self, modality):
transformed_modality = TransformedModality(
modality, self, self.output_modality_type
)
- embeddings = self.create_text_embeddings(modality.data, self.model)
+ if isinstance(modality.data[0], list):
+ embeddings = []
+ for d in modality.data:
+ embeddings.append(self.create_text_embeddings(d, self.model))
+ else:
+ embeddings = self.create_text_embeddings(modality.data, self.model)
if self.output_file is not None:
save_embeddings(embeddings, self.output_file)
diff --git a/src/main/python/systemds/scuro/representations/elmo.py
b/src/main/python/systemds/scuro/representations/elmo.py
new file mode 100644
index 0000000000..ba2a99f8e1
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/elmo.py
@@ -0,0 +1,154 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.utils.torch_dataset import CustomDataset
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.drsearch.operator_registry import register_representation
+import torch.utils.data
+import torch
+import numpy as np
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.utils.static_variables import get_device
+from flair.embeddings import ELMoEmbeddings
+from flair.data import Sentence
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+
+
+class TextDataset(Dataset):
+ def __init__(self, texts):
+
+ self.texts = []
+ if isinstance(texts, list):
+ self.texts = texts
+ else:
+ for text in texts:
+ if text is None:
+ self.texts.append("")
+ elif isinstance(text, np.ndarray):
+ self.texts.append(str(text.item()) if text.size == 1 else
str(text))
+ elif not isinstance(text, str):
+ self.texts.append(str(text))
+ else:
+ self.texts.append(text)
+
+ def __len__(self):
+ return len(self.texts)
+
+ def __getitem__(self, idx):
+ return self.texts[idx]
+
+
+# @register_representation([ModalityType.TEXT])
+class ELMoRepresentation(UnimodalRepresentation):
+ def __init__(
+ self, model_name="elmo-original", layer="mix", pooling="mean",
output_file=None
+ ):
+ self.data_type = torch.float32
+ self.model_name = model_name
+ self.layer_name = layer
+ self.pooling = pooling # "mean", "max", "first", "last", or "all" (no
pooling)
+ parameters = self._get_parameters()
+ super().__init__("ELMo", ModalityType.EMBEDDING, parameters)
+
+ self.output_file = output_file
+
+ @property
+ def model_name(self):
+ return self._model_name
+
+ @model_name.setter
+ def model_name(self, model_name):
+ self._model_name = model_name
+
+ if model_name == "elmo-original":
+ self.model = ELMoEmbeddings("original")
+ self.embedding_dim = 1024
+ elif model_name == "elmo-small":
+ self.model = ELMoEmbeddings("small")
+ self.embedding_dim = 256
+ elif model_name == "elmo-medium":
+ self.model = ELMoEmbeddings("medium")
+ self.embedding_dim = 512
+ else:
+ raise NotImplementedError(f"Model {model_name} not supported")
+
+ self.model = self.model.to(get_device())
+
+ def _get_parameters(self):
+ parameters = {
+ "model_name": ["elmo-original", "elmo-small", "elmo-medium"],
+ "layer_name": [
+ "mix",
+ "layer_0",
+ "layer_1",
+ "layer_2",
+ ],
+ "pooling": ["mean", "max", "first", "last", "all"],
+ }
+ return parameters
+
+ def transform(self, modality):
+ transformed_modality = TransformedModality(
+ modality, self, ModalityType.EMBEDDING
+ )
+ dataset = TextDataset(modality.data)
+ dataloader = DataLoader(dataset, batch_size=32, shuffle=False,
collate_fn=None)
+ embeddings = []
+ for batch in dataloader:
+ texts = batch
+ for text in texts:
+ sentence = Sentence(text)
+ self.model.embed(sentence)
+ token_embeddings = []
+ for token in sentence:
+ if self.layer_name == "mix":
+ embedding = token.embedding
+ elif self.layer_name == "layer_0":
+ embedding = token.get_embedding(self.model.name + "-0")
+ elif self.layer_name == "layer_1":
+ embedding = token.get_embedding(self.model.name + "-1")
+ elif self.layer_name == "layer_2":
+ embedding = token.get_embedding(self.model.name + "-2")
+ else:
+ embedding = token.embedding
+
+ token_embeddings.append(embedding.cpu().numpy())
+
+ token_embeddings = np.array(token_embeddings)
+
+ if self.pooling == "mean":
+ sentence_embedding = np.mean(token_embeddings, axis=0)
+ elif self.pooling == "max":
+ sentence_embedding = np.max(token_embeddings, axis=0)
+ elif self.pooling == "first":
+ sentence_embedding = token_embeddings[0]
+ elif self.pooling == "last":
+ sentence_embedding = token_embeddings[-1]
+ elif self.pooling == "all":
+ sentence_embedding = token_embeddings.flatten()
+ else:
+ sentence_embedding = np.mean(token_embeddings, axis=0)
+
+ embeddings.append(sentence_embedding.astype(np.float32))
+
+ transformed_modality.data = np.array(embeddings)
+ return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/glove.py
b/src/main/python/systemds/scuro/representations/glove.py
index 9076efecfc..74f487bd79 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,8 +18,10 @@
# under the License.
#
# -------------------------------------------------------------
+import zipfile
import numpy as np
from gensim.utils import tokenize
+from huggingface_hub import hf_hub_download
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -39,11 +41,17 @@ def load_glove_embeddings(file_path):
return embeddings
-# @register_representation(ModalityType.TEXT)
+@register_representation(ModalityType.TEXT)
class GloVe(UnimodalRepresentation):
- def __init__(self, glove_path, output_file=None):
+ def __init__(self, output_file=None):
super().__init__("GloVe", ModalityType.TEXT)
- self.glove_path = glove_path
+ file_path = hf_hub_download(
+ repo_id="stanfordnlp/glove", filename="glove.6B.zip"
+ )
+ with zipfile.ZipFile(file_path, "r") as zip_ref:
+ zip_ref.extractall("./glove_extracted")
+
+ self.glove_path = "./glove_extracted/glove.6B.100d.txt"
self.output_file = output_file
def transform(self, modality):
@@ -67,6 +75,5 @@ class GloVe(UnimodalRepresentation):
if self.output_file is not None:
save_embeddings(np.array(embeddings), self.output_file)
- transformed_modality.data_type = np.float32
transformed_modality.data = np.array(embeddings)
return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/text_context.py
b/src/main/python/systemds/scuro/representations/text_context.py
new file mode 100644
index 0000000000..b98b90e187
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/text_context.py
@@ -0,0 +1,221 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+
+def _split_into_words(text: str) -> List[str]:
+ """Split text into words on whitespace (runs of whitespace are discarded)."""
+ if not text or not isinstance(text, str):
+ return []
+ return text.split()
+
+
+def _split_into_sentences(text: str) -> List[str]:
+ """
+ Split text into sentences using regex.
+ Handles common sentence endings: . ! ?
+ """
+ if not text or not isinstance(text, str):
+ return []
+
+ sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+ sentences = re.split(sentence_pattern, text.strip())
+
+ sentences = [s.strip() for s in sentences if s.strip()]
+
+ if not sentences:
+ return [text]
+
+ return sentences
+
+
+def _count_words(text: str) -> int:
+ """
+ Count the number of words in a text string.
+ """
+ if not text or not isinstance(text, str):
+ return 0
+ return len(text.split())
+
+
+def _extract_text(instance: Any) -> str:
+ if isinstance(instance, str):
+ text = instance
+ else:
+ text = str(instance)
+
+ if not text or not text.strip():
+ return ""
+ return text
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplit(Context):
+ """
+ Splits text at sentence boundaries while respecting maximum word count.
+
+ Parameters:
+ max_words (int): Maximum number of words per chunk (default: 55)
+ min_words (int): Minimum number of words per chunk before splitting
(default: 10)
+ """
+
+ def __init__(self, max_words=55, min_words=10):
+ parameters = {
+ "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+ "min_words": [10, 20, 30],
+ }
+ super().__init__("SentenceBoundarySplit", parameters)
+ self.max_words = int(max_words)
+ self.min_words = max(1, int(min_words))
+
+ def execute(self, modality):
+ """
+ Split each text instance at sentence boundaries, respecting max_words.
+
+ Returns:
+ List of lists, where each inner list contains text chunks (strings)
+ """
+ chunked_data = []
+
+ for instance in modality.data:
+ text = _extract_text(instance)
+ if not text:
+ chunked_data.append([""])
+ continue
+
+ sentences = _split_into_sentences(text)
+
+ if not sentences:
+ chunked_data.append([text])
+ continue
+
+ chunks = []
+ current_chunk = []
+ current_word_count = 0
+
+ for sentence in sentences:
+ sentence_word_count = _count_words(sentence)
+
+ if sentence_word_count > self.max_words:
+ if current_chunk and current_word_count >= self.min_words:
+ chunks.append("".join(current_chunk))
+ current_chunk = []
+ current_word_count = 0
+
+ words = _split_into_words(sentence)
+ for i in range(0, len(words), self.max_words):
+ chunk_words = words[i : i + self.max_words]
+ chunks.append(" ".join(chunk_words))
+
+ elif current_word_count + sentence_word_count > self.max_words:
+ if current_chunk and current_word_count >= self.min_words:
+ chunks.append(" ".join(current_chunk))
+ current_chunk = [sentence]
+ current_word_count = sentence_word_count
+ else:
+ current_chunk.append(sentence)
+ current_word_count += sentence_word_count
+ else:
+ current_chunk.append(sentence)
+ current_word_count += sentence_word_count
+
+ # Add remaining chunk
+ if current_chunk:
+ chunks.append(" ".join(current_chunk))
+
+ if not chunks:
+ chunks = [text]
+
+ chunked_data.append(chunks)
+
+ return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplit(Context):
+ """
+ Splits text with overlapping chunks using a sliding window approach.
+
+ Parameters:
+ max_words (int): Maximum number of words per chunk (default: 55)
+ overlap (float): fraction of max_words that overlaps between chunks; only used to derive the stride when stride is None
(default: 0.5)
+ stride (int, optional): Step size in words. If None, stride =
max_words - overlap_words
+ """
+
+ def __init__(self, max_words=55, overlap=0.5, stride=None):
+ overlap_words = int(max_words * overlap)
+ if stride is None:
+ stride = max_words - overlap_words
+
+ parameters = {
+ "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+ "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+ "stride": [10, 15, 20, 30],
+ }
+ super().__init__("OverlappingSplit", parameters)
+ self.max_words = max_words
+ self.overlap = overlap
+ self.stride = stride
+
+ def execute(self, modality):
+ """
+ Split each text instance with overlapping chunks.
+
+ Returns:
+ List of lists, where each inner list contains text chunks (strings)
+ """
+ chunked_data = []
+
+ for instance in modality.data:
+ text = _extract_text(instance)
+ if not text:
+ chunked_data.append("")
+ continue
+
+ words = _split_into_words(text)
+
+ if len(words) <= self.max_words:
+ chunked_data.append([text])
+ continue
+
+ chunks = []
+
+ # Create overlapping chunks with specified stride
+ for i in range(0, len(words), self.stride):
+ chunk_words = words[i : i + self.max_words]
+ if chunk_words:
+ chunk_text = " ".join(chunk_words)
+ chunks.append(chunk_text)
+
+ if i + self.max_words >= len(words):
+ break
+
+ if not chunks:
+ chunks = [text]
+
+ chunked_data.append(chunks)
+
+ return chunked_data
diff --git
a/src/main/python/systemds/scuro/representations/text_context_with_indices.py
b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
new file mode 100644
index 0000000000..cc7070306b
--- /dev/null
+++
b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
@@ -0,0 +1,300 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+# TODO: Use this to get indices for text chunks based on different splitting
+# strategies. To use this approach, a different extraction of text chunks is
+# needed in either the TextModality or the Representations.
+
+
+def _split_into_words(text: str) -> List[str]:
+    """Return the whitespace-separated words of ``text``.
+
+    Runs of whitespace act as a single separator (``str.split()`` semantics).
+    Empty or non-string input yields an empty list.
+    """
+    is_usable = isinstance(text, str) and bool(text)
+    return text.split() if is_usable else []
+
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Break ``text`` into sentences with a regular expression.
+
+    A boundary is whitespace that follows ``.``, ``!`` or ``?`` and precedes
+    an uppercase ASCII letter, or the end of the text directly after such a
+    terminator. Every sentence is stripped and blank pieces are dropped.
+    Empty or non-string input gives an empty list; if no non-blank sentence
+    remains, the original text is returned as the only element.
+    """
+    if not isinstance(text, str) or not text:
+        return []
+
+    sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+    pieces = (piece.strip() for piece in re.split(sentence_pattern, text.strip()))
+    sentences = [piece for piece in pieces if piece]
+
+    return sentences or [text]
+
+
+def _count_words(text: str) -> int:
+    """
+    Return how many whitespace-separated words ``text`` contains.
+
+    Empty or non-string input counts as zero words.
+    """
+    if isinstance(text, str) and text:
+        return len(text.split())
+    return 0
+
+
+def _extract_text(instance: Any) -> str:
+    """Coerce ``instance`` to a string; return "" when the result is blank.
+
+    Strings are used as they are, anything else goes through ``str()``.
+    Non-blank text is returned unmodified (it is not stripped).
+    """
+    text = instance if isinstance(instance, str) else str(instance)
+    return text if text.strip() else ""
+
+
+# @register_context_operator(ModalityType.TEXT)
+class WordCountSplitIndices(Context):
+    """
+    Splits text after a fixed number of words and reports every chunk as a
+    (start, end) character span into the text.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (int): Number of overlapping words between chunks (default: 0)
+    """
+
+    def __init__(self, max_words=55, overlap=0):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "overlap": [0, 10, 20, 30],
+        }
+        super().__init__("WordCountSplit", parameters)
+        self.max_words = int(max_words)
+        self.overlap = max(0, int(overlap))
+
+    def execute(self, modality):
+        """
+        Split each text instance into chunks of max_words words.
+
+        Spans are taken from the position of each word in the text itself, so
+        they stay valid for overlapping windows and for any kind or amount of
+        whitespace between words.
+
+        Returns:
+            List with one entry per instance; each entry is a list of
+            (start, end) tuples such that text[start:end] is the chunk
+        """
+        chunked_data = []
+        # overlap >= max_words would hand range() a non-positive step.
+        stride = max(1, self.max_words - self.overlap)
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+
+            if not text:
+                # Same shape as every other path: a list of spans.
+                chunked_data.append([(0, 0)])
+                continue
+
+            # Character span of every word in the text.
+            spans = [match.span() for match in re.finditer(r"\S+", text)]
+
+            if len(spans) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            for i in range(0, len(spans), stride):
+                window = spans[i : i + self.max_words]
+                if window:
+                    # From the first character of the first word to the last
+                    # character of the last word in the window.
+                    chunks.append((window[0][0], window[-1][1]))
+
+                if i + self.max_words >= len(spans):
+                    break
+
+            # Only reachable with a non-positive max_words.
+            chunked_data.append(chunks or [(0, len(text))])
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplitIndices(Context):
+    """
+    Splits text at sentence boundaries while respecting a maximum word count
+    and reports every chunk as a (start, end) character span into the text.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        min_words (int): Minimum number of words a chunk should hold before it
+            is closed in favour of a new one (default: 10)
+        overlap (float): Fraction of max_words used to derive ``stride``
+            (default: 0.1); ``execute`` does not use either value
+    """
+
+    def __init__(self, max_words=55, min_words=10, overlap=0.1):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "min_words": [10, 20, 30],
+        }
+        super().__init__("SentenceBoundarySplit", parameters)
+        self.max_words = int(max_words)
+        self.min_words = max(1, int(min_words))
+        self.overlap = overlap
+        self.stride = max(1, int(max_words * (1 - overlap)))
+
+    def execute(self, modality):
+        """
+        Split each text instance at sentence boundaries, respecting max_words.
+
+        Consecutive sentences are merged into one span until adding the next
+        sentence would exceed max_words; a span that has not reached min_words
+        yet keeps growing instead. A single sentence longer than max_words is
+        cut into pieces of max_words words.
+
+        Returns:
+            List with one entry per instance; each entry is a list of
+            (start, end) tuples such that text[start:end] is the chunk
+        """
+        chunked_data = []
+        # Guard the slicing step against a non-positive max_words.
+        piece_size = max(1, self.max_words)
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            sentences = _split_into_sentences(text)
+
+            if not sentences:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            current_chunk = None
+            current_word_count = 0
+            cursor = 0
+            for sentence in sentences:
+                # Locate the sentence in the text itself so the offsets hold
+                # for any amount of whitespace between sentences.
+                found = text.find(sentence, cursor)
+                sentence_start = found if found >= 0 else cursor
+                sentence_end = sentence_start + len(sentence)
+                cursor = sentence_end
+                sentence_word_count = _count_words(sentence)
+
+                if sentence_word_count > self.max_words:
+                    # Close whatever is pending so no text is lost, then cut
+                    # the oversized sentence into word-bounded pieces.
+                    if current_chunk:
+                        chunks.append(current_chunk)
+                    spans = [m.span() for m in re.finditer(r"\S+", sentence)]
+                    for i in range(0, len(spans), piece_size):
+                        window = spans[i : i + piece_size]
+                        chunks.append(
+                            (
+                                sentence_start + window[0][0],
+                                sentence_start + window[-1][1],
+                            )
+                        )
+                    current_chunk = None
+                    current_word_count = 0
+                elif (
+                    current_chunk
+                    and current_word_count >= self.min_words
+                    and current_word_count + sentence_word_count > self.max_words
+                ):
+                    # The pending chunk is full enough: close it and start a
+                    # new one with this sentence.
+                    chunks.append(current_chunk)
+                    current_chunk = (sentence_start, sentence_end)
+                    current_word_count = sentence_word_count
+                else:
+                    # Start a chunk or extend the pending one to this sentence.
+                    chunk_start = current_chunk[0] if current_chunk else sentence_start
+                    current_chunk = (chunk_start, sentence_end)
+                    current_word_count += sentence_word_count
+
+            # Add remaining chunk
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            chunked_data.append(chunks or [(0, len(text))])
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplitIndices(Context):
+    """
+    Splits text into overlapping chunks using a sliding window over words and
+    reports every chunk as a (start, end) character span into the text.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (float): Fraction of max_words shared by consecutive chunks
+            (default: 0.5, i.e. 50%)
+        stride (int, optional): Step size in words. If None,
+            stride = max_words - int(max_words * overlap)
+    """
+
+    def __init__(self, max_words=55, overlap=0.5, stride=None):
+        # Coerce to int so slicing and range() also work when a float is passed.
+        max_words = int(max_words)
+        if stride is None:
+            stride = max_words - int(max_words * overlap)
+
+        parameters = {
+            "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+            "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "stride": [10, 15, 20, 30],
+        }
+        super().__init__("OverlappingSplit", parameters)
+        self.max_words = max_words
+        self.overlap = overlap
+        # range() raises on a step of 0 and yields nothing for a negative step.
+        self.stride = max(1, int(stride))
+
+    def execute(self, modality):
+        """
+        Split each text instance into overlapping chunks.
+
+        Spans are taken from the position of each word in the text itself, so
+        they are correct for every stride (including stride >= max_words) and
+        for any kind or amount of whitespace between words.
+
+        Returns:
+            List with one entry per instance; each entry is a list of
+            (start, end) tuples such that text[start:end] is the chunk
+        """
+        chunked_data = []
+        # Re-check here in case the attribute was changed after construction.
+        stride = max(1, int(self.stride))
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                # Same shape as every other path: a list of spans.
+                chunked_data.append([(0, 0)])
+                continue
+
+            # Character span of every word in the text.
+            spans = [match.span() for match in re.finditer(r"\S+", text)]
+
+            if len(spans) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+
+            # Slide a window of max_words words forward by stride words.
+            for i in range(0, len(spans), stride):
+                window = spans[i : i + self.max_words]
+                if window:
+                    chunks.append((window[0][0], window[-1][1]))
+
+                if i + self.max_words >= len(spans):
+                    break
+
+            # Only reachable with a non-positive max_words.
+            chunked_data.append(chunks or [(0, len(text))])
+
+        return chunked_data
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py
b/src/main/python/systemds/scuro/representations/unimodal.py
index 362888aa27..2bb34733e2 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -38,6 +38,8 @@ class UnimodalRepresentation(Representation):
if parameters is None:
parameters = {}
self.self_contained = self_contained
+ self.needs_context = False
+ self.initial_context_length = None
@abc.abstractmethod
def transform(self, data):
diff --git
a/src/main/python/systemds/scuro/representations/window_aggregation.py
b/src/main/python/systemds/scuro/representations/window_aggregation.py
index f40b28ea87..4d4ec19c5b 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -59,11 +59,11 @@ class Window(Context):
self._aggregation_function = Aggregation(value)
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
class WindowAggregation(Window):
def __init__(self, aggregation_function="mean", window_size=10, pad=False):
super().__init__("WindowAggregation", aggregation_function)
- self.parameters["window_size"] = [window_size]
+ self.parameters["window_size"] = [5, 10, 15, 25, 50, 100]
self.window_size = int(window_size)
self.pad = pad
@@ -167,7 +167,7 @@ class WindowAggregation(Window):
return np.array(result)
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
class StaticWindow(Window):
def __init__(self, aggregation_function="mean", num_windows=100):
super().__init__("StaticWindow", aggregation_function)
@@ -198,7 +198,7 @@ class StaticWindow(Window):
return np.array(windowed_data)
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
class DynamicWindow(Window):
def __init__(self, aggregation_function="mean", num_windows=100):
super().__init__("DynamicWindow", aggregation_function)
diff --git a/src/main/python/tests/scuro/data_generator.py
b/src/main/python/tests/scuro/data_generator.py
index 5bec163fe7..9da0aa82c0 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -26,6 +26,11 @@ from scipy.io.wavfile import write
import random
import os
+from sklearn import svm
+from sklearn.metrics import classification_report
+from sklearn.model_selection import train_test_split
+
+from systemds.scuro.models.model import Model
from systemds.scuro.dataloader.base_loader import BaseLoader
from systemds.scuro.dataloader.video_loader import VideoLoader
from systemds.scuro.dataloader.audio_loader import AudioLoader
@@ -33,6 +38,7 @@ from systemds.scuro.dataloader.text_loader import TextLoader
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.drsearch.task import Task
class TestDataLoader(BaseLoader):
@@ -130,7 +136,7 @@ class ModalityRandomDataGenerator:
}
return data, metadata
- def create_text_data(self, num_instances):
+ def create_text_data(self, num_instances, num_sentences_per_instance=1):
subjects = [
"The cat",
"A dog",
@@ -172,18 +178,24 @@ class ModalityRandomDataGenerator:
"precisely",
"methodically",
]
+ punctuation = [".", "?", "!"]
sentences = []
for _ in range(num_instances):
- include_adverb = np.random.random() < 0.7
-
- subject = np.random.choice(subjects)
- verb = np.random.choice(verbs)
- obj = np.random.choice(objects)
- adverb = np.random.choice(adverbs) if include_adverb else ""
-
- sentence = f"{subject} {adverb} {verb} {obj}"
-
+ sentence = ""
+ for i in range(num_sentences_per_instance):
+ include_adverb = np.random.random() < 0.7
+
+ subject = np.random.choice(subjects)
+ verb = np.random.choice(verbs)
+ obj = np.random.choice(objects)
+ adverb = np.random.choice(adverbs) if include_adverb else ""
+ punct = np.random.choice(punctuation)
+
+ sentence += " " if i > 0 else ""
+ sentence += f"{subject}"
+ sentence += f" {adverb}" if include_adverb else ""
+ sentence += f" {verb} {obj}{punct}"
sentences.append(sentence)
metadata = {
@@ -382,3 +394,57 @@ class TestDataGenerator:
audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)
write(path, sample_rate, audio_data)
+
+
+class TestSVM(Model):
+    """SVM-backed model for the scuro tests that reports accuracy only."""
+
+    def __init__(self, name):
+        super().__init__(name)
+
+    def fit(self, X, y, X_test, y_test):
+        """
+        Train an RBF-kernel SVC on (X, y).
+
+        Inputs with more than two dimensions are flattened per instance.
+        X_test and y_test are accepted but not used.
+
+        Returns:
+            ({"accuracy": <accuracy on the training data>}, 0)
+        """
+        features = X.reshape(X.shape[0], -1) if X.ndim > 2 else X
+        classifier = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
+        self.clf = classifier.fit(features, np.array(y))
+        predictions = self.clf.predict(features)
+
+        report = classification_report(
+            y, predictions, output_dict=True, digits=3, zero_division=1
+        )
+        return {"accuracy": report["accuracy"]}, 0
+
+    def test(self, test_X: np.ndarray, test_y: np.ndarray):
+        """
+        Score the fitted classifier on (test_X, test_y).
+
+        Inputs with more than two dimensions are flattened per instance.
+
+        Returns:
+            ({"accuracy": <accuracy on the given data>}, 0)
+        """
+        features = test_X
+        if features.ndim > 2:
+            features = features.reshape(features.shape[0], -1)
+        predictions = self.clf.predict(np.array(features))  # noqa
+
+        report = classification_report(
+            np.array(test_y), predictions, output_dict=True, digits=3, zero_division=1
+        )
+        return {"accuracy": report["accuracy"]}, 0
+
+
+class TestTask(Task):
+    """
+    Task fixture for the scuro tests.
+
+    Creates balanced labels for num_instances instances, splits the instance
+    indices 80/20 (stratified by label, fixed random_state) and wires a
+    TestSVM named model_name into the Task.
+    """
+
+    def __init__(self, name, model_name, num_instances):
+        # The labels have to cover exactly the indices that are split below;
+        # a fixed label count would make train_test_split fail for any other
+        # num_instances.
+        self.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=num_instances
+        )
+        split = train_test_split(
+            np.array(range(num_instances)),
+            self.labels,
+            test_size=0.2,
+            random_state=42,
+            stratify=self.labels,
+        )
+        # Store plain Python ints rather than numpy integers.
+        self.train_indizes = [int(i) for i in split[0]]
+        self.val_indizes = [int(i) for i in split[1]]
+
+        super().__init__(
+            name,
+            TestSVM(model_name),
+            self.labels,
+            self.train_indizes,
+            self.val_indizes,
+        )
diff --git a/src/main/python/tests/scuro/test_hp_tuner.py
b/src/main/python/tests/scuro/test_hp_tuner.py
index 8484a352e4..f163498dab 100644
--- a/src/main/python/tests/scuro/test_hp_tuner.py
+++ b/src/main/python/tests/scuro/test_hp_tuner.py
@@ -22,17 +22,12 @@
import unittest
import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
from systemds.scuro.representations.average import Average
from systemds.scuro.representations.concatenation import Concatenation
from systemds.scuro.representations.lstm import LSTM
from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
from systemds.scuro.representations.spectrogram import Spectrogram
@@ -45,70 +40,15 @@ from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.representations.bow import BoW
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator,
TestDataLoader
+from tests.scuro.data_generator import (
+ ModalityRandomDataGenerator,
+ TestDataLoader,
+ TestTask,
+)
from systemds.scuro.modality.type import ModalityType
from systemds.scuro.drsearch.hyperparameter_tuner import HyperparameterTuner
-
-class TestSVM(Model):
- def __init__(self):
- super().__init__("TestSVM")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa]
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
-class TestSVM2(Model):
- def __init__(self):
- super().__init__("TestSVM2")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
from unittest.mock import patch
@@ -120,36 +60,10 @@ class TestHPTuner(unittest.TestCase):
def setUpClass(cls):
cls.num_instances = 10
cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
- cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
- num_instances=cls.num_instances
- )
cls.indices = np.array(range(cls.num_instances))
-
- split = train_test_split(
- cls.indices,
- cls.labels,
- test_size=0.2,
- random_state=42,
- )
- cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
- int(i) for i in split[1]
- ]
-
cls.tasks = [
- Task(
- "UnimodalRepresentationTask1",
- TestSVM(),
- cls.labels,
- cls.train_indizes,
- cls.val_indizes,
- ),
- Task(
- "UnimodalRepresentationTask2",
- TestSVM2(),
- cls.labels,
- cls.train_indizes,
- cls.val_indizes,
- ),
+ TestTask("UnimodalRepresentationTask1", "TestSVM1",
cls.num_instances),
+ TestTask("UnimodalRepresentationTask2", "TestSVM2",
cls.num_instances),
]
def test_hp_tuner_for_audio_modality(self):
diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py
b/src/main/python/tests/scuro/test_multimodal_fusion.py
index f98824a16a..e89843afcd 100644
--- a/src/main/python/tests/scuro/test_multimodal_fusion.py
+++ b/src/main/python/tests/scuro/test_multimodal_fusion.py
@@ -22,8 +22,6 @@
import unittest
import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
@@ -32,7 +30,6 @@ from systemds.scuro.representations.concatenation import
Concatenation
from systemds.scuro.representations.lstm import LSTM
from systemds.scuro.representations.average import Average
from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
from systemds.scuro.drsearch.task import Task
from systemds.scuro.representations.spectrogram import Spectrogram
@@ -43,70 +40,13 @@ from
systemds.scuro.representations.timeseries_representations import Min, Max
from tests.scuro.data_generator import (
TestDataLoader,
ModalityRandomDataGenerator,
+ TestTask,
)
from systemds.scuro.modality.type import ModalityType
from unittest.mock import patch
-class TestSVM(Model):
- def __init__(self):
- super().__init__("TestSVM")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
-class TestCNN(Model):
- def __init__(self):
- super().__init__("TestCNN")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
class TestMultimodalRepresentationOptimizer(unittest.TestCase):
test_file_path = None
data_generator = None
@@ -116,30 +56,10 @@ class
TestMultimodalRepresentationOptimizer(unittest.TestCase):
def setUpClass(cls):
cls.num_instances = 10
cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
- cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
- num_instances=cls.num_instances
- )
cls.indices = np.array(range(cls.num_instances))
- split = train_test_split(
- cls.indices,
- cls.labels,
- test_size=0.2,
- random_state=42,
- stratify=cls.labels,
- )
- cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
- int(i) for i in split[1]
- ]
-
def test_multimodal_fusion(self):
- task = Task(
- "MM_Fusion_Task1",
- TestSVM(),
- self.labels,
- self.train_indizes,
- self.val_indizes,
- )
+ task = TestTask("MM_Fusion_Task1", "Test1", self.num_instances)
audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
self.num_instances, 1000
@@ -199,13 +119,7 @@ class
TestMultimodalRepresentationOptimizer(unittest.TestCase):
)
def test_parallel_multimodal_fusion(self):
- task = Task(
- "MM_Fusion_Task1",
- TestSVM(),
- self.labels,
- self.train_indizes,
- self.val_indizes,
- )
+ task = TestTask("MM_Fusion_Task1", "Test2", self.num_instances)
audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
self.num_instances, 1000
diff --git a/src/main/python/tests/scuro/test_operator_registry.py
b/src/main/python/tests/scuro/test_operator_registry.py
index c33eb5fcc2..2edada0739 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,7 +21,14 @@
import unittest
-from systemds.scuro import FrequencyMagnitude
+from systemds.scuro.representations.text_context import (
+ SentenceBoundarySplit,
+ OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+ SentenceBoundarySplitIndices,
+ OverlappingSplitIndices,
+)
from systemds.scuro.representations.covarep_audio_features import (
ZeroCrossing,
Spectral,
@@ -124,11 +131,17 @@ class TestOperatorRegistry(unittest.TestCase):
def test_context_operator_in_registry(self):
registry = Registry()
- assert registry.get_context_operators() == [
+ assert registry.get_context_operators(ModalityType.TIMESERIES) == [
WindowAggregation,
StaticWindow,
DynamicWindow,
]
+ assert registry.get_context_operators(ModalityType.TEXT) == [
+ SentenceBoundarySplit,
+ OverlappingSplit,
+ SentenceBoundarySplitIndices,
+ OverlappingSplitIndices,
+ ]
# def test_fusion_operator_in_registry(self):
# registry = Registry()
diff --git a/src/main/python/tests/scuro/test_text_context_operators.py
b/src/main/python/tests/scuro/test_text_context_operators.py
new file mode 100644
index 0000000000..1f04165407
--- /dev/null
+++ b/src/main/python/tests/scuro/test_text_context_operators.py
@@ -0,0 +1,113 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+
+import unittest
+from systemds.scuro.representations.text_context import (
+ SentenceBoundarySplit,
+ OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+ SentenceBoundarySplitIndices,
+ OverlappingSplitIndices,
+)
+from tests.scuro.data_generator import (
+ ModalityRandomDataGenerator,
+ TestDataLoader,
+ TestTask,
+)
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TestTextContextOperator(unittest.TestCase):
+ """Checks the text context operators on randomly generated multi-sentence text."""
+
+ @classmethod
+ def setUpClass(cls):
+ # 10 text instances with 50 generated sentences each, wrapped in a
+ # TEXT modality whose raw data is extracted once for all tests.
+ cls.data_generator = ModalityRandomDataGenerator()
+ cls.data, cls.md = cls.data_generator.create_text_data(10, 50)
+ cls.text_modality = UnimodalModality(
+ TestDataLoader(
+ [i for i in range(0, 10)],
+ None,
+ ModalityType.TEXT,
+ cls.data,
+ str,
+ cls.md,
+ )
+ )
+ cls.text_modality.extract_raw_data()
+ cls.task = TestTask("TextContextTask", "Test1", 10)
+
+ def test_sentence_boundary_split(self):
+ # Chunks must hold at most 10 space-separated words and end with a
+ # sentence terminator.
+ sentence_boundary_split = SentenceBoundarySplit(10, min_words=4)
+ chunks = sentence_boundary_split.execute(self.text_modality)
+ for i in range(0, len(chunks)):
+ for chunk in chunks[i]:
+ assert len(chunk.split(" ")) <= 10 and (
+ chunk[-1] == "." or chunk[-1] == "!" or chunk[-1] == "?"
+ )
+
+ def test_overlapping_split(self):
+ # max_words=40 with overlap=0.05 gives int(40 * 0.05) = 2 shared words
+ # between consecutive chunks.
+ overlapping_split = OverlappingSplit(40, 0.05)
+ chunks = overlapping_split.execute(self.text_modality)
+ for i in range(len(chunks)):
+ prev_chunk = ""
+ for j, chunk in enumerate(chunks[i]):
+ if j > 0:
+ prev_words = prev_chunk.split(" ")
+ curr_words = chunk.split(" ")
+ assert prev_words[-2:] == curr_words[:2]
+ prev_chunk = chunk
+ assert len(chunk.split(" ")) <= 40
+
+ def test_sentence_boundary_split_indices(self):
+ # Same expectations as test_sentence_boundary_split, checked on the
+ # text sliced out of the instance with the returned (start, end) spans.
+ sentence_boundary_split = SentenceBoundarySplitIndices(10, min_words=4)
+ chunks = sentence_boundary_split.execute(self.text_modality)
+ for i in range(0, len(chunks)):
+ for chunk in chunks[i]:
+ text = self.text_modality.data[i][chunk[0] : chunk[1]].split("
")
+ assert len(text) <= 10 and (
+ text[-1][-1] == "." or text[-1][-1] == "!" or text[-1][-1]
== "?"
+ )
+
+ def test_overlapping_split_indices(self):
+ # max_words=40 with overlap=0.1 gives int(40 * 0.1) = 4 shared words
+ # between the texts addressed by consecutive spans.
+ overlapping_split = OverlappingSplitIndices(40, 0.1)
+ chunks = overlapping_split.execute(self.text_modality)
+ for i in range(len(chunks)):
+ prev_chunk = (0, 0)
+ for j, chunk in enumerate(chunks[i]):
+ if j > 0:
+ prev_words = self.text_modality.data[i][
+ prev_chunk[0] : prev_chunk[1]
+ ].split(" ")
+ curr_words = self.text_modality.data[i][chunk[0] :
chunk[1]].split(
+ " "
+ )
+ assert prev_words[-4:] == curr_words[:4]
+ prev_chunk = chunk
+ assert (
+ len(self.text_modality.data[i][chunk[0] :
chunk[1]].split(" "))
+ <= 40
+ )
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py
b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index ca54ee64b1..0d8ae90177 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -23,17 +23,11 @@
import unittest
import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
-
from systemds.scuro.representations.timeseries_representations import (
Mean,
ACF,
)
from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
from systemds.scuro.representations.spectrogram import Spectrogram
@@ -44,69 +38,14 @@ from systemds.scuro.representations.word2vec import W2V
from systemds.scuro.representations.bow import BoW
from systemds.scuro.modality.unimodal_modality import UnimodalModality
from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator,
TestDataLoader
+from tests.scuro.data_generator import (
+ ModalityRandomDataGenerator,
+ TestDataLoader,
+ TestTask,
+)
from systemds.scuro.modality.type import ModalityType
-
-class TestSVM(Model):
- def __init__(self):
- super().__init__("TestSVM")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
-class TestCNN(Model):
- def __init__(self):
- super().__init__("TestCNN")
-
- def fit(self, X, y, X_test, y_test):
- if X.ndim > 2:
- X = X.reshape(X.shape[0], -1)
- self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
- self.clf = self.clf.fit(X, np.array(y))
- y_pred = self.clf.predict(X)
-
- return {
- "accuracy": classification_report(
- y, y_pred, output_dict=True, digits=3, zero_division=1
- )["accuracy"]
- }, 0
-
- def test(self, test_X: np.ndarray, test_y: np.ndarray):
- if test_X.ndim > 2:
- test_X = test_X.reshape(test_X.shape[0], -1)
- y_pred = self.clf.predict(np.array(test_X)) # noqa
-
- return {
- "accuracy": classification_report(
- np.array(test_y), y_pred, output_dict=True, digits=3,
zero_division=1
- )["accuracy"]
- }, 0
-
-
from unittest.mock import patch
@@ -118,36 +57,12 @@ class
TestUnimodalRepresentationOptimizer(unittest.TestCase):
def setUpClass(cls):
cls.num_instances = 10
cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
- cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
- num_instances=cls.num_instances
- )
- cls.indices = np.array(range(cls.num_instances))
- split = train_test_split(
- cls.indices,
- cls.labels,
- test_size=0.2,
- random_state=42,
- )
- cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
- int(i) for i in split[1]
- ]
+ cls.indices = np.array(range(cls.num_instances))
cls.tasks = [
- Task(
- "UnimodalRepresentationTask1",
- TestSVM(),
- cls.labels,
- cls.train_indizes,
- cls.val_indizes,
- ),
- Task(
- "UnimodalRepresentationTask2",
- TestCNN(),
- cls.labels,
- cls.train_indizes,
- cls.val_indizes,
- ),
+ TestTask("UnimodalRepresentationTask1", "Test1",
cls.num_instances),
+ TestTask("UnimodalRepresentationTask2", "Test2",
cls.num_instances),
]
def test_unimodal_optimizer_for_audio_modality(self):