This is an automated email from the ASF dual-hosted git repository.

cdionysio pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 6c163b2ee8 [SYSTEMDS-3835] Add additional text and context operations
6c163b2ee8 is described below

commit 6c163b2ee8f5a3f2c83291ec649d93325a7131d2
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 9 10:34:24 2026 +0100

    [SYSTEMDS-3835] Add additional text and context operations
    
    This patch adds a few additional context operations specifically for the 
text modality as well as new text representations of the BERT family and ELMo.
---
 .github/workflows/python.yml                       |   3 +-
 src/main/python/systemds/scuro/__init__.py         |  27 +-
 .../systemds/scuro/drsearch/operator_registry.py   |  24 +-
 .../systemds/scuro/drsearch/unimodal_optimizer.py  |  41 ++-
 .../python/systemds/scuro/modality/transformed.py  |   2 -
 src/main/python/systemds/scuro/modality/type.py    |  18 +-
 .../systemds/scuro/representations/aggregate.py    |   2 +-
 .../python/systemds/scuro/representations/bert.py  | 221 ++++++++++++---
 .../python/systemds/scuro/representations/clip.py  |   9 +-
 .../python/systemds/scuro/representations/elmo.py  | 154 +++++++++++
 .../python/systemds/scuro/representations/glove.py |  15 +-
 .../systemds/scuro/representations/text_context.py | 221 +++++++++++++++
 .../representations/text_context_with_indices.py   | 300 +++++++++++++++++++++
 .../systemds/scuro/representations/unimodal.py     |   2 +
 .../scuro/representations/window_aggregation.py    |   8 +-
 src/main/python/tests/scuro/data_generator.py      |  86 +++++-
 src/main/python/tests/scuro/test_hp_tuner.py       | 100 +------
 .../python/tests/scuro/test_multimodal_fusion.py   |  92 +------
 .../python/tests/scuro/test_operator_registry.py   |  17 +-
 .../tests/scuro/test_text_context_operators.py     | 113 ++++++++
 .../python/tests/scuro/test_unimodal_optimizer.py  | 101 +------
 21 files changed, 1200 insertions(+), 356 deletions(-)

diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 006e20d488..26a2e35ac4 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -173,7 +173,8 @@ jobs:
           opt-einsum \
           nltk \
           fvcore \
-          scikit-optimize 
+          scikit-optimize \
+          flair 
         kill $KA 
         cd src/main/python
         python -m unittest discover -s tests/scuro -p 'test_*.py' -v
diff --git a/src/main/python/systemds/scuro/__init__.py 
b/src/main/python/systemds/scuro/__init__.py
index 8b5a8621d1..7849c03816 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -30,7 +30,13 @@ from 
systemds.scuro.representations.aggregated_representation import (
     AggregatedRepresentation,
 )
 from systemds.scuro.representations.average import Average
-from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.bert import (
+    Bert,
+    RoBERTa,
+    DistillBERT,
+    ALBERT,
+    ELECTRA,
+)
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.representations.concatenation import Concatenation
 from systemds.scuro.representations.context import Context
@@ -101,6 +107,16 @@ from systemds.scuro.drsearch.multimodal_optimizer import 
MultimodalOptimizer
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 from systemds.scuro.representations.vgg import VGG19
 from systemds.scuro.representations.clip import CLIPText, CLIPVisual
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
+from systemds.scuro.representations.elmo import ELMoRepresentation
+
 
 __all__ = [
     "BaseLoader",
@@ -113,6 +129,10 @@ __all__ = [
     "AggregatedRepresentation",
     "Average",
     "Bert",
+    "RoBERTa",
+    "DistillBERT",
+    "ALBERT",
+    "ELECTRA",
     "BoW",
     "Concatenation",
     "Context",
@@ -177,4 +197,9 @@ __all__ = [
     "VGG19",
     "CLIPVisual",
     "CLIPText",
+    "SentenceBoundarySplit",
+    "OverlappingSplit",
+    "ELMoRepresentation",
+    "SentenceBoundarySplitIndices",
+    "OverlappingSplitIndices",
 ]
diff --git a/src/main/python/systemds/scuro/drsearch/operator_registry.py 
b/src/main/python/systemds/scuro/drsearch/operator_registry.py
index 3b20245956..dc62e9b65b 100644
--- a/src/main/python/systemds/scuro/drsearch/operator_registry.py
+++ b/src/main/python/systemds/scuro/drsearch/operator_registry.py
@@ -33,8 +33,10 @@ class Registry:
 
     _instance = None
     _representations = {}
-    _context_operators = []
+    _context_operators = {}
     _fusion_operators = []
+    _text_context_operators = []
+    _video_context_operators = []
 
     def __new__(cls):
         if not cls._instance:
@@ -60,8 +62,13 @@ class Registry:
     ):
         self._representations[modality].append(representation)
 
-    def add_context_operator(self, context_operator):
-        self._context_operators.append(context_operator)
+    def add_context_operator(self, context_operator, modality_type):
+        if not isinstance(modality_type, list):
+            modality_type = [modality_type]
+        for m_type in modality_type:
+            if not m_type in self._context_operators.keys():
+                self._context_operators[m_type] = []
+            self._context_operators[m_type].append(context_operator)
 
     def add_fusion_operator(self, fusion_operator):
         self._fusion_operators.append(fusion_operator)
@@ -76,9 +83,8 @@ class Registry:
                 reps.append(rep)
         return reps
 
-    def get_context_operators(self):
-        # TODO: return modality specific context operations
-        return self._context_operators
+    def get_context_operators(self, modality_type):
+        return self._context_operators[modality_type]
 
     def get_fusion_operators(self):
         return self._fusion_operators
@@ -121,13 +127,15 @@ def register_representation(modalities: 
Union[ModalityType, List[ModalityType]])
     return decorator
 
 
-def register_context_operator():
+def register_context_operator(modality_type):
     """
     Decorator to register a context operator.
+
+    @param modality_type: The modality type for which the context operator is 
to be registered
     """
 
     def decorator(cls):
-        Registry().add_context_operator(cls)
+        Registry().add_context_operator(cls, modality_type)
         return cls
 
     return decorator
diff --git a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py 
b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
index 1a348a91df..4cde294b17 100644
--- a/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
+++ b/src/main/python/systemds/scuro/drsearch/unimodal_optimizer.py
@@ -87,8 +87,8 @@ class UnimodalOptimizer:
         )
 
     @lru_cache(maxsize=32)
-    def _get_context_operators(self):
-        return self.operator_registry.get_context_operators()
+    def _get_context_operators(self, modality_type):
+        return self.operator_registry.get_context_operators(modality_type)
 
     def store_results(self, file_name=None):
         if file_name is None:
@@ -302,6 +302,39 @@ class UnimodalOptimizer:
         current_node_id = rep_node_id
         dags.append(builder.build(current_node_id))
 
+        if operator.needs_context:
+            context_operators = 
self._get_context_operators(modality.modality_type)
+            for context_op in context_operators:
+                if operator.initial_context_length is not None:
+                    context_length = operator.initial_context_length
+
+                    context_node_id = builder.create_operation_node(
+                        context_op,
+                        [leaf_id],
+                        context_op(context_length).get_current_parameters(),
+                    )
+                else:
+                    context_node_id = builder.create_operation_node(
+                        context_op,
+                        [leaf_id],
+                        context_op().get_current_parameters(),
+                    )
+
+                context_rep_node_id = builder.create_operation_node(
+                    operator.__class__,
+                    [context_node_id],
+                    operator.get_current_parameters(),
+                )
+
+                agg_operator = AggregatedRepresentation()
+                context_agg_node_id = builder.create_operation_node(
+                    agg_operator.__class__,
+                    [context_rep_node_id],
+                    agg_operator.get_current_parameters(),
+                )
+
+                dags.append(builder.build(context_agg_node_id))
+
         if not operator.self_contained:
             not_self_contained_reps = self._get_not_self_contained_reps(
                 modality.modality_type
@@ -344,7 +377,7 @@ class UnimodalOptimizer:
 
     def default_context_operators(self, modality, builder, leaf_id, 
current_node_id):
         dags = []
-        context_operators = self._get_context_operators()
+        context_operators = self._get_context_operators(modality.modality_type)
         for context_op in context_operators:
             if (
                 modality.modality_type != ModalityType.TEXT
@@ -368,7 +401,7 @@ class UnimodalOptimizer:
 
     def temporal_context_operators(self, modality, builder, leaf_id, 
current_node_id):
         aggregators = 
self.operator_registry.get_representations(modality.modality_type)
-        context_operators = self._get_context_operators()
+        context_operators = self._get_context_operators(modality.modality_type)
 
         dags = []
         for agg in aggregators:
diff --git a/src/main/python/systemds/scuro/modality/transformed.py 
b/src/main/python/systemds/scuro/modality/transformed.py
index f7739f03df..3b01465302 100644
--- a/src/main/python/systemds/scuro/modality/transformed.py
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -18,8 +18,6 @@
 # under the License.
 #
 # -------------------------------------------------------------
-from functools import reduce
-from operator import or_
 from typing import Union, List
 
 from systemds.scuro.modality.type import ModalityType
diff --git a/src/main/python/systemds/scuro/modality/type.py 
b/src/main/python/systemds/scuro/modality/type.py
index c2fe38176f..23d97e869b 100644
--- a/src/main/python/systemds/scuro/modality/type.py
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -108,8 +108,12 @@ class ModalitySchemas:
                 shape = data.shape
         elif data_layout is DataLayout.NESTED_LEVEL:
             if data_is_single_instance:
-                dtype = data.dtype
-                shape = data.shape
+                if isinstance(data, list):
+                    dtype = type(data[0])
+                    shape = (len(data), len(data[0]))
+                else:
+                    dtype = data.dtype
+                    shape = data.shape
             else:
                 shape = data[0].shape
                 dtype = data[0].dtype
@@ -306,13 +310,15 @@ class DataLayout(Enum):
             return None
 
         if data_is_single_instance:
-            if (
+            if (isinstance(data, list) and not isinstance(data[0], str)) or (
+                isinstance(data, np.ndarray) and data.ndim == 1
+            ):
+                return DataLayout.SINGLE_LEVEL
+            elif (
                 isinstance(data, list)
                 or isinstance(data, np.ndarray)
-                and data.ndim == 1
+                or isinstance(data, torch.Tensor)
             ):
-                return DataLayout.SINGLE_LEVEL
-            elif isinstance(data, np.ndarray) or isinstance(data, 
torch.Tensor):
                 return DataLayout.NESTED_LEVEL
 
         if isinstance(data[0], list):
diff --git a/src/main/python/systemds/scuro/representations/aggregate.py 
b/src/main/python/systemds/scuro/representations/aggregate.py
index 0a8438e684..9503a48587 100644
--- a/src/main/python/systemds/scuro/representations/aggregate.py
+++ b/src/main/python/systemds/scuro/representations/aggregate.py
@@ -71,7 +71,7 @@ class Aggregation:
         max_len = 0
         for i, instance in enumerate(modality.data):
             data.append([])
-            if isinstance(instance, np.ndarray):
+            if isinstance(instance, np.ndarray) or isinstance(instance, list):
                 if (
                     modality.modality_type == ModalityType.IMAGE
                     or modality.modality_type == ModalityType.VIDEO
diff --git a/src/main/python/systemds/scuro/representations/bert.py 
b/src/main/python/systemds/scuro/representations/bert.py
index 4d486bff59..be579c0dd6 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -22,7 +22,7 @@ import numpy as np
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
-from transformers import BertTokenizerFast, BertModel
+from transformers import AutoTokenizer, AutoModel
 from systemds.scuro.representations.utils import save_embeddings
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.operator_registry import register_representation
@@ -37,15 +37,18 @@ class TextDataset(Dataset):
     def __init__(self, texts):
 
         self.texts = []
-        for text in texts:
-            if text is None:
-                self.texts.append("")
-            elif isinstance(text, np.ndarray):
-                self.texts.append(str(text.item()) if text.size == 1 else 
str(text))
-            elif not isinstance(text, str):
-                self.texts.append(str(text))
-            else:
-                self.texts.append(text)
+        if isinstance(texts, list):
+            self.texts = texts
+        else:
+            for text in texts:
+                if text is None:
+                    self.texts.append("")
+                elif isinstance(text, np.ndarray):
+                    self.texts.append(str(text.item()) if text.size == 1 else 
str(text))
+                elif not isinstance(text, str):
+                    self.texts.append(str(text))
+                else:
+                    self.texts.append(text)
 
     def __len__(self):
         return len(self.texts)
@@ -54,36 +57,61 @@ class TextDataset(Dataset):
         return self.texts[idx]
 
 
-@register_representation(ModalityType.TEXT)
-class Bert(UnimodalRepresentation):
-    def __init__(self, model_name="bert", output_file=None, 
max_seq_length=512):
-        parameters = {"model_name": "bert"}
+class BertFamily(UnimodalRepresentation):
+    def __init__(
+        self,
+        representation_name,
+        model_name,
+        layer,
+        parameters={},
+        output_file=None,
+        max_seq_length=512,
+    ):
         self.model_name = model_name
-        super().__init__("Bert", ModalityType.EMBEDDING, parameters)
+        super().__init__(representation_name, ModalityType.EMBEDDING, 
parameters)
 
+        self.layer_name = layer
         self.output_file = output_file
         self.max_seq_length = max_seq_length
+        self.needs_context = True
+        self.initial_context_length = 350
 
     def transform(self, modality):
         transformed_modality = TransformedModality(modality, self)
-        model_name = "bert-base-uncased"
-        tokenizer = BertTokenizerFast.from_pretrained(
-            model_name, clean_up_tokenization_spaces=True
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name, clean_up_tokenization_spaces=True
         )
+        self.model = 
AutoModel.from_pretrained(self.model_name).to(get_device())
+        self.bert_output = None
+
+        def get_activation(name):
+            def hook(model, input, output):
+                self.bert_output = output.detach().cpu().numpy()
 
-        model = BertModel.from_pretrained(model_name).to(get_device())
+            return hook
 
-        embeddings = self.create_embeddings(modality, model, tokenizer)
+        if self.layer_name != "cls":
+            for name, layer in self.model.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_activation(name))
+                    break
+
+        if isinstance(modality.data[0], list):
+            embeddings = []
+            for d in modality.data:
+                embeddings.append(self.create_embeddings(d, self.model, 
tokenizer))
+        else:
+            embeddings = self.create_embeddings(modality.data, self.model, 
tokenizer)
 
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
 
         transformed_modality.data_type = np.float32
-        transformed_modality.data = np.array(embeddings)
+        transformed_modality.data = embeddings
         return transformed_modality
 
-    def create_embeddings(self, modality, model, tokenizer):
-        dataset = TextDataset(modality.data)
+    def create_embeddings(self, data, model, tokenizer):
+        dataset = TextDataset(data)
         dataloader = DataLoader(dataset, batch_size=32, shuffle=False, 
collate_fn=None)
         cls_embeddings = []
         for batch in dataloader:
@@ -94,27 +122,146 @@ class Bert(UnimodalRepresentation):
                 padding="max_length",
                 return_attention_mask=True,
                 truncation=True,
-                max_length=512,  # TODO: make this dynamic
+                max_length=512,  # TODO: make this dynamic with parameter to 
tune
             )
 
             inputs.to(get_device())
-            ModalityType.TEXT.add_field_for_instances(
-                modality.metadata,
-                "token_to_character_mapping",
-                inputs.data["offset_mapping"].tolist(),
-            )
-
-            ModalityType.TEXT.add_field_for_instances(
-                modality.metadata,
-                "attention_masks",
-                inputs.data["attention_mask"].tolist(),
-            )
+            # ModalityType.TEXT.add_field_for_instances(
+            #     modality.metadata,
+            #     "token_to_character_mapping",
+            #     inputs.data["offset_mapping"].tolist(),
+            # )
+            #
+            # ModalityType.TEXT.add_field_for_instances(
+            #     modality.metadata,
+            #     "attention_masks",
+            #     inputs.data["attention_mask"].tolist(),
+            # )
             del inputs.data["offset_mapping"]
 
             with torch.no_grad():
                 outputs = model(**inputs)
-
-                cls_embedding = 
outputs.last_hidden_state.detach().cpu().numpy()
+                if self.layer_name == "cls":
+                    cls_embedding = 
outputs.last_hidden_state.detach().cpu().numpy()
+                else:
+                    cls_embedding = self.bert_output
                 cls_embeddings.extend(cls_embedding)
 
         return np.array(cls_embeddings)
+
+
+@register_representation(ModalityType.TEXT)
+class Bert(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+                "pooler",
+                "pooler.activation",
+            ]
+        }
+        super().__init__(
+            "Bert", "bert-base-uncased", layer, parameters, output_file, 
max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class RoBERTa(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+                "pooler",
+                "pooler.activation",
+            ]
+        }
+        super().__init__(
+            "RoBERTa", "roberta-base", layer, parameters, output_file, 
max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class DistillBERT(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "transformer.layer.0",
+                "transformer.layer.1",
+                "transformer.layer.2",
+                "transformer.layer.3",
+                "transformer.layer.4",
+                "transformer.layer.5",
+            ]
+        }
+        super().__init__(
+            "DistillBERT",
+            "distilbert-base-uncased",
+            layer,
+            parameters,
+            output_file,
+            max_seq_length,
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class ALBERT(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {"layer_name": ["cls", "encoder.albert_layer_groups.0", 
"pooler"]}
+        super().__init__(
+            "ALBERT", "albert-base-v2", layer, parameters, output_file, 
max_seq_length
+        )
+
+
+@register_representation(ModalityType.TEXT)
+class ELECTRA(BertFamily):
+    def __init__(self, layer="cls", output_file=None, max_seq_length=512):
+        parameters = {
+            "layer_name": [
+                "cls",
+                "encoder.layer.0",
+                "encoder.layer.1",
+                "encoder.layer.2",
+                "encoder.layer.3",
+                "encoder.layer.4",
+                "encoder.layer.5",
+                "encoder.layer.6",
+                "encoder.layer.7",
+                "encoder.layer.8",
+                "encoder.layer.9",
+                "encoder.layer.10",
+                "encoder.layer.11",
+            ]
+        }
+        super().__init__(
+            "ELECTRA",
+            "google/electra-base-discriminator",
+            layer,
+            parameters,
+            output_file,
+            max_seq_length,
+        )
diff --git a/src/main/python/systemds/scuro/representations/clip.py 
b/src/main/python/systemds/scuro/representations/clip.py
index 504681f253..a431e52761 100644
--- a/src/main/python/systemds/scuro/representations/clip.py
+++ b/src/main/python/systemds/scuro/representations/clip.py
@@ -119,13 +119,20 @@ class CLIPText(UnimodalRepresentation):
         )
         self.processor = 
CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
         self.output_file = output_file
+        self.needs_context = True
+        self.initial_context_length = 55
 
     def transform(self, modality):
         transformed_modality = TransformedModality(
             modality, self, self.output_modality_type
         )
 
-        embeddings = self.create_text_embeddings(modality.data, self.model)
+        if isinstance(modality.data[0], list):
+            embeddings = []
+            for d in modality.data:
+                embeddings.append(self.create_text_embeddings(d, self.model))
+        else:
+            embeddings = self.create_text_embeddings(modality.data, self.model)
 
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
diff --git a/src/main/python/systemds/scuro/representations/elmo.py 
b/src/main/python/systemds/scuro/representations/elmo.py
new file mode 100644
index 0000000000..ba2a99f8e1
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/elmo.py
@@ -0,0 +1,154 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.utils.torch_dataset import CustomDataset
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.representations.unimodal import UnimodalRepresentation
+from systemds.scuro.drsearch.operator_registry import register_representation
+import torch.utils.data
+import torch
+import numpy as np
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.utils.static_variables import get_device
+from flair.embeddings import ELMoEmbeddings
+from flair.data import Sentence
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader
+
+
+class TextDataset(Dataset):
+    def __init__(self, texts):
+
+        self.texts = []
+        if isinstance(texts, list):
+            self.texts = texts
+        else:
+            for text in texts:
+                if text is None:
+                    self.texts.append("")
+                elif isinstance(text, np.ndarray):
+                    self.texts.append(str(text.item()) if text.size == 1 else 
str(text))
+                elif not isinstance(text, str):
+                    self.texts.append(str(text))
+                else:
+                    self.texts.append(text)
+
+    def __len__(self):
+        return len(self.texts)
+
+    def __getitem__(self, idx):
+        return self.texts[idx]
+
+
+# @register_representation([ModalityType.TEXT])
+class ELMoRepresentation(UnimodalRepresentation):
+    def __init__(
+        self, model_name="elmo-original", layer="mix", pooling="mean", 
output_file=None
+    ):
+        self.data_type = torch.float32
+        self.model_name = model_name
+        self.layer_name = layer
+        self.pooling = pooling  # "mean", "max", "first", "last", or "all" (no 
pooling)
+        parameters = self._get_parameters()
+        super().__init__("ELMo", ModalityType.EMBEDDING, parameters)
+
+        self.output_file = output_file
+
+    @property
+    def model_name(self):
+        return self._model_name
+
+    @model_name.setter
+    def model_name(self, model_name):
+        self._model_name = model_name
+
+        if model_name == "elmo-original":
+            self.model = ELMoEmbeddings("original")
+            self.embedding_dim = 1024
+        elif model_name == "elmo-small":
+            self.model = ELMoEmbeddings("small")
+            self.embedding_dim = 256
+        elif model_name == "elmo-medium":
+            self.model = ELMoEmbeddings("medium")
+            self.embedding_dim = 512
+        else:
+            raise NotImplementedError(f"Model {model_name} not supported")
+
+        self.model = self.model.to(get_device())
+
+    def _get_parameters(self):
+        parameters = {
+            "model_name": ["elmo-original", "elmo-small", "elmo-medium"],
+            "layer_name": [
+                "mix",
+                "layer_0",
+                "layer_1",
+                "layer_2",
+            ],
+            "pooling": ["mean", "max", "first", "last", "all"],
+        }
+        return parameters
+
+    def transform(self, modality):
+        transformed_modality = TransformedModality(
+            modality, self, ModalityType.EMBEDDING
+        )
+        dataset = TextDataset(modality.data)
+        dataloader = DataLoader(dataset, batch_size=32, shuffle=False, 
collate_fn=None)
+        embeddings = []
+        for batch in dataloader:
+            texts = batch
+            for text in texts:
+                sentence = Sentence(text)
+                self.model.embed(sentence)
+                token_embeddings = []
+                for token in sentence:
+                    if self.layer_name == "mix":
+                        embedding = token.embedding
+                    elif self.layer_name == "layer_0":
+                        embedding = token.get_embedding(self.model.name + "-0")
+                    elif self.layer_name == "layer_1":
+                        embedding = token.get_embedding(self.model.name + "-1")
+                    elif self.layer_name == "layer_2":
+                        embedding = token.get_embedding(self.model.name + "-2")
+                    else:
+                        embedding = token.embedding
+
+                    token_embeddings.append(embedding.cpu().numpy())
+
+                token_embeddings = np.array(token_embeddings)
+
+                if self.pooling == "mean":
+                    sentence_embedding = np.mean(token_embeddings, axis=0)
+                elif self.pooling == "max":
+                    sentence_embedding = np.max(token_embeddings, axis=0)
+                elif self.pooling == "first":
+                    sentence_embedding = token_embeddings[0]
+                elif self.pooling == "last":
+                    sentence_embedding = token_embeddings[-1]
+                elif self.pooling == "all":
+                    sentence_embedding = token_embeddings.flatten()
+                else:
+                    sentence_embedding = np.mean(token_embeddings, axis=0)
+
+                embeddings.append(sentence_embedding.astype(np.float32))
+
+        transformed_modality.data = np.array(embeddings)
+        return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/glove.py 
b/src/main/python/systemds/scuro/representations/glove.py
index 9076efecfc..74f487bd79 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,8 +18,10 @@
 # under the License.
 #
 # -------------------------------------------------------------
+import zipfile
 import numpy as np
 from gensim.utils import tokenize
+from huggingface_hub import hf_hub_download
 
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -39,11 +41,17 @@ def load_glove_embeddings(file_path):
     return embeddings
 
 
-# @register_representation(ModalityType.TEXT)
+@register_representation(ModalityType.TEXT)
 class GloVe(UnimodalRepresentation):
-    def __init__(self, glove_path, output_file=None):
+    def __init__(self, output_file=None):
         super().__init__("GloVe", ModalityType.TEXT)
-        self.glove_path = glove_path
+        file_path = hf_hub_download(
+            repo_id="stanfordnlp/glove", filename="glove.6B.zip"
+        )
+        with zipfile.ZipFile(file_path, "r") as zip_ref:
+            zip_ref.extractall("./glove_extracted")
+
+        self.glove_path = "./glove_extracted/glove.6B.100d.txt"
         self.output_file = output_file
 
     def transform(self, modality):
@@ -67,6 +75,5 @@ class GloVe(UnimodalRepresentation):
         if self.output_file is not None:
             save_embeddings(np.array(embeddings), self.output_file)
 
-        transformed_modality.data_type = np.float32
         transformed_modality.data = np.array(embeddings)
         return transformed_modality
diff --git a/src/main/python/systemds/scuro/representations/text_context.py 
b/src/main/python/systemds/scuro/representations/text_context.py
new file mode 100644
index 0000000000..b98b90e187
--- /dev/null
+++ b/src/main/python/systemds/scuro/representations/text_context.py
@@ -0,0 +1,221 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+
+def _split_into_words(text: str) -> List[str]:
+    """Split text on whitespace runs; empty or non-string input yields []."""
+    if not text or not isinstance(text, str):
+        return []
+    return text.split()
+
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Split text into sentences using regex.
+    Handles common sentence endings: . ! ?
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+    sentences = re.split(sentence_pattern, text.strip())
+
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    if not sentences:
+        return [text]
+
+    return sentences
+
+
+def _count_words(text: str) -> int:
+    """
+    Count the number of words in a text string.
+    """
+    if not text or not isinstance(text, str):
+        return 0
+    return len(text.split())
+
+
+def _extract_text(instance: Any) -> str:
+    if isinstance(instance, str):
+        text = instance
+    else:
+        text = str(instance)
+
+    if not text or not text.strip():
+        return ""
+    return text
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplit(Context):
+    """
+    Splits text at sentence boundaries while respecting maximum word count.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        min_words (int): Minimum number of words per chunk before splitting (default: 10)
+    """
+
+    def __init__(self, max_words=55, min_words=10):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "min_words": [10, 20, 30],
+        }
+        super().__init__("SentenceBoundarySplit", parameters)
+        self.max_words = int(max_words)
+        self.min_words = max(1, int(min_words))
+
+    def execute(self, modality):
+        """
+        Split each text instance at sentence boundaries, respecting max_words.
+
+        Returns:
+            List of lists, where each inner list contains text chunks (strings)
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([""])
+                continue
+
+            sentences = _split_into_sentences(text)
+
+            if not sentences:
+                chunked_data.append([text])
+                continue
+
+            chunks = []
+            current_chunk = []
+            current_word_count = 0
+
+            for sentence in sentences:
+                sentence_word_count = _count_words(sentence)
+
+                if sentence_word_count > self.max_words:
+                    # Flush pending sentences first so chunks keep text order
+                    if current_chunk:
+                        chunks.append(" ".join(current_chunk))
+                        current_chunk = []
+                        current_word_count = 0
+                    words = _split_into_words(sentence)
+                    for i in range(0, len(words), self.max_words):
+                        chunk_words = words[i : i + self.max_words]
+                        chunks.append(" ".join(chunk_words))
+
+                elif current_word_count + sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(" ".join(current_chunk))
+                        current_chunk = [sentence]
+                        current_word_count = sentence_word_count
+                    else:
+                        current_chunk.append(sentence)
+                        current_word_count += sentence_word_count
+                else:
+                    current_chunk.append(sentence)
+                    current_word_count += sentence_word_count
+
+            # Add remaining chunk
+            if current_chunk:
+                chunks.append(" ".join(current_chunk))
+
+            if not chunks:
+                chunks = [text]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplit(Context):
+    """
+    Splits text with overlapping chunks using a sliding window approach.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (float): Fraction of max_words shared by consecutive chunks (default: 0.5)
+        stride (int, optional): Step size in words. If None, stride = max_words - overlap_words (at least 1)
+    """
+
+    def __init__(self, max_words=55, overlap=0.5, stride=None):
+        overlap_words = int(max_words * overlap)
+        if stride is None:
+            stride = max(1, max_words - overlap_words)  # range() rejects step 0
+
+        parameters = {
+            "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+            "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "stride": [10, 15, 20, 30],
+        }
+        super().__init__("OverlappingSplit", parameters)
+        self.max_words = max_words
+        self.overlap = overlap
+        self.stride = stride
+
+    def execute(self, modality):
+        """
+        Split each text instance with overlapping chunks.
+
+        Returns:
+            List of lists, where each inner list contains text chunks (strings)
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([""])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([text])
+                continue
+
+            chunks = []
+
+            # Create overlapping chunks with specified stride
+            for i in range(0, len(words), self.stride):
+                chunk_words = words[i : i + self.max_words]
+                if chunk_words:
+                    chunk_text = " ".join(chunk_words)
+                    chunks.append(chunk_text)
+
+                if i + self.max_words >= len(words):
+                    break
+
+            if not chunks:
+                chunks = [text]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
diff --git 
a/src/main/python/systemds/scuro/representations/text_context_with_indices.py 
b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
new file mode 100644
index 0000000000..cc7070306b
--- /dev/null
+++ 
b/src/main/python/systemds/scuro/representations/text_context_with_indices.py
@@ -0,0 +1,300 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import re
+from typing import List, Any
+
+from systemds.scuro.drsearch.operator_registry import register_context_operator
+from systemds.scuro.representations.context import Context
+from systemds.scuro.modality.type import ModalityType
+
+# TODO: Use this to get indices for text chunks based on different splitting strategies
+# To use this approach a different extraction of text chunks is needed in either the TextModality or the Representations
+
+
+def _split_into_words(text: str) -> List[str]:
+    """Split text on whitespace runs; empty or non-string input yields []."""
+    if not text or not isinstance(text, str):
+        return []
+    return text.split()
+
+
+def _split_into_sentences(text: str) -> List[str]:
+    """
+    Split text into sentences using regex.
+    Handles common sentence endings: . ! ?
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    sentence_pattern = r"(?<=[.!?])\s+(?=[A-Z])|(?<=[.!?])(?=\s*$)"
+    sentences = re.split(sentence_pattern, text.strip())
+
+    sentences = [s.strip() for s in sentences if s.strip()]
+
+    if not sentences:
+        return [text]
+
+    return sentences
+
+
+def _count_words(text: str) -> int:
+    """
+    Count the number of words in a text string.
+    """
+    if not text or not isinstance(text, str):
+        return 0
+    return len(text.split())
+
+
+def _extract_text(instance: Any) -> str:
+    if isinstance(instance, str):
+        text = instance
+    else:
+        text = str(instance)
+
+    if not text or not text.strip():
+        return ""
+    return text
+
+
+# @register_context_operator(ModalityType.TEXT)
+class WordCountSplitIndices(Context):
+    """
+    Splits text after a fixed number of words.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (int): Number of overlapping words between chunks (default: 0)
+    """
+
+    def __init__(self, max_words=55, overlap=0):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "overlap": [0, 10, 20, 30],
+        }
+        super().__init__("WordCountSplit", parameters)
+        self.max_words = int(max_words)
+        self.overlap = max(0, int(overlap))
+
+    def execute(self, modality):
+        """
+        Split each text instance into chunks of max_words words.
+
+        Returns:
+            List of lists of (start, end) offsets, one inner list per instance
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            stride = max(1, self.max_words - self.overlap)
+            # Offsets below index the single-space-joined word sequence
+            start = 0
+            for i in range(0, len(words), stride):
+                chunk_words = words[i : i + self.max_words]
+                chunk_text = " ".join(chunk_words)
+                chunks.append((start, start + len(chunk_text)))
+                start += sum(len(w) + 1 for w in words[i : i + stride])
+
+                if i + self.max_words >= len(words):
+                    break
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class SentenceBoundarySplitIndices(Context):
+    """
+    Splits text at sentence boundaries while respecting maximum word count.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        min_words (int): Minimum number of words per chunk before splitting (default: 10)
+    """
+
+    def __init__(self, max_words=55, min_words=10, overlap=0.1):
+        parameters = {
+            "max_words": [40, 50, 55, 60, 70, 250, 300, 350, 400, 450],
+            "min_words": [10, 20, 30],
+        }
+        super().__init__("SentenceBoundarySplit", parameters)
+        self.max_words = int(max_words)
+        self.min_words = max(1, int(min_words))
+        self.overlap = overlap
+        self.stride = max(1, int(max_words * (1 - overlap)))
+
+    def execute(self, modality):
+        """
+        Split each text instance at sentence boundaries, respecting max_words.
+
+        Returns:
+            List of lists of (start, end) offsets, one inner list per instance
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            sentences = _split_into_sentences(text)
+
+            if not sentences:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+            current_chunk = None
+            current_word_count = 0
+            start = 0
+            for sentence in sentences:
+                sentence_word_count = _count_words(sentence)
+
+                if sentence_word_count > self.max_words:
+                    # An oversized sentence is cut into max_words pieces, each
+                    # emitted as its own span. Pending sentences are flushed
+                    # first so the spans stay in text order.
+                    if current_chunk:
+                        chunks.append(current_chunk)
+                        current_chunk = None
+                        current_word_count = 0
+
+                    words = _split_into_words(sentence)
+                    for i in range(0, len(words), self.max_words):
+                        piece = " ".join(words[i : i + self.max_words])
+                        chunks.append((start, start + len(piece)))
+                        # +1 for the space that separates consecutive pieces
+                        start += len(piece) + 1
+
+                elif current_word_count + sentence_word_count > self.max_words:
+                    if current_chunk and current_word_count >= self.min_words:
+                        chunks.append(current_chunk)
+                        current_chunk = (start, start + len(sentence))
+                        start += len(sentence) + 1
+                        current_word_count = sentence_word_count
+                    else:
+                        current_chunk = (current_chunk[0], start + len(sentence))
+                        start += len(sentence) + 1
+                        current_word_count += sentence_word_count
+                else:
+                    current_chunk = (
+                        (start, start + len(sentence))
+                        if not current_chunk
+                        else (current_chunk[0], start + len(sentence))
+                    )
+                    start += len(sentence) + 1
+                    current_word_count += sentence_word_count
+
+            # Add remaining chunk
+            if current_chunk:
+                chunks.append(current_chunk)
+
+            if not chunks:
+                chunks = [(0, len(text))]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
+
+
+@register_context_operator(ModalityType.TEXT)
+class OverlappingSplitIndices(Context):
+    """
+    Splits text with overlapping chunks using a sliding window approach.
+
+    Parameters:
+        max_words (int): Maximum number of words per chunk (default: 55)
+        overlap (float): Fraction of max_words shared by consecutive chunks (default: 0.5)
+        stride (int, optional): Step size in words. If None, stride = max_words - overlap_words (at least 1)
+    """
+
+    def __init__(self, max_words=55, overlap=0.5, stride=None):
+        overlap_words = int(max_words * overlap)
+        if stride is None:
+            stride = max(1, max_words - overlap_words)  # range() rejects step 0
+
+        parameters = {
+            "max_words": [40, 55, 70, 250, 300, 350, 400, 450],
+            "overlap": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
+            "stride": [10, 15, 20, 30],
+        }
+        super().__init__("OverlappingSplit", parameters)
+        self.max_words = max_words
+        self.overlap = overlap
+        self.stride = stride
+
+    def execute(self, modality):
+        """
+        Split each text instance with overlapping chunks.
+
+        Returns:
+            List of lists of (start, end) offsets, one inner list per instance
+        """
+        chunked_data = []
+
+        for instance in modality.data:
+            text = _extract_text(instance)
+            if not text:
+                chunked_data.append([(0, 0)])
+                continue
+
+            words = _split_into_words(text)
+
+            if len(words) <= self.max_words:
+                chunked_data.append([(0, len(text))])
+                continue
+
+            chunks = []
+
+            # Create overlapping chunks with specified stride
+            start = 0
+            for i in range(0, len(words), self.stride):
+                chunk_words = words[i : i + self.max_words]
+                if chunk_words:
+                    chunk_text = " ".join(chunk_words)
+                    chunks.append((start, start + len(chunk_text)))
+                    # Advance to the next window's first word: every word
+                    # stepped over by the stride plus one space each.
+                    start += sum(len(w) + 1 for w in words[i : i + self.stride])
+                if i + self.max_words >= len(words):
+                    break
+
+            if not chunks:
+                chunks = [(0, len(text))]
+
+            chunked_data.append(chunks)
+
+        return chunked_data
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py 
b/src/main/python/systemds/scuro/representations/unimodal.py
index 362888aa27..2bb34733e2 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -38,6 +38,8 @@ class UnimodalRepresentation(Representation):
         if parameters is None:
             parameters = {}
         self.self_contained = self_contained
+        self.needs_context = False
+        self.initial_context_length = None
 
     @abc.abstractmethod
     def transform(self, data):
diff --git 
a/src/main/python/systemds/scuro/representations/window_aggregation.py 
b/src/main/python/systemds/scuro/representations/window_aggregation.py
index f40b28ea87..4d4ec19c5b 100644
--- a/src/main/python/systemds/scuro/representations/window_aggregation.py
+++ b/src/main/python/systemds/scuro/representations/window_aggregation.py
@@ -59,11 +59,11 @@ class Window(Context):
             self._aggregation_function = Aggregation(value)
 
 
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class WindowAggregation(Window):
     def __init__(self, aggregation_function="mean", window_size=10, pad=False):
         super().__init__("WindowAggregation", aggregation_function)
-        self.parameters["window_size"] = [window_size]
+        self.parameters["window_size"] = [5, 10, 15, 25, 50, 100]
         self.window_size = int(window_size)
         self.pad = pad
 
@@ -167,7 +167,7 @@ class WindowAggregation(Window):
         return np.array(result)
 
 
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class StaticWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("StaticWindow", aggregation_function)
@@ -198,7 +198,7 @@ class StaticWindow(Window):
         return np.array(windowed_data)
 
 
-@register_context_operator()
+@register_context_operator([ModalityType.TIMESERIES, ModalityType.AUDIO])
 class DynamicWindow(Window):
     def __init__(self, aggregation_function="mean", num_windows=100):
         super().__init__("DynamicWindow", aggregation_function)
diff --git a/src/main/python/tests/scuro/data_generator.py 
b/src/main/python/tests/scuro/data_generator.py
index 5bec163fe7..9da0aa82c0 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -26,6 +26,11 @@ from scipy.io.wavfile import write
 import random
 import os
 
+from sklearn import svm
+from sklearn.metrics import classification_report
+from sklearn.model_selection import train_test_split
+
+from systemds.scuro.models.model import Model
 from systemds.scuro.dataloader.base_loader import BaseLoader
 from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.dataloader.audio_loader import AudioLoader
@@ -33,6 +38,7 @@ from systemds.scuro.dataloader.text_loader import TextLoader
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.modality.transformed import TransformedModality
 from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.drsearch.task import Task
 
 
 class TestDataLoader(BaseLoader):
@@ -130,7 +136,7 @@ class ModalityRandomDataGenerator:
         }
         return data, metadata
 
-    def create_text_data(self, num_instances):
+    def create_text_data(self, num_instances, num_sentences_per_instance=1):
         subjects = [
             "The cat",
             "A dog",
@@ -172,18 +178,24 @@ class ModalityRandomDataGenerator:
             "precisely",
             "methodically",
         ]
+        punctuation = [".", "?", "!"]
 
         sentences = []
         for _ in range(num_instances):
-            include_adverb = np.random.random() < 0.7
-
-            subject = np.random.choice(subjects)
-            verb = np.random.choice(verbs)
-            obj = np.random.choice(objects)
-            adverb = np.random.choice(adverbs) if include_adverb else ""
-
-            sentence = f"{subject} {adverb} {verb} {obj}"
-
+            sentence = ""
+            for i in range(num_sentences_per_instance):
+                include_adverb = np.random.random() < 0.7
+
+                subject = np.random.choice(subjects)
+                verb = np.random.choice(verbs)
+                obj = np.random.choice(objects)
+                adverb = np.random.choice(adverbs) if include_adverb else ""
+                punct = np.random.choice(punctuation)
+
+                sentence += " " if i > 0 else ""
+                sentence += f"{subject}"
+                sentence += f" {adverb}" if include_adverb else ""
+                sentence += f" {verb} {obj}{punct}"
             sentences.append(sentence)
 
         metadata = {
@@ -382,3 +394,57 @@ class TestDataGenerator:
         audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)
 
         write(path, sample_rate, audio_data)
+
+
+class TestSVM(Model):
+    def __init__(self, name):
+        super().__init__(name)
+
+    def fit(self, X, y, X_test, y_test):
+        if X.ndim > 2:
+            X = X.reshape(X.shape[0], -1)
+        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
+        self.clf = self.clf.fit(X, np.array(y))
+        y_pred = self.clf.predict(X)
+
+        return {
+            "accuracy": classification_report(
+                y, y_pred, output_dict=True, digits=3, zero_division=1
+            )["accuracy"]
+        }, 0
+
+    def test(self, test_X: np.ndarray, test_y: np.ndarray):
+        if test_X.ndim > 2:
+            test_X = test_X.reshape(test_X.shape[0], -1)
+        y_pred = self.clf.predict(np.array(test_X))  # noqa]
+
+        return {
+            "accuracy": classification_report(
+                np.array(test_y), y_pred, output_dict=True, digits=3, 
zero_division=1
+            )["accuracy"]
+        }, 0
+
+
+class TestTask(Task):
+    def __init__(self, name, model_name, num_instances):
+        self.labels = ModalityRandomDataGenerator().create_balanced_labels(
+            num_instances=num_instances  # must match the index range split below
+        )
+        split = train_test_split(
+            np.array(range(num_instances)),
+            self.labels,
+            test_size=0.2,
+            random_state=42,
+            stratify=self.labels,
+        )
+        self.train_indizes, self.val_indizes = [int(i) for i in split[0]], [
+            int(i) for i in split[1]
+        ]
+
+        super().__init__(
+            name,
+            TestSVM(model_name),
+            self.labels,
+            self.train_indizes,
+            self.val_indizes,
+        )
diff --git a/src/main/python/tests/scuro/test_hp_tuner.py 
b/src/main/python/tests/scuro/test_hp_tuner.py
index 8484a352e4..f163498dab 100644
--- a/src/main/python/tests/scuro/test_hp_tuner.py
+++ b/src/main/python/tests/scuro/test_hp_tuner.py
@@ -22,17 +22,12 @@
 import unittest
 
 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
 
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 
 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -45,70 +40,15 @@ from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator, 
TestDataLoader
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
 
 from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.drsearch.hyperparameter_tuner import HyperparameterTuner
 
-
-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa]
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, 
zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestSVM2(Model):
-    def __init__(self):
-        super().__init__("TestSVM2")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, 
zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 from unittest.mock import patch
 
 
@@ -120,36 +60,10 @@ class TestHPTuner(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
         cls.indices = np.array(range(cls.num_instances))
-
-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
-
         cls.tasks = [
-            Task(
-                "UnimodalRepresentationTask1",
-                TestSVM(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-            Task(
-                "UnimodalRepresentationTask2",
-                TestSVM2(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
+            TestTask("UnimodalRepresentationTask1", "TestSVM1", 
cls.num_instances),
+            TestTask("UnimodalRepresentationTask2", "TestSVM2", 
cls.num_instances),
         ]
 
     def test_hp_tuner_for_audio_modality(self):
diff --git a/src/main/python/tests/scuro/test_multimodal_fusion.py 
b/src/main/python/tests/scuro/test_multimodal_fusion.py
index f98824a16a..e89843afcd 100644
--- a/src/main/python/tests/scuro/test_multimodal_fusion.py
+++ b/src/main/python/tests/scuro/test_multimodal_fusion.py
@@ -22,8 +22,6 @@
 import unittest
 
 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
 from sklearn.model_selection import train_test_split
 
 from systemds.scuro.drsearch.multimodal_optimizer import MultimodalOptimizer
@@ -32,7 +30,6 @@ from systemds.scuro.representations.concatenation import 
Concatenation
 from systemds.scuro.representations.lstm import LSTM
 from systemds.scuro.representations.average import Average
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
 from systemds.scuro.drsearch.task import Task
 
 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -43,70 +40,13 @@ from systemds.scuro.representations.timeseries_representations import Min, Max
 from tests.scuro.data_generator import (
     TestDataLoader,
     ModalityRandomDataGenerator,
+    TestTask,
 )
 
 from systemds.scuro.modality.type import ModalityType
 from unittest.mock import patch
 
 
-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestCNN(Model):
-    def __init__(self):
-        super().__init__("TestCNN")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 class TestMultimodalRepresentationOptimizer(unittest.TestCase):
     test_file_path = None
     data_generator = None
@@ -116,30 +56,10 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
         cls.indices = np.array(range(cls.num_instances))
 
-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-            stratify=cls.labels,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
-
     def test_multimodal_fusion(self):
-        task = Task(
-            "MM_Fusion_Task1",
-            TestSVM(),
-            self.labels,
-            self.train_indizes,
-            self.val_indizes,
-        )
+        task = TestTask("MM_Fusion_Task1", "Test1", self.num_instances)
 
         audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
             self.num_instances, 1000
@@ -199,13 +119,7 @@ class TestMultimodalRepresentationOptimizer(unittest.TestCase):
             )
 
     def test_parallel_multimodal_fusion(self):
-        task = Task(
-            "MM_Fusion_Task1",
-            TestSVM(),
-            self.labels,
-            self.train_indizes,
-            self.val_indizes,
-        )
+        task = TestTask("MM_Fusion_Task1", "Test2", self.num_instances)
 
         audio_data, audio_md = ModalityRandomDataGenerator().create_audio_data(
             self.num_instances, 1000
diff --git a/src/main/python/tests/scuro/test_operator_registry.py b/src/main/python/tests/scuro/test_operator_registry.py
index c33eb5fcc2..2edada0739 100644
--- a/src/main/python/tests/scuro/test_operator_registry.py
+++ b/src/main/python/tests/scuro/test_operator_registry.py
@@ -21,7 +21,14 @@
 
 import unittest
 
-from systemds.scuro import FrequencyMagnitude
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
 from systemds.scuro.representations.covarep_audio_features import (
     ZeroCrossing,
     Spectral,
@@ -124,11 +131,17 @@ class TestOperatorRegistry(unittest.TestCase):
 
     def test_context_operator_in_registry(self):
         registry = Registry()
-        assert registry.get_context_operators() == [
+        assert registry.get_context_operators(ModalityType.TIMESERIES) == [
             WindowAggregation,
             StaticWindow,
             DynamicWindow,
         ]
+        assert registry.get_context_operators(ModalityType.TEXT) == [
+            SentenceBoundarySplit,
+            OverlappingSplit,
+            SentenceBoundarySplitIndices,
+            OverlappingSplitIndices,
+        ]
 
     # def test_fusion_operator_in_registry(self):
     #     registry = Registry()
diff --git a/src/main/python/tests/scuro/test_text_context_operators.py b/src/main/python/tests/scuro/test_text_context_operators.py
new file mode 100644
index 0000000000..1f04165407
--- /dev/null
+++ b/src/main/python/tests/scuro/test_text_context_operators.py
@@ -0,0 +1,113 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+
+import unittest
+from systemds.scuro.representations.text_context import (
+    SentenceBoundarySplit,
+    OverlappingSplit,
+)
+from systemds.scuro.representations.text_context_with_indices import (
+    SentenceBoundarySplitIndices,
+    OverlappingSplitIndices,
+)
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TestTextContextOperator(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.data_generator = ModalityRandomDataGenerator()
+        cls.data, cls.md = cls.data_generator.create_text_data(10, 50)
+        cls.text_modality = UnimodalModality(
+            TestDataLoader(
+                [i for i in range(0, 10)],
+                None,
+                ModalityType.TEXT,
+                cls.data,
+                str,
+                cls.md,
+            )
+        )
+        cls.text_modality.extract_raw_data()
+        cls.task = TestTask("TextContextTask", "Test1", 10)
+
+    def test_sentence_boundary_split(self):
+        sentence_boundary_split = SentenceBoundarySplit(10, min_words=4)
+        chunks = sentence_boundary_split.execute(self.text_modality)
+        for i in range(0, len(chunks)):
+            for chunk in chunks[i]:
+                assert len(chunk.split(" ")) <= 10 and (
+                    chunk[-1] == "." or chunk[-1] == "!" or chunk[-1] == "?"
+                )
+
+    def test_overlapping_split(self):
+        overlapping_split = OverlappingSplit(40, 0.05)
+        chunks = overlapping_split.execute(self.text_modality)
+        for i in range(len(chunks)):
+            prev_chunk = ""
+            for j, chunk in enumerate(chunks[i]):
+                if j > 0:
+                    prev_words = prev_chunk.split(" ")
+                    curr_words = chunk.split(" ")
+                    assert prev_words[-2:] == curr_words[:2]
+                prev_chunk = chunk
+                assert len(chunk.split(" ")) <= 40
+
+    def test_sentence_boundary_split_indices(self):
+        sentence_boundary_split = SentenceBoundarySplitIndices(10, min_words=4)
+        chunks = sentence_boundary_split.execute(self.text_modality)
+        for i in range(0, len(chunks)):
+            for chunk in chunks[i]:
+                text = self.text_modality.data[i][chunk[0] : chunk[1]].split(" ")
+                assert len(text) <= 10 and (
+                    text[-1][-1] == "." or text[-1][-1] == "!" or text[-1][-1] == "?"
+                )
+
+    def test_overlapping_split_indices(self):
+        overlapping_split = OverlappingSplitIndices(40, 0.1)
+        chunks = overlapping_split.execute(self.text_modality)
+        for i in range(len(chunks)):
+            prev_chunk = (0, 0)
+            for j, chunk in enumerate(chunks[i]):
+                if j > 0:
+                    prev_words = self.text_modality.data[i][
+                        prev_chunk[0] : prev_chunk[1]
+                    ].split(" ")
+                    curr_words = self.text_modality.data[i][chunk[0] : chunk[1]].split(
+                        " "
+                    )
+                    assert prev_words[-4:] == curr_words[:4]
+                prev_chunk = chunk
+                assert (
+                    len(self.text_modality.data[i][chunk[0] : chunk[1]].split(" "))
+                    <= 40
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/src/main/python/tests/scuro/test_unimodal_optimizer.py b/src/main/python/tests/scuro/test_unimodal_optimizer.py
index ca54ee64b1..0d8ae90177 100644
--- a/src/main/python/tests/scuro/test_unimodal_optimizer.py
+++ b/src/main/python/tests/scuro/test_unimodal_optimizer.py
@@ -23,17 +23,11 @@
 import unittest
 
 import numpy as np
-from sklearn import svm
-from sklearn.metrics import classification_report
-from sklearn.model_selection import train_test_split
-
 from systemds.scuro.representations.timeseries_representations import (
     Mean,
     ACF,
 )
 from systemds.scuro.drsearch.operator_registry import Registry
-from systemds.scuro.models.model import Model
-from systemds.scuro.drsearch.task import Task
 from systemds.scuro.drsearch.unimodal_optimizer import UnimodalOptimizer
 
 from systemds.scuro.representations.spectrogram import Spectrogram
@@ -44,69 +38,14 @@ from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.representations.bow import BoW
 from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.resnet import ResNet
-from tests.scuro.data_generator import ModalityRandomDataGenerator, TestDataLoader
+from tests.scuro.data_generator import (
+    ModalityRandomDataGenerator,
+    TestDataLoader,
+    TestTask,
+)
 
 from systemds.scuro.modality.type import ModalityType
 
-
-class TestSVM(Model):
-    def __init__(self):
-        super().__init__("TestSVM")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
-class TestCNN(Model):
-    def __init__(self):
-        super().__init__("TestCNN")
-
-    def fit(self, X, y, X_test, y_test):
-        if X.ndim > 2:
-            X = X.reshape(X.shape[0], -1)
-        self.clf = svm.SVC(C=1, gamma="scale", kernel="rbf", verbose=False)
-        self.clf = self.clf.fit(X, np.array(y))
-        y_pred = self.clf.predict(X)
-
-        return {
-            "accuracy": classification_report(
-                y, y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-    def test(self, test_X: np.ndarray, test_y: np.ndarray):
-        if test_X.ndim > 2:
-            test_X = test_X.reshape(test_X.shape[0], -1)
-        y_pred = self.clf.predict(np.array(test_X))  # noqa
-
-        return {
-            "accuracy": classification_report(
-                np.array(test_y), y_pred, output_dict=True, digits=3, zero_division=1
-            )["accuracy"]
-        }, 0
-
-
 from unittest.mock import patch
 
 
@@ -118,36 +57,12 @@ class TestUnimodalRepresentationOptimizer(unittest.TestCase):
     def setUpClass(cls):
         cls.num_instances = 10
         cls.mods = [ModalityType.VIDEO, ModalityType.AUDIO, ModalityType.TEXT]
-        cls.labels = ModalityRandomDataGenerator().create_balanced_labels(
-            num_instances=cls.num_instances
-        )
-        cls.indices = np.array(range(cls.num_instances))
 
-        split = train_test_split(
-            cls.indices,
-            cls.labels,
-            test_size=0.2,
-            random_state=42,
-        )
-        cls.train_indizes, cls.val_indizes = [int(i) for i in split[0]], [
-            int(i) for i in split[1]
-        ]
+        cls.indices = np.array(range(cls.num_instances))
 
         cls.tasks = [
-            Task(
-                "UnimodalRepresentationTask1",
-                TestSVM(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
-            Task(
-                "UnimodalRepresentationTask2",
-                TestCNN(),
-                cls.labels,
-                cls.train_indizes,
-                cls.val_indizes,
-            ),
+            TestTask("UnimodalRepresentationTask1", "Test1", cls.num_instances),
+            TestTask("UnimodalRepresentationTask2", "Test2", cls.num_instances),
         ]
 
     def test_unimodal_optimizer_for_audio_modality(self):

Reply via email to