discivigour commented on code in PR #6987:
URL: https://github.com/apache/paimon/pull/6987#discussion_r2675746766


##########
paimon-python/pypaimon/read/datasource.py:
##########
@@ -216,13 +219,146 @@ def _get_read_task(
                 'read_fn': read_fn,
                 'metadata': metadata,
             }
-            
+
             if parse(ray.__version__) >= parse(RAY_VERSION_SCHEMA_IN_READ_TASK):
                 read_task_kwargs['schema'] = schema
-            
+
             if parse(ray.__version__) >= parse(RAY_VERSION_PER_TASK_ROW_LIMIT) and per_task_row_limit is not None:
                 read_task_kwargs['per_task_row_limit'] = per_task_row_limit
 
             read_tasks.append(ReadTask(**read_task_kwargs))
 
         return read_tasks
+
+
+class TorchDataset(Dataset):
+    """
+    PyTorch Dataset implementation for reading Paimon table data.
+
+    This class enables Paimon table data to be used directly with PyTorch's
+    training pipeline, allowing for efficient data loading and batching.
+    """
+
+    def __init__(self, table_read: TableRead, splits: List[Split]):
+        """
+        Initialize TorchDataset.
+
+        Args:
+            table_read: TableRead instance for reading data
+            splits: List of splits to read
+        """
+
+        self.table_read = table_read
+        self.splits = splits
+        self._data = self._load_data()
+
+    def __len__(self) -> int:
+        """
+        Return the total number of rows in the dataset.
+
+        Returns:
+            Total number of rows across all splits
+        """
+        if self._data is not None:
+            return len(self._data)
+        else:
+            return 0
+
+    def __getitem__(self, index: int):
+        """
+        Get a single item from the dataset.
+
+        Args:
+            index: Index of the item to retrieve
+
+        Returns:
+            Dictionary containing the row data
+        """
+        if not self._data:
+            return None
+
+        return self._data[index]
+
+    def _load_data(self):

Review Comment:
   👌



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to