Re: [PR] Support Embeddings in mltransform [beam]

via GitHub Fri, 08 Dec 2023 12:30:18 -0800


AnandInguva commented on code in PR #29564:
URL: https://github.com/apache/beam/pull/29564#discussion_r1420909045



##########
sdks/python/apache_beam/ml/transforms/tft.py:
##########
@@ -95,6 +96,24 @@ def __init__(self, columns: List[str]) -> None:
           "Columns are not specified. Please specify the column for the "
           " op %s" % self.__class__.__name__)
 
+  def get_ptransform_for_processing(self, **kwargs) -> beam.PTransform:
+    from apache_beam.ml.transforms.handlers import TFTProcessHandler
+    params = {}
+    artifact_location = kwargs.get('artifact_location')
+    if not artifact_location:
+      raise RuntimeError(
+          "artifact_location is not specified. Please specify the "
+          "artifact_location for the op %s" % self.__class__.__name__)
+
+    transforms = kwargs.get('transforms')
+    if transforms:
+      params['transforms'] = transforms

Review Comment:
   Yes, we are not passing it right now. We can add it later if we need it.
Thanks for catching that.



##########
sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py:
##########
@@ -0,0 +1,152 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Vertex AI Python SDK is required for this module.
+# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long
+# to install Vertex AI Python SDK.
+
+from typing import Any
+from typing import Dict
+from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Sequence
+
+from google.auth.credentials import Credentials
+
+import apache_beam as beam
+import vertexai
+from apache_beam.ml.inference.base import ModelHandler
+from apache_beam.ml.inference.base import RunInference
+from apache_beam.ml.transforms.base import EmbeddingsManager
+from apache_beam.ml.transforms.base import _TextEmbeddingHandler
+from vertexai.language_models import TextEmbeddingInput
+from vertexai.language_models import TextEmbeddingModel
+
+__all__ = ["VertexAITextEmbeddings"]
+
+DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT"
+# TODO: Can this list be automatically pulled from Vertex SDK?
+TASK_TYPE_INPUTS = [
+    "RETRIEVAL_DOCUMENT",
+    "RETRIEVAL_QUERY",
+    "SEMANTIC_SIMILARITY",
+    "CLASSIFICATION",
+    "CLUSTERING"
+]
+_BATCH_SIZE = 5  # Vertex AI limits requests to 5 at a time.
+
+
+class _VertexAITextEmbeddingHandler(ModelHandler):
+  """
+  Note: Intended for internal use and guarantees no backwards compatibility.
+  """
+  def __init__(
+      self,
+      model_name: str,
+      title: Optional[str] = None,
+      task_type: str = DEFAULT_TASK_TYPE,
+      project: Optional[str] = None,
+      location: Optional[str] = None,
+      credentials: Optional[Credentials] = None,
+  ):
+    vertexai.init(project=project, location=location, credentials=credentials)
+    self.model_name = model_name
+    if task_type not in TASK_TYPE_INPUTS:
+      raise ValueError(
+          f"task_type must be one of {TASK_TYPE_INPUTS}, got {task_type}")
+    self.task_type = task_type
+    self.title = title
+
+  def run_inference(
+      self,
+      batch: Sequence[str],
+      model: Any,
+      inference_args: Optional[Dict[str, Any]] = None,
+  ) -> Iterable:
+    embeddings = []
+    batch_size = _BATCH_SIZE
+    for i in range(0, len(batch), batch_size):
+      text_batch = batch[i:i + batch_size]
+      text_batch = [
+          TextEmbeddingInput(
+              text=text, title=self.title, task_type=self.task_type)
+          for text in text_batch
+      ]
+      embeddings_batch = model.get_embeddings(text_batch)
+      embeddings.extend([el.values for el in embeddings_batch])
+    return embeddings
+
+  def load_model(self):
+    model = TextEmbeddingModel.from_pretrained(self.model_name)
+    return model
+
+
+class VertexAITextEmbeddings(EmbeddingsManager):
+  def __init__(
+      self,
+      model_name: str,
+      columns: List[str],
+      title: Optional[str] = None,
+      task_type: str = DEFAULT_TASK_TYPE,
+      project: Optional[str] = None,
+      location: Optional[str] = None,
+      credentials: Optional[Credentials] = None,
+      **kwargs):
+    """
+    Embedding Config for Vertex AI Text Embedding models following
+    https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long
+    Text Embeddings are generated for a batch of text using the Vertex AI SDK.
+    Embeddings are returned in a list for each text in the batch. Look at
+    https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long
+    for more information on model versions and lifecycle.
+
+    Args:
+      model_name: The name of the Vertex AI Text Embedding model.
+      columns: The columns containing the text to be embedded.
+      task_type: The downstream task for the embeddings.
+        Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT,
+        SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING.

Review Comment:
   Added the link



##########
sdks/python/apache_beam/ml/transforms/base.py:
##########
@@ -76,33 +148,52 @@ def __call__(self, data: OperationInputT,
     transformed_data = self.apply_transform(data, output_column_name)
     return transformed_data
 
-  def get_counter(self):
-    """
-    Returns the counter name for the operation.
-    """
-    counter_name = self.__class__.__name__
-    return Metrics.counter(MLTransform, f'BeamML_{counter_name}')
-
 
-class ProcessHandler(Generic[ExampleT, MLTransformOutputT], abc.ABC):
+class ProcessHandler(beam.PTransform[beam.PCollection[ExampleT],
+                                     beam.PCollection[MLTransformOutputT]],
+                     abc.ABC):
   """
   Only for internal use. No backwards compatibility guarantees.
   """
   @abc.abstractmethod
-  def process_data(
-      self, pcoll: beam.PCollection[ExampleT]
-  ) -> beam.PCollection[MLTransformOutputT]:
+  def append_transform(self, transform: BaseOperation):
     """
-    Logic to process the data. This will be the entrypoint in
-    beam.MLTransform to process incoming data.
+    Append transforms to the ProcessHandler.
     """
 
+
+# TODO: Add support for inference_fn
+class EmbeddingsManager(MLTransformProvider):
+  def __init__(
+      self,
+      columns: List[str],
+      *,
+      # common args for all ModelHandlers.
+      load_model_args: Optional[Dict[str, Any]] = None,
+      min_batch_size: Optional[int] = None,
+      max_batch_size: Optional[int] = None,
+      large_model: bool = False,
+      **kwargs):
+    self.load_model_args = load_model_args or {}
+    self.min_batch_size = min_batch_size
+    self.max_batch_size = max_batch_size
+    self.large_model = large_model
+    self.columns = columns
+    self.inference_args = kwargs.pop('inference_args', {})
+
+    if kwargs:
+      _LOGGER.warning("Ignoring the following arguments: %s", kwargs.keys())
+
+  # TODO: Add set_model_handler method.

Review Comment:
   Done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Support Embeddings in mltransform [beam]

Reply via email to