damccorm commented on code in PR #29564:
URL: https://github.com/apache/beam/pull/29564#discussion_r1422565256
##########
sdks/python/apache_beam/ml/transforms/base.py:
##########
@@ -76,33 +148,52 @@ def __call__(self, data: OperationInputT,
transformed_data = self.apply_transform(data, output_column_name)
return transformed_data
- def get_counter(self):
- """
- Returns the counter name for the operation.
- """
- counter_name = self.__class__.__name__
- return Metrics.counter(MLTransform, f'BeamML_{counter_name}')
-
-class ProcessHandler(Generic[ExampleT, MLTransformOutputT], abc.ABC):
+class ProcessHandler(beam.PTransform[beam.PCollection[ExampleT],
+ beam.PCollection[MLTransformOutputT]],
+ abc.ABC):
"""
Only for internal use. No backwards compatibility guarantees.
"""
@abc.abstractmethod
- def process_data(
- self, pcoll: beam.PCollection[ExampleT]
- ) -> beam.PCollection[MLTransformOutputT]:
+ def append_transform(self, transform: BaseOperation):
"""
- Logic to process the data. This will be the entrypoint in
- beam.MLTransform to process incoming data.
+ Append transforms to the ProcessHandler.
"""
+
+# TODO: Add support for inference_fn
Review Comment:
Please link to issue
##########
sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py:
##########
@@ -0,0 +1,153 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Vertex AI Python SDK is required for this module.
+# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long
+# to install Vertex AI Python SDK.
+
+from typing import Any
+from typing import Dict
+from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Sequence
+
+from google.auth.credentials import Credentials
+
+import apache_beam as beam
+import vertexai
+from apache_beam.ml.inference.base import ModelHandler
+from apache_beam.ml.inference.base import RunInference
+from apache_beam.ml.transforms.base import EmbeddingsManager
+from apache_beam.ml.transforms.base import _TextEmbeddingHandler
+from vertexai.language_models import TextEmbeddingInput
+from vertexai.language_models import TextEmbeddingModel
+
+__all__ = ["VertexAITextEmbeddings"]
+
+DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT"
+# TODO: Can this list be automatically pulled from Vertex SDK?
Review Comment:
Please link to issue
##########
sdks/python/apache_beam/ml/transforms/base.py:
##########
@@ -254,3 +353,262 @@ def _increment_counters():
pipeline
| beam.Create([None])
| beam.Map(lambda _: _increment_counters()))
+
+
+class _TransformAttributeManager:
+ """
+ Base class used for saving and loading the attributes.
+ """
+ @staticmethod
+ def save_attributes(artifact_location):
+ """
+ Save the attributes to json file using stdlib json.
+ """
+ raise NotImplementedError
+
+ @staticmethod
+ def load_attributes(artifact_location):
+ """
+ Load the attributes from json file.
+ """
+ raise NotImplementedError
+
+
+class _JsonPickleTransformAttributeManager(_TransformAttributeManager):
+ """
+ Use Jsonpickle to save and load the attributes. Here the attributes refer
+ to the list of PTransforms that are used to process the data.
+
+ jsonpickle is used to serialize the PTransforms and save it to a json file and
+ is compatible across python versions.
+ """
+ @staticmethod
+ def _is_remote_path(path):
+ is_gcs = path.find('gs://') != -1
+ # TODO: Add support for other remote paths.
Review Comment:
Please link to issue
##########
sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py:
##########
@@ -0,0 +1,152 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Vertex AI Python SDK is required for this module.
+# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long
+# to install Vertex AI Python SDK.
+
+from typing import Any
+from typing import Dict
+from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Sequence
+
+from google.auth.credentials import Credentials
+
+import apache_beam as beam
+import vertexai
+from apache_beam.ml.inference.base import ModelHandler
+from apache_beam.ml.inference.base import RunInference
+from apache_beam.ml.transforms.base import EmbeddingsManager
+from apache_beam.ml.transforms.base import _TextEmbeddingHandler
+from vertexai.language_models import TextEmbeddingInput
+from vertexai.language_models import TextEmbeddingModel
+
+__all__ = ["VertexAITextEmbeddings"]
+
+DEFAULT_TASK_TYPE = "RETRIEVAL_DOCUMENT"
+# TODO: Can this list be automatically pulled from Vertex SDK?
+TASK_TYPE_INPUTS = [
+ "RETRIEVAL_DOCUMENT",
+ "RETRIEVAL_QUERY",
+ "SEMANTIC_SIMILARITY",
+ "CLASSIFICATION",
+ "CLUSTERING"
+]
+_BATCH_SIZE = 5 # Vertex AI limits requests to 5 at a time.
+
+
+class _VertexAITextEmbeddingHandler(ModelHandler):
+ """
+ Note: Intended for internal use and guarantees no backwards compatibility.
+ """
+ def __init__(
+ self,
+ model_name: str,
+ title: Optional[str] = None,
+ task_type: str = DEFAULT_TASK_TYPE,
+ project: Optional[str] = None,
+ location: Optional[str] = None,
+ credentials: Optional[Credentials] = None,
+ ):
+ vertexai.init(project=project, location=location, credentials=credentials)
+ self.model_name = model_name
+ if task_type not in TASK_TYPE_INPUTS:
+ raise ValueError(
+ f"task_type must be one of {TASK_TYPE_INPUTS}, got {task_type}")
+ self.task_type = task_type
+ self.title = title
+
+ def run_inference(
+ self,
+ batch: Sequence[str],
+ model: Any,
+ inference_args: Optional[Dict[str, Any]] = None,
+ ) -> Iterable:
+ embeddings = []
+ batch_size = _BATCH_SIZE
+ for i in range(0, len(batch), batch_size):
+ text_batch = batch[i:i + batch_size]
+ text_batch = [
+ TextEmbeddingInput(
+ text=text, title=self.title, task_type=self.task_type)
+ for text in text_batch
+ ]
+ embeddings_batch = model.get_embeddings(text_batch)
+ embeddings.extend([el.values for el in embeddings_batch])
+ return embeddings
+
+ def load_model(self):
+ model = TextEmbeddingModel.from_pretrained(self.model_name)
+ return model
+
+
+class VertexAITextEmbeddings(EmbeddingsManager):
+ def __init__(
+ self,
+ model_name: str,
+ columns: List[str],
+ title: Optional[str] = None,
+ task_type: str = DEFAULT_TASK_TYPE,
+ project: Optional[str] = None,
+ location: Optional[str] = None,
+ credentials: Optional[Credentials] = None,
+ **kwargs):
+ """
+ Embedding Config for Vertex AI Text Embedding models following
+ https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings # pylint: disable=line-too-long
+ Text Embeddings are generated for a batch of text using the Vertex AI SDK.
+ Embeddings are returned in a list for each text in the batch. Look at
+ https://cloud.google.com/vertex-ai/docs/generative-ai/learn/model-versioning#stable-versions-available.md # pylint: disable=line-too-long
+ for more information on model versions and lifecycle.
+
+ Args:
+ model_name: The name of the Vertex AI Text Embedding model.
+ columns: The columns containing the text to be embedded.
+ task_type: The downstream task for the embeddings.
+ Valid values are RETRIEVAL_QUERY, RETRIEVAL_DOCUMENT,
+ SEMANTIC_SIMILARITY, CLASSIFICATION, CLUSTERING.
Review Comment:
SGTM
##########
sdks/python/apache_beam/ml/transforms/embeddings/vertex_ai.py:
##########
@@ -0,0 +1,158 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Vertex AI Python SDK is required for this module.
+# Follow https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk # pylint: disable=line-too-long
+# to install Vertex AI Python SDK.
+
+from typing import Any
+from typing import Dict
+from typing import Iterable
+from typing import List
+from typing import Optional
+from typing import Sequence
+
+from google.auth.credentials import Credentials
+
+import apache_beam as beam
+import vertexai
+from apache_beam.ml.inference.base import ModelHandler
+from apache_beam.ml.inference.base import RunInference
+from apache_beam.ml.transforms.base import EmbeddingsManager
+from apache_beam.ml.transforms.base import _TextEmbeddingHandler
+from vertexai.language_models import TextEmbeddingInput
+from vertexai.language_models import TextEmbeddingModel
+
+__all__ = ["VertexAITextEmbeddings"]
+
+TASK_TYPE = "RETRIEVAL_DOCUMENT"
+TASK_TYPE_INPUTS = [
+ "RETRIEVAL_DOCUMENT",
+ "RETRIEVAL_QUERY",
+ "SEMANTIC_SIMILARITY",
+ "CLASSIFICATION",
+ "CLUSTERING"
+]
+
+
+class _VertexAITextEmbeddingHandler(ModelHandler):
+ """
+ Note: Intended for internal use and guarantees no backwards compatibility.
+ """
+ def __init__(
+ self,
+ model_name: str,
+ title: Optional[str] = None,
+ task_type: str = TASK_TYPE,
+ project: Optional[str] = None,
+ location: Optional[str] = None,
+ credentials: Optional[Credentials] = None,
+ ):
+ vertexai.init(project=project, location=location, credentials=credentials)
+ self.model_name = model_name
+ if task_type not in TASK_TYPE_INPUTS:
+ raise ValueError(
+ f"task_type must be one of {TASK_TYPE_INPUTS}, got {task_type}")
Review Comment:
Sounds good - we can defer to a future PR here as well
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]