robertwb commented on code in PR #29834:
URL: https://github.com/apache/beam/pull/29834#discussion_r1465728062

##########
sdks/python/gen_xlang_wrappers.py:
##########
@@ -0,0 +1,420 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Generates Python wrappers for external transforms (specifically,
+SchemaTransforms)
+"""
+
+import argparse
+import datetime
+import logging
+import os
+import re
+import typing
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Union
+import subprocess
+
+import yaml
+from jinja2 import Environment
+from jinja2 import FileSystemLoader
+
+from gen_protos import LICENSE_HEADER
+from gen_protos import PYTHON_SDK_ROOT
+from gen_protos import find_by_ext
+
+SUPPORTED_SDK_DESTINATIONS = ['python']
+PYTHON_SUFFIX = "_et.py"
+MARKER = "# NOTE: This file contains autogenerated external transform(s)\n"
+
+
+def generate_transforms_config(input_services, output_file):
+  """
+  Generates a YAML file containing a list of transform configurations.
+
+  Takes an input YAML file containing a list of expansion service gradle
+  targets. Each service must provide a `destinations` field that specifies the
+  default package (relative path) that generated wrappers should be written
+  under. A default destination is specified for each SDK, like so::
+
+      - gradle_target: 'sdks:java:io:expansion-service:shadowJar'
+        destinations:
+          python: 'apache_beam/io'
+
+
+  Each service may also specify modifications for particular transforms.
+  Currently, one can modify the generated wrapper's name and destination file:
+
+  - By default, the transform's identifier is used to generate the wrapper
+    class name. This can be overridden by manually providing a name.
+  - By default, generated wrappers are written to files within the package
+    provided by the expansion service. This can be overridden by manually
+    providing a relative file path.
+
+  See the following example for what such modifications can look like::
+
+      - gradle_target: 'sdks:java:io:expansion-service:shadowJar'
+        destinations:
+          python: 'apache_beam/io'
+        transforms:
+          'beam:schematransform:org.apache.beam:my_transform:v1':
+            name: 'MyCustomTransformName'
+            destinations:
+              python: 'apache_beam/io/gcp/my_custom_module'
+
+  For the above example, we would take the transform with identifier
+  `beam:schematransform:org.apache.beam:my_transform:v1` and by default infer
+  a wrapper class name of `MyTransform` and write the generated code to
+  the module `apache_beam/io/my_transform_et.py`. With the modifications, we
+  instead write the wrapper to `apache_beam/io/gcp/my_custom_module_et.py` and
+  name the class `MyCustomTransformName`.
+
+  Note: we add the suffix `"_et.py"` to the module name so that we can find
+  these generated files later (e.g. to tell Git to ignore them, and to
+  delete them when needed)
+
+  To ignore a particular transform, simply list its identifier in the `ignore`
+  field, like so::
+
+      - gradle_target: 'sdks:java:io:expansion-service:shadowJar'
+        destinations:
+          python: 'apache_beam/io'
+        ignore:
+          - 'beam:schematransform:org.apache.beam:skip_me:v1':
+
+
+  We use :class:`ExternalSchemaTransformProvider` to discover external
+  transforms. Then, we extract the necessary details of each transform and
+  compile them into a new YAML file, which is later used to generate wrappers.
+  """
+  from apache_beam.transforms.external import BeamJarExpansionService
+  from apache_beam.transforms.external_schematransform_provider import STANDARD_URN_PATTERN
+  from apache_beam.transforms.external_schematransform_provider import ExternalSchemaTransform
+  from apache_beam.transforms.external_schematransform_provider import ExternalSchemaTransformProvider
+
+  transform_list: List[Dict[str, Any]] = []
+
+  with open(input_services) as f:
+    services = yaml.safe_load(f)
+  for service in services:
+    target = service['gradle_target']
+
+    if "destinations" not in service:
+      raise ValueError(
+          f"Expansion service with target [{target}] does not "
+          "specify any default destinations.")
+    service_destinations: Dict[str, str] = service['destinations']
+    for sdk in service_destinations.keys():
+      if sdk not in SUPPORTED_SDK_DESTINATIONS:
+        raise ValueError(
+            f"Service with target {target} specifies a "
+            f"destination for an invalid SDK: {sdk}. The "
+            f"supported SDKs are {SUPPORTED_SDK_DESTINATIONS}")
+
+    # get transforms to skip, if any
+    ignore = service.get('ignore', [])
+
+    # use dynamic provider to discover and populate wrapper details
+    provider = ExternalSchemaTransformProvider(BeamJarExpansionService(target))
+    discovered: Dict[str, ExternalSchemaTransform] = provider.get_all()
+    for identifier, wrapper in discovered.items():
+      if identifier in ignore:
+        continue
+      # We infer the destination from the URN and service destination.
+      # For example, the Java IO expansion service defaults to Python
+      # package apache_beam/io. Kafka Write is a transform in this service
+      # with URN beam:schematransform:org.apache.beam:kafka_write:v1
+      # In this case, we infer the destination apache_beam/io/kafka_write
+      functionality_identifier = re.match(STANDARD_URN_PATTERN,
+                                          identifier).groups()[0]
+      destinations = {
+          sdk: f"{destination}/{functionality_identifier}"
+          for sdk,
+          destination in service_destinations.items()
+      }
+      name = wrapper.__name__
+
+      # apply any modifications
+      modified_transform = {}
+      if 'transforms' in service and identifier in service['transforms']:
+        modified_transform = service['transforms'][identifier]
+      if 'name' in modified_transform:
+        name = modified_transform['name']  # override the name
+      if 'destinations' in modified_transform:
+        for sdk, destination in modified_transform['destinations'].items():
+          if sdk not in SUPPORTED_SDK_DESTINATIONS:
+            raise ValueError(
+                f"Identifier {identifier} specifies a destination for "
+                f"an invalid SDK: [{sdk}]. The supported SDKs "
+                f"are {SUPPORTED_SDK_DESTINATIONS}")
+          destinations[sdk] = destination  # override the destination
+
+      fields = {}
+      for param in wrapper.configuration_schema.values():
+        (tp, nullable) = prepare_type(param.type)
+
+        field_info = {
+            'type': str(tp),
+            'description': param.description,
+            'nullable': nullable
+        }
+        fields[param.original_name] = field_info
+
+      transform = {
+          'identifier': identifier,
+          'name': name,
+          'destinations': destinations,
+          'default_service': target,
+          'fields': fields,
+          'description': wrapper.description
+      }
+      transform_list.append(transform)
+
+  with open(output_file, 'w') as f:
+    f.write(LICENSE_HEADER.lstrip())
+    f.write(
+        "# NOTE: This file is autogenerated and should "
+        "not be edited by hand.\n")
+    dt = datetime.datetime.now().date()
+    f.write(f"# Last updated on: {dt}\n\n")
+    yaml.dump(transform_list, f)
+  logging.info("Successfully wrote transform configs to file: %s", output_file)
+
+
+def prepare_type(tp):
+  nullable = False
+  # if it's typing.Optional[...], unwrap to avoid redundancy. Nullability is
+  # communicated in the wrapper's constructor
+  if (typing.get_origin(tp) is Union and
+      type(None) in typing.get_args(tp)):
+    nullable = True
+    # only unwrap if it's a single nullable type. if the type is truly a union
+    # of multiple types, leave it alone.
+    args = typing.get_args(tp)
+    if len(args) == 2:
+      tp = list(filter(lambda t: t is not type(None), args))[0]
+
+  # some logic for setting the type's name to look pretty
+  # TODO(ahmedabu98): Make this more generic to support other remote SDKs
+  # Potentially use Runner API types
+  if tp.__module__ == 'builtins':
+    tp = tp.__name__
+  elif tp.__module__ == 'typing':
+    tp = str(tp).replace("typing.", "")
+  elif tp.__module__ == 'numpy':
+    tp = "%s.%s" % (tp.__module__, tp.__name__)
+
+  return (tp, nullable)
+
+def camel_case_to_snake_case(string):
+  """Convert camelCase to snake_case"""
+  arr = []
+  word = []
+  for i, n in enumerate(string):
+    # If seeing an upper letter after a lower letter, we just witnessed a word
+    # If seeing an upper letter and the next letter is lower, we may have just
+    # witnessed an all caps word
+    if n.isupper() and ((i > 0 and string[i - 1].islower()) or
+                        (i + 1 < len(string) and string[i + 1].islower())):
+      arr.append(''.join(word))
+      word = [n.lower()]
+    else:
+      word.append(n.lower())
+  arr.append(''.join(word))
+  return '_'.join(arr).strip('_')
+
+
+def get_wrappers_from_transform_configs(config_file) -> Dict[str, List[str]]:
+  """
+  Generates code for external transform wrapper classes (subclasses of
+  :class:`ExternalSchemaTransform`).
+
+  Takes a YAML file containing a list of SchemaTransform configurations. For
+  each configuration, the code for a wrapper class is generated, along with any
+  documentation that may be included.
+
+  Each configuration must include a destination file that the generated class
+  will be written to.
+
+  Returns the generated classes, grouped by destination.
+  """
+  env = Environment(loader=FileSystemLoader(PYTHON_SDK_ROOT))
+  python_wrapper_template = env.get_template("python_xlang_wrapper.template")
+
+  # maintain a list of wrappers to write in each file. if modified destinations
+  # are used, we may end up with multiple wrappers in one file.
+  destinations: Dict[str, List[str]] = {}
+
+  with open(config_file) as f:
+    transforms = yaml.safe_load(f)
+    for config in transforms:
+      default_service = config['default_service']
+      description = config['description']
+      destination = config['destinations']['python']
+      name = config['name']
+      fields = config['fields']
+      identifier = config['identifier']
+
+      parameters = []
+      for param, info in fields.items():
+        pythonic_name = camel_case_to_snake_case(param)
+        param_details = {
+            "name": pythonic_name,
+            "type": info['type'],
+            "description": info['description'],
+        }
+
+        if info['nullable']:
+          param_details["default"] = None
+        parameters.append(param_details)
+
+      # Python syntax requires function definitions to have
+      # non-default parameters first
+      parameters = sorted(parameters, key=lambda p: 'default' in p)
+      default_service = f"BeamJarExpansionService(\"{default_service}\")"
+
+      python_wrapper_class = python_wrapper_template.render(
+          class_name=name,
+          identifier=identifier,
+          parameters=parameters,
+          description=description,
+          default_expansion_service=default_service)
+
+      if destination not in destinations:
+        destinations[destination] = []
+      destinations[destination].append(python_wrapper_class)
+
+  return destinations
+
+
+def write_wrappers_to_destinations(grouped_wrappers: Dict[str, List[str]]):
+  """
+  Takes a dictionary of generated wrapper code, grouped by destination.
+  For each destination, creates a new file containing the respective wrapper
+  classes. Each file includes the Apache License header and relevant imports.
+  Note: the Jinja template should already follow linting and formatting rules.
+  """
+  written_files = []
+  for dest, wrappers in grouped_wrappers.items():
+    dest += PYTHON_SUFFIX
+    absolute_dest = os.path.join(PYTHON_SDK_ROOT, *dest.split('/'))
+    with open(absolute_dest, "w") as file:
+      file.write(LICENSE_HEADER.lstrip())
+      file.write(
+          MARKER + "# and should not be edited by hand.\n"
+          "# Refer to the utility at gen_xlang_wrappers.py for more info.\n\n")
+      file.write("# pylint:disable=line-too-long\n\n")
+      file.write(
+          "from apache_beam.transforms.external import "
+          "BeamJarExpansionService\n"
+          "from apache_beam.transforms.external_schematransform_provider "
+          "import ExternalSchemaTransform\n")
+      for wrapper in wrappers:
+        file.write(wrapper + "\n")
+    written_files.append(absolute_dest)
+
+  # We only make a best effort attempt to format with yapf because not all

Review Comment:
   Would it be easier to simply make yapf a requirement?
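   For illustration, a minimal sketch of what an unconditional yapf step could look like if yapf were a hard requirement of the generator environment. `written_files` refers to the list of paths collected in `write_wrappers_to_destinations` above; the helper name and everything else here is hypothetical, not part of the PR:

    # Sketch only: format each generated module via yapf's API instead of a
    # best-effort subprocess call. Assumes yapf is a required dependency.
    from yapf.yapflib import yapf_api

    def format_written_files(written_files):
      for path in written_files:
        # in_place=True rewrites the file; with no explicit style_config,
        # yapf looks up the project style (e.g. setup.cfg / .style.yapf)
        # starting from the directory of the formatted file.
        yapf_api.FormatFile(path, in_place=True)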
##########
sdks/python/apache_beam/transforms/external_schematransform_provider_test.py:
##########
@@ -14,24 +14,43 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
 import logging
 import os
+import secrets
+import shutil
+import time
 import unittest
+from importlib import import_module
+
 import pytest
+import yaml
+
 import apache_beam as beam
+from apache_beam.options.pipeline_options import PipelineOptions
 from apache_beam.testing.test_pipeline import TestPipeline
 from apache_beam.testing.util import assert_that
 from apache_beam.testing.util import equal_to
 from apache_beam.transforms.external import BeamJarExpansionService
 from apache_beam.transforms.external_schematransform_provider import STANDARD_URN_PATTERN
+from apache_beam.transforms.external_schematransform_provider import ExternalSchemaTransform
 from apache_beam.transforms.external_schematransform_provider import ExternalSchemaTransformProvider
 from apache_beam.transforms.external_schematransform_provider import camel_case_to_snake_case
 from apache_beam.transforms.external_schematransform_provider import infer_name_from_identifier
 from apache_beam.transforms.external_schematransform_provider import snake_case_to_lower_camel_case
 from apache_beam.transforms.external_schematransform_provider import snake_case_to_upper_camel_case
+try:

Review Comment:
   Why would these not be available?

##########
sdks/python/standard_external_transforms.yaml:
##########
@@ -0,0 +1,725 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# NOTE: This file is autogenerated and should not be edited by hand.

Review Comment:
   Put instructions here.

##########
sdks/python/standard_external_transforms.yaml:
##########
@@ -0,0 +1,725 @@
+#

Review Comment:
   Could we just have a single example transform, then check in all this (and the proceeding file) that adds all the transforms in as a subsequent PR?
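   For reference, each entry that `generate_transforms_config` dumps into this file carries the keys `default_service`, `description`, `destinations`, `fields`, `identifier` and `name`, so a single checked-in example entry would presumably look roughly like the sketch below. The identifier reuses the hypothetical `my_transform` example from the gen_xlang_wrappers.py docstring, and the field name and descriptions are made up for illustration:

    - default_service: sdks:java:io:expansion-service:shadowJar
      description: Hypothetical transform, shown only to illustrate the config shape.
      destinations:
        python: apache_beam/io/my_transform
      fields:
        myField:
          description: A sample string field.
          nullable: true
          type: str
      identifier: beam:schematransform:org.apache.beam:my_transform:v1
      name: MyTransform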
##########
sdks/python/standard_expansion_services.yaml:
##########
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+# This file enumerates the standard Apache Beam expansion services.
+# Each service must specify a package destination for each supported SDK, which
+# is where generated wrappers will go by default.
+#
+# Individual transforms can modify their destination module as well as their
+# generated wrapper class name.
+#
+# Transform identifiers listed in the `ignore` field will be skipped.
+
+- gradle_target: 'sdks:java:io:expansion-service:shadowJar'
+  destinations:
+    python: 'apache_beam/io'
+  transforms:
+    'beam:schematransform:org.apache.beam:kafka_write:v1':
+      name: 'WriteToKafka'
+      destinations:
+        python: 'apache_beam/io/kafka'
+    'beam:schematransform:org.apache.beam:kafka_read:v1':
+      name: 'ReadFromKafka'
+      destinations:
+        python: 'apache_beam/io/kafka'
+
+- gradle_target: 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'
+  destinations:
+    python: 'apache_beam/io/gcp'
+  transforms:
+    'beam:schematransform:org.apache.beam:bigquery_storage_write:v2':
+      name: 'StorageWriteToBigQuery'
+      destinations:
+        python: 'apache_beam/io/gcp/bigquery'
+    'beam:schematransform:org.apache.beam:bigquery_storage_read:v1':
+      name: 'StorageReadFromBigQuery'
+      destinations:
+        python: 'apache_beam/io/gcp/bigquery'
+    'beam:schematransform:org.apache.beam:bigquery_fileloads_write:v1':
+      name: 'FileLoadsToBigQuery'
+      destinations:
+        python: 'apache_beam/io/gcp/bigquery'
+    'beam:schematransform:org.apache.beam:bigquery_export_read:v1':
+      name: 'ExportReadFromBigQuery'
+      destinations:
+        python: 'apache_beam/io/gcp/bigquery'
+    'beam:schematransform:org.apache.beam:bigtable_write:v1':
+      name: 'WriteToBigtable'
+      destinations:
+        python: 'apache_beam/io/gcp/bigtable'
+    'beam:schematransform:org.apache.beam:bigtable_read:v1':
+      name: 'ReadFromBigtable'
+      destinations:
+        python: 'apache_beam/io/gcp/bigtable'
+    'beam:schematransform:org.apache.beam:pubsub_read:v1':
+      name: 'ReadFromPubSub'
+      destinations:
+        python: 'apache_beam/io/gcp/pubsub'
+    'beam:schematransform:org.apache.beam:pubsub_write:v1':
+      name: 'WriteToPubSub'
+      destinations:
+        python: 'apache_beam/io/gcp/pubsub'
+    'beam:schematransform:org.apache.beam:pubsublite_read:v1':
+      name: 'ReadFromPubSubLite'
+      destinations:
+        python: 'apache_beam/io/gcp/pubsublite'
+    'beam:schematransform:org.apache.beam:pubsublite_write:v1':
+      name: 'WriteToPubSubLite'
+      destinations:
+        python: 'apache_beam/io/gcp/pubsublite'
+    'beam:schematransform:org.apache.beam:spanner_cdc_read:v1':
+      name: 'ReadFromSpannerChangeStreams'
+      destinations:
+        python: 'apache_beam/io/gcp/spanner'
+    'beam:schematransform:org.apache.beam:spanner_write:v1':
+      name: 'WriteToSpanner'
+      destinations:
+        python: 'apache_beam/io/gcp/spanner'
+    'beam:schematransform:org.apache.beam:jdbc_write:v1':
+      name: 'WriteToJdbc'
+      destinations:
+        python: 'apache_beam/io/jdbc'
+    'beam:schematransform:org.apache.beam:jdbc_read:v1':
+      name: 'ReadFromJdbc'
+      destinations:
+        python: 'apache_beam/io/jdbc'
+  ignore:

Review Comment:
   +1

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]