gemini-code-assist[bot] commented on code in PR #38236:
URL: https://github.com/apache/beam/pull/38236#discussion_r3102535862


##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -205,83 +197,86 @@ def py_value_to_js_dict(py_value):
     return py_value
 
 
+def js_to_py(obj):
+  """Converts mini-racer mapped objects to standard Python types.
+  
+  This is needed because ctx.eval returns JSMappedObjectImpl and JSArrayImpl
+  for JS objects and arrays, which are not picklable and would fail when Beam
+  tries to serialize rows containing them. We also preserve datetime objects
+  which are correctly produced by ctx.eval for JS Date objects.
+  """
+  import datetime
+  from collections import abc
+  
+  type_name = type(obj).__name__
+  if type_name == 'JSMappedObjectImpl':
+    return {k: js_to_py(v) for k, v in dict(obj).items()}
+  elif type_name == 'JSArrayImpl':
+    return [js_to_py(v) for v in list(obj)]
+  elif isinstance(obj, datetime.datetime):
+    return obj
+  elif isinstance(obj, dict):
+    return {k: js_to_py(v) for k, v in obj.items()}
+  elif not isinstance(obj, str) and isinstance(obj, abc.Iterable):
+    return [js_to_py(v) for v in list(obj)]
+  else:
+    return obj
+
+
 # TODO(yaml) Consider adding optional language version parameter to support
 #  ECMAScript 5 and 6
 def _expand_javascript_mapping_func(
     original_fields, expression=None, callable=None, path=None, name=None):
 
-  # Check for installed js2py package
-  if js2py is None:
+  if MiniRacer is None:
     raise ValueError(
-        "Javascript mapping functions are not supported on"
-        " Python 3.12 or later.")
-
-  # import remaining js2py objects
-  from js2py import base
-  from js2py.constructors import jsdate
-  from js2py.internals import simplex
-
-  js_array_type = (
-      base.PyJsArray,
-      base.PyJsArrayBuffer,
-      base.PyJsInt8Array,
-      base.PyJsUint8Array,
-      base.PyJsUint8ClampedArray,
-      base.PyJsInt16Array,
-      base.PyJsUint16Array,
-      base.PyJsInt32Array,
-      base.PyJsUint32Array,
-      base.PyJsFloat32Array,
-      base.PyJsFloat64Array)
-
-  def _js_object_to_py_object(obj):
-    if isinstance(obj, (base.PyJsNumber, base.PyJsString, base.PyJsBoolean)):
-      return base.to_python(obj)
-    elif isinstance(obj, js_array_type):
-      return [_js_object_to_py_object(value) for value in obj.to_list()]
-    elif isinstance(obj, jsdate.PyJsDate):
-      return obj.to_utc_dt()
-    elif isinstance(obj, (base.PyJsNull, base.PyJsUndefined)):
-      return None
-    elif isinstance(obj, base.PyJsError):
-      raise RuntimeError(obj['message'])
-    elif isinstance(obj, base.PyJsObject):
-      return {
-          key: _js_object_to_py_object(value['value'])
-          for (key, value) in obj.own.items()
-      }
-    elif isinstance(obj, base.JsObjectWrapper):
-      return _js_object_to_py_object(obj._obj)
-
-    return obj
-
-  if expression:
-    source = '\n'.join(['function(__row__) {'] + [
-        f'  {name} = __row__.{name}'
-        for name in original_fields if name in expression
-    ] + ['  return (' + expression + ')'] + ['}'])
-    js_func = _CustomJsObjectWrapper(js2py.eval_js(source))
-
-  elif callable:
-    js_func = _CustomJsObjectWrapper(js2py.eval_js(callable))
+        "JavaScript mapping functions require the 'mini-racer' package to be 
installed.")
 
-  else:
+  udf_code = None
+  if path:
     if not path.endswith('.js'):
       raise ValueError(f'File "{path}" is not a valid .js file.')
     udf_code = FileSystems.open(path).read().decode()
-    js = js2py.EvalJs()
-    js.eval(udf_code)
-    js_func = _CustomJsObjectWrapper(getattr(js, name))
+  elif expression:
+    udf_code = f"var func = (__row__) => {{ " + " ".join([
+        f"const {n} = __row__.{n};"
+        for n in original_fields if n in expression
+    ]) + f" return ({expression}); }}"
+  elif callable:
+    udf_code = f"var func = {callable}"
+
+  udf_key = str(uuid.uuid4())
 
   def js_wrapper(row):
+    tid = threading.get_ident()
+    
+    global _js_thread_funcs
+    # MiniRacer contexts are not picklable and cannot be shared across threads.
+    # We use a global dict keyed by thread ID to lazily create and cache a
+    # context per thread.
+    if tid not in _js_thread_funcs:
+      _js_thread_funcs[tid] = {}
+      
+    if udf_key not in _js_thread_funcs[tid]:
+      ctx = MiniRacer()
+      ctx.eval(udf_code)
+      # We use ctx.eval instead of ctx.call to ensure that JavaScript Date
+      # objects are correctly returned as Python datetime objects.
+      # We JSON-serialize the arguments to pass them safely to eval.
+      if expression or callable:
+        _js_thread_funcs[tid][udf_key] = lambda x: 
ctx.eval(f"func({json.dumps(x)})")
+      else:
+        _js_thread_funcs[tid][udf_key] = lambda x: 
ctx.eval(f"{name}({json.dumps(x)})")
+        
+    func = _js_thread_funcs[tid][udf_key]

Review Comment:
   ![high](https://www.gstatic.com/codereviewagent/high-priority.svg)
   
   The manual thread-local management and the use of `ctx.eval` with 
`json.dumps` for every row are inefficient. `py-mini-racer` provides a `call` 
method that handles argument serialization more cleanly. Additionally, using 
`threading.local()` simplifies the caching logic.
   
   ```suggestion
     def js_wrapper(row):
       if not hasattr(_js_thread_local, 'contexts'):
         _js_thread_local.contexts = {}
       if udf_key not in _js_thread_local.contexts:
         ctx = MiniRacer()
         ctx.eval(udf_code)
         _js_thread_local.contexts[udf_key] = ctx
       
       ctx = _js_thread_local.contexts[udf_key]
       row_as_dict = py_value_to_js_dict(row)
       try:
         if expression or callable:
           result = ctx.call("func", row_as_dict)
         else:
           result = ctx.call(name, row_as_dict)
       except Exception as exn:
         raise RuntimeError(
             f"Error evaluating JavaScript expression: {exn}") from exn
       result = js_to_py(result)
       return dicts_to_rows(result)
   ```



##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -205,83 +197,86 @@ def py_value_to_js_dict(py_value):
     return py_value
 
 
+def js_to_py(obj):
+  """Converts mini-racer mapped objects to standard Python types.
+  
+  This is needed because ctx.eval returns JSMappedObjectImpl and JSArrayImpl
+  for JS objects and arrays, which are not picklable and would fail when Beam
+  tries to serialize rows containing them. We also preserve datetime objects
+  which are correctly produced by ctx.eval for JS Date objects.
+  """
+  import datetime
+  from collections import abc

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   These imports are executed every time `js_to_py` is called (which happens 
for every row and recursively for nested objects). While Python caches imports, 
performing lookups in `sys.modules` millions of times adds unnecessary 
overhead. Please move these imports to the top of the file.



##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -53,13 +57,12 @@
 from apache_beam.yaml.yaml_errors import 
maybe_with_exception_handling_transform_fn
 from apache_beam.yaml.yaml_provider import dicts_to_rows
 
-# Import js2py package if it exists
 try:
-  import js2py
-  from js2py.base import JsObjectWrapper
+  from py_mini_racer import MiniRacer
 except ImportError:
-  js2py = None
-  JsObjectWrapper = object
+  MiniRacer = None
+
+_js_thread_funcs = {}

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   Using a global dictionary keyed by thread ID for thread-local storage is 
prone to race conditions during initialization and can lead to memory leaks if 
threads are frequently created and destroyed. It is more idiomatic and safer to 
use `threading.local()` for this purpose.
   
   ```suggestion
   _js_thread_local = threading.local()
   ```



##########
sdks/python/apache_beam/yaml/yaml_mapping.py:
##########
@@ -205,83 +197,86 @@ def py_value_to_js_dict(py_value):
     return py_value
 
 
+def js_to_py(obj):
+  """Converts mini-racer mapped objects to standard Python types.
+  
+  This is needed because ctx.eval returns JSMappedObjectImpl and JSArrayImpl
+  for JS objects and arrays, which are not picklable and would fail when Beam
+  tries to serialize rows containing them. We also preserve datetime objects
+  which are correctly produced by ctx.eval for JS Date objects.
+  """
+  import datetime
+  from collections import abc
+  
+  type_name = type(obj).__name__
+  if type_name == 'JSMappedObjectImpl':
+    return {k: js_to_py(v) for k, v in dict(obj).items()}
+  elif type_name == 'JSArrayImpl':
+    return [js_to_py(v) for v in list(obj)]
+  elif isinstance(obj, datetime.datetime):
+    return obj
+  elif isinstance(obj, dict):
+    return {k: js_to_py(v) for k, v in obj.items()}
+  elif not isinstance(obj, str) and isinstance(obj, abc.Iterable):
+    return [js_to_py(v) for v in list(obj)]
+  else:
+    return obj

Review Comment:
   ![medium](https://www.gstatic.com/codereviewagent/medium-priority.svg)
   
   The current implementation of `js_to_py` relies on internal class names of 
the `py-mini-racer` library (e.g., `JSMappedObjectImpl`), which is fragile. 
Since these objects implement standard Python protocols, you can use 
`collections.abc.Mapping` and `collections.abc.Iterable` for a more robust and 
idiomatic implementation.
   
   ```python
     if isinstance(obj, (str, bytes)):
       return obj
     if isinstance(obj, datetime.datetime):
       return obj
     if isinstance(obj, abc.Mapping):
       return {k: js_to_py(v) for k, v in obj.items()}
     if isinstance(obj, abc.Iterable):
       return [js_to_py(v) for v in obj]
     return obj
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to