Copilot commented on code in PR #1546:
URL:
https://github.com/apache/datafusion-python/pull/1546#discussion_r3290100174
##########
crates/core/src/codec.rs:
##########
@@ -301,48 +324,102 @@ impl LogicalExtensionCodec for PythonLogicalCodec {
}
fn try_encode_udf(&self, node: &ScalarUDF, buf: &mut Vec<u8>) ->
Result<()> {
- if try_encode_python_scalar_udf(node, buf)? {
+ if self.python_udf_inlining && try_encode_python_scalar_udf(node,
buf)? {
return Ok(());
}
self.inner.try_encode_udf(node, buf)
}
fn try_decode_udf(&self, name: &str, buf: &[u8]) -> Result<Arc<ScalarUDF>>
{
- if let Some(udf) = try_decode_python_scalar_udf(buf)? {
- return Ok(udf);
+ if self.python_udf_inlining {
+ if let Some(udf) = try_decode_python_scalar_udf(buf)? {
+ return Ok(udf);
+ }
+ } else {
+ refuse_if_inline(buf, PY_SCALAR_UDF_FAMILY, "scalar UDF", name)?;
}
self.inner.try_decode_udf(name, buf)
}
fn try_encode_udaf(&self, node: &AggregateUDF, buf: &mut Vec<u8>) ->
Result<()> {
- if try_encode_python_udaf(node, buf)? {
+ if self.python_udf_inlining && try_encode_python_udaf(node, buf)? {
return Ok(());
}
self.inner.try_encode_udaf(node, buf)
}
fn try_decode_udaf(&self, name: &str, buf: &[u8]) ->
Result<Arc<AggregateUDF>> {
- if let Some(udaf) = try_decode_python_udaf(buf)? {
- return Ok(udaf);
+ if self.python_udf_inlining {
+ if let Some(udaf) = try_decode_python_udaf(buf)? {
+ return Ok(udaf);
+ }
+ } else {
+ refuse_if_inline(buf, PY_AGG_UDF_FAMILY, "aggregate UDF", name)?;
}
self.inner.try_decode_udaf(name, buf)
}
fn try_encode_udwf(&self, node: &WindowUDF, buf: &mut Vec<u8>) ->
Result<()> {
- if try_encode_python_udwf(node, buf)? {
+ if self.python_udf_inlining && try_encode_python_udwf(node, buf)? {
return Ok(());
}
self.inner.try_encode_udwf(node, buf)
}
fn try_decode_udwf(&self, name: &str, buf: &[u8]) ->
Result<Arc<WindowUDF>> {
- if let Some(udwf) = try_decode_python_udwf(buf)? {
- return Ok(udwf);
+ if self.python_udf_inlining {
+ if let Some(udwf) = try_decode_python_udwf(buf)? {
+ return Ok(udwf);
+ }
+ } else {
+ refuse_if_inline(buf, PY_WINDOW_UDF_FAMILY, "window UDF", name)?;
}
self.inner.try_decode_udwf(name, buf)
}
}
+/// Strict-mode gate: if `buf` is a well-framed inline payload for
+/// `family`, return the strict-refusal error; otherwise return
+/// `Ok(())` so the caller can delegate to its `inner` codec.
+///
+/// Routing through [`read_framed_payload`] (rather than a bare
+/// `starts_with` probe) means malformed inline bytes — wrong
+/// wire-format version, mismatched Python version, truncated header —
+/// surface *their* diagnostic instead of the strict-mode message.
+/// The strict message implies sender intent ("inlining is disabled"),
+/// so it should fire only when the bytes really would have decoded.
+///
+/// Fast path: short-circuit on the family-magic prefix before
+/// acquiring the GIL. Plans with many non-Python UDFs would otherwise
+/// pay a GIL acquisition per decode call just to confirm "not a
+/// Python UDF". `read_framed_payload` itself rejects buffers that
+/// don't start with `family`, so this is purely an optimization.
+fn refuse_if_inline(buf: &[u8], family: &[u8], kind: &str, name: &str) ->
Result<()> {
+ if !buf.starts_with(family) {
+ return Ok(());
+ }
+ Python::attach(|py| match read_framed_payload(py, buf, family, kind)? {
+ Some(_) => Err(refuse_inline_payload(kind, name)),
+ None => Ok(()),
+ })
+}
+
+/// Build the error returned by a strict codec when it receives an
+/// inline Python-UDF payload it has been told not to deserialize.
+fn refuse_inline_payload(kind: &str, name: &str) ->
datafusion::error::DataFusionError {
+ // `Execution`, not `Plan`: this is a wire-format decode refusal at
+ // codec time, not a planner-stage failure. Downstream error
+ // classification keys off the variant — surfacing this as a planner
+ // error would mis-route it into "fix your SQL" buckets.
+ datafusion::error::DataFusionError::Execution(format!(
+ "Refusing to deserialize inline Python {kind} '{name}': Python UDF \
+ inlining is disabled on this session. Ask the sender to re-encode \
+ with inlining disabled (so the UDF travels by name), or register \
+ '{name}' on this receiver's session and enable inlining on both \
+ sides — receivers cannot re-encode bytes they did not produce."
Review Comment:
The strict-mode refusal message is a bit misleading: it suggests registering
the UDF and enabling inlining on both sides, but registering the UDF won’t help
while the receiver is strict (the codec refuses inline payloads before registry
lookup), and the sender already used inlining. Consider rewording to present
the two real remediations: (1) re-encode with inlining disabled so the UDF
travels by name and can be resolved from the registry, or (2) enable inlining
on the *receiver* (accepting the cloudpickle security tradeoff).
##########
python/datafusion/expr.py:
##########
@@ -446,8 +447,10 @@ def to_bytes(self, ctx: SessionContext | None = None) ->
bytes:
worker process for distributed evaluation.
When ``ctx`` is supplied, encoding routes through that session's
- installed :class:`LogicalExtensionCodec`. When ``ctx`` is
- ``None``, the default codec is used.
+ installed :class:`LogicalExtensionCodec` (so settings like
+ :meth:`SessionContext.with_python_udf_inlining` take effect).
+ When ``ctx`` is ``None``, the default codec is used (Python UDF
+ inlining on, no user-installed extension codec).
Built-in functions and Python UDFs (scalar, aggregate, window)
Review Comment:
This docstring still states that “Python UDFs … travel inside the returned
bytes” unconditionally, but this is no longer always true now that
`SessionContext.with_python_udf_inlining(enabled=False)` can force Python UDFs
to travel by name only (requiring receiver-side registration). Please qualify
this sentence to reflect the toggle, otherwise readers may assume strict-mode
payloads are still self-contained.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]