Copilot commented on code in PR #5320:
URL: https://github.com/apache/texera/pull/5320#discussion_r3425811888
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
| detail = "<empty response>"
| return f"{title} [status={status_code}] response={detail}"
|
+ | #
──────────────────────────────────────────────────────────────────
+ | # Image-task helpers (used by ImageTaskCodegen and image-related
+ | # branches of _call_provider).
+ | #
──────────────────────────────────────────────────────────────────
+ |
+ | def _read_image_input(self):
+ | image_input = str(self.IMAGE_INPUT or "").strip()
+ | if image_input.startswith("data:"):
+ | _, encoded = image_input.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if image_input.startswith("http://") or
image_input.startswith("https://"):
+ | resp = requests.get(image_input, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if not os.path.exists(image_input):
+ | raise FileNotFoundError(f"Image file not found at path:
{image_input}")
+ | if not os.path.isfile(image_input):
+ | raise ValueError(f"Image input path is not a file:
{image_input}")
+ | with open(image_input, "rb") as image_file:
+ | return image_file.read()
+ |
+ | def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+ | from io import BytesIO
+ | from PIL import Image as PILImage
+ | if len(image_bytes) <= max_bytes:
+ | return image_bytes
+ | try:
+ | img = PILImage.open(BytesIO(image_bytes))
+ | img = img.convert("RGB")
+ | max_dim = 512
+ | quality = 75
+ | while max_dim >= 160:
+ | scale = min(1, max_dim / max(img.width, img.height))
+ | w = max(1, round(img.width * scale))
+ | h = max(1, round(img.height * scale))
+ | resized = img.resize((w, h), PILImage.LANCZOS)
+ | q = quality
+ | while q >= 35:
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=q)
+ | if buf.tell() <= max_bytes:
+ | return buf.getvalue()
+ | q -= 10
+ | max_dim = int(max_dim * 0.75)
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=35)
+ | return buf.getvalue()
+ | except Exception:
+ | return image_bytes
+ |
+ | def _image_input_as_base64(self, image_bytes):
+ | return base64.b64encode(image_bytes).decode("utf-8")
+ |
+ | def _read_binary_value(self, value):
+ | if value is None or (isinstance(value, float) and
pd.isna(value)):
+ | return None
+ | if isinstance(value, bytes):
+ | return value
+ | val = str(value).strip()
+ | if not val:
+ | return None
+ | if self._looks_like_html(val):
+ | return self._html_to_image_bytes(val)
+ | if val.startswith("data:"):
+ | _, encoded = val.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if val.startswith("http://") or val.startswith("https://"):
+ | resp = requests.get(val, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if os.path.exists(val) and os.path.isfile(val):
+ | with open(val, "rb") as f:
+ | return f.read()
+ | try:
+ | return base64.b64decode(val)
+ | except Exception:
+ | return val.encode("utf-8")
+ |
+ | def _looks_like_html(self, val):
+ | s = val.lstrip()[:200].lower()
+ | if s.startswith("<!doctype html") or s.startswith("<html"):
+ | return True
+ | if "plotly.newplot" in val[:5000].lower() or "plotly.react" in
val[:5000].lower():
+ | return True
+ | if "<img" in s and "base64," in s:
+ | return True
+ | return False
+ |
+ | def _html_to_image_bytes(self, html_string):
+ | match =
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+ | if match:
+ | b64 = match.group(1).replace("\\n", "").replace("\\r",
"").replace(" ", "")
+ | return base64.b64decode(b64)
Review Comment:
`_html_to_image_bytes` uses a raw-string regex and then strips
`"\\n"`/`"\\r"`; both match literal backslash sequences rather than real
newlines. As a result, base64 data URLs will not be extracted correctly when
the HTML contains actual line breaks/whitespace in the base64 payload.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -266,11 +461,12 @@ object PythonCodegenBase {
| # --- resolve all available inference providers for this model
(tried in order) ---
| providers = self._resolve_providers(token)
|
- | # --- validate prompt column exists ---
- | assert prompt_col in table.columns, (
- | f"Prompt column '{prompt_col}' not found in input table. "
- | f"Available columns: {list(table.columns)}"
- | )
+ | # --- validate prompt column exists (skipped for image-only
tasks) ---
+ | if task not in image_only_tasks:
+ | assert prompt_col in table.columns, (
+ | f"Prompt column '{prompt_col}' not found in input
table. "
+ | f"Available columns: {list(table.columns)}"
+ | )
Review Comment:
`process_table` now has a fallback when `prompt_col` is missing for
image+prompt tasks, but the earlier validation still asserts `prompt_col in
table.columns` for those tasks (`task not in image_only_tasks`). That makes the
fallback unreachable (unless Python runs with `-O`), and image+prompt tasks
will fail fast even though the per-row logic can handle a missing prompt column.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
| detail = "<empty response>"
| return f"{title} [status={status_code}] response={detail}"
|
+ | #
──────────────────────────────────────────────────────────────────
+ | # Image-task helpers (used by ImageTaskCodegen and image-related
+ | # branches of _call_provider).
+ | #
──────────────────────────────────────────────────────────────────
+ |
+ | def _read_image_input(self):
+ | image_input = str(self.IMAGE_INPUT or "").strip()
+ | if image_input.startswith("data:"):
+ | _, encoded = image_input.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if image_input.startswith("http://") or
image_input.startswith("https://"):
+ | resp = requests.get(image_input, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if not os.path.exists(image_input):
+ | raise FileNotFoundError(f"Image file not found at path:
{image_input}")
+ | if not os.path.isfile(image_input):
+ | raise ValueError(f"Image input path is not a file:
{image_input}")
+ | with open(image_input, "rb") as image_file:
+ | return image_file.read()
+ |
+ | def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+ | from io import BytesIO
+ | from PIL import Image as PILImage
+ | if len(image_bytes) <= max_bytes:
+ | return image_bytes
+ | try:
+ | img = PILImage.open(BytesIO(image_bytes))
+ | img = img.convert("RGB")
+ | max_dim = 512
+ | quality = 75
+ | while max_dim >= 160:
+ | scale = min(1, max_dim / max(img.width, img.height))
+ | w = max(1, round(img.width * scale))
+ | h = max(1, round(img.height * scale))
+ | resized = img.resize((w, h), PILImage.LANCZOS)
+ | q = quality
+ | while q >= 35:
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=q)
+ | if buf.tell() <= max_bytes:
+ | return buf.getvalue()
+ | q -= 10
+ | max_dim = int(max_dim * 0.75)
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=35)
+ | return buf.getvalue()
+ | except Exception:
+ | return image_bytes
+ |
+ | def _image_input_as_base64(self, image_bytes):
+ | return base64.b64encode(image_bytes).decode("utf-8")
+ |
+ | def _read_binary_value(self, value):
+ | if value is None or (isinstance(value, float) and
pd.isna(value)):
+ | return None
Review Comment:
`_read_binary_value` only treats `float('nan')` as missing via
`(isinstance(value, float) and pd.isna(value))`. Pandas missing sentinels like
`pd.NA` / `NaT` (and other scalar NA types) will fall through and be converted
to bytes (`b'<NA>'`), which is incorrect for image columns.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -55,11 +55,15 @@ object PythonCodegenBase {
val systemPrompt = ctx.systemPrompt
val maxNewTokens = ctx.safeMaxTokens
val temperature = ctx.safeTemp
+ val imageInput = ctx.imageInput
+ val inputImageColumn = ctx.inputImageColumn
pyb"""import os
|import re
|import json
+ |import base64
|import requests
|import pandas as pd
+ |from urllib.parse import urlparse
|from pytexera import *
Review Comment:
`from urllib.parse import urlparse` is imported in the generated Python but
never used (the code re-imports `urlparse as _urlparse` in the few places it
needs it). This can be dropped to keep the generated script minimal and avoid
misleading readers about which import is actually used.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
| detail = "<empty response>"
| return f"{title} [status={status_code}] response={detail}"
|
+ | #
──────────────────────────────────────────────────────────────────
+ | # Image-task helpers (used by ImageTaskCodegen and image-related
+ | # branches of _call_provider).
+ | #
──────────────────────────────────────────────────────────────────
+ |
+ | def _read_image_input(self):
+ | image_input = str(self.IMAGE_INPUT or "").strip()
+ | if image_input.startswith("data:"):
+ | _, encoded = image_input.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if image_input.startswith("http://") or
image_input.startswith("https://"):
+ | resp = requests.get(image_input, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if not os.path.exists(image_input):
+ | raise FileNotFoundError(f"Image file not found at path:
{image_input}")
+ | if not os.path.isfile(image_input):
+ | raise ValueError(f"Image input path is not a file:
{image_input}")
+ | with open(image_input, "rb") as image_file:
+ | return image_file.read()
+ |
+ | def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+ | from io import BytesIO
+ | from PIL import Image as PILImage
+ | if len(image_bytes) <= max_bytes:
+ | return image_bytes
+ | try:
+ | img = PILImage.open(BytesIO(image_bytes))
+ | img = img.convert("RGB")
+ | max_dim = 512
+ | quality = 75
+ | while max_dim >= 160:
+ | scale = min(1, max_dim / max(img.width, img.height))
+ | w = max(1, round(img.width * scale))
+ | h = max(1, round(img.height * scale))
+ | resized = img.resize((w, h), PILImage.LANCZOS)
+ | q = quality
+ | while q >= 35:
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=q)
+ | if buf.tell() <= max_bytes:
+ | return buf.getvalue()
+ | q -= 10
+ | max_dim = int(max_dim * 0.75)
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=35)
+ | return buf.getvalue()
+ | except Exception:
+ | return image_bytes
+ |
+ | def _image_input_as_base64(self, image_bytes):
+ | return base64.b64encode(image_bytes).decode("utf-8")
+ |
+ | def _read_binary_value(self, value):
+ | if value is None or (isinstance(value, float) and
pd.isna(value)):
+ | return None
+ | if isinstance(value, bytes):
+ | return value
+ | val = str(value).strip()
+ | if not val:
+ | return None
+ | if self._looks_like_html(val):
+ | return self._html_to_image_bytes(val)
+ | if val.startswith("data:"):
+ | _, encoded = val.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if val.startswith("http://") or val.startswith("https://"):
+ | resp = requests.get(val, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if os.path.exists(val) and os.path.isfile(val):
+ | with open(val, "rb") as f:
+ | return f.read()
+ | try:
+ | return base64.b64decode(val)
+ | except Exception:
+ | return val.encode("utf-8")
+ |
+ | def _looks_like_html(self, val):
+ | s = val.lstrip()[:200].lower()
+ | if s.startswith("<!doctype html") or s.startswith("<html"):
+ | return True
+ | if "plotly.newplot" in val[:5000].lower() or "plotly.react" in
val[:5000].lower():
+ | return True
+ | if "<img" in s and "base64," in s:
+ | return True
+ | return False
+ |
+ | def _html_to_image_bytes(self, html_string):
+ | match =
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+ | if match:
+ | b64 = match.group(1).replace("\\n", "").replace("\\r",
"").replace(" ", "")
+ | return base64.b64decode(b64)
+ | if "Plotly." in html_string:
+ | try:
+ | import plotly.graph_objects as go
+ | import plotly.io as pio
+ | plotly_match =
re.search(r"Plotly\\.(?:newPlot|react)\\s*\\(\\s*", html_string)
+ | if plotly_match:
+ | pos = plotly_match.end()
+ | if pos < len(html_string) and html_string[pos] in
('"', "'"):
+ | q = html_string[pos]
+ | pos += 1
+ | while pos < len(html_string) and
html_string[pos] != q:
+ | if html_string[pos] == "\\\\":
+ | pos += 1
+ | pos += 1
+ | pos += 1
+ | while pos < len(html_string) and html_string[pos]
in " ,\\n\\r\\t":
+ | pos += 1
+ | data_json, pos =
self._extract_json_arg(html_string, pos)
+ | while pos < len(html_string) and html_string[pos]
in " ,\\n\\r\\t":
+ | pos += 1
Review Comment:
These whitespace-skipping loops check membership in the string
`" ,\\n\\r\\t"`, which looks for literal backslash characters rather than
newline/tab characters. That prevents `_html_to_image_bytes` from skipping real
whitespace/newlines around Plotly JSON arguments, making `_extract_json_arg`
much more likely to fail on real Plotly HTML.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
| detail = "<empty response>"
| return f"{title} [status={status_code}] response={detail}"
|
+ | #
──────────────────────────────────────────────────────────────────
+ | # Image-task helpers (used by ImageTaskCodegen and image-related
+ | # branches of _call_provider).
+ | #
──────────────────────────────────────────────────────────────────
+ |
+ | def _read_image_input(self):
+ | image_input = str(self.IMAGE_INPUT or "").strip()
+ | if image_input.startswith("data:"):
+ | _, encoded = image_input.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if image_input.startswith("http://") or
image_input.startswith("https://"):
+ | resp = requests.get(image_input, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if not os.path.exists(image_input):
+ | raise FileNotFoundError(f"Image file not found at path:
{image_input}")
+ | if not os.path.isfile(image_input):
+ | raise ValueError(f"Image input path is not a file:
{image_input}")
+ | with open(image_input, "rb") as image_file:
+ | return image_file.read()
+ |
+ | def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+ | from io import BytesIO
+ | from PIL import Image as PILImage
+ | if len(image_bytes) <= max_bytes:
+ | return image_bytes
+ | try:
+ | img = PILImage.open(BytesIO(image_bytes))
+ | img = img.convert("RGB")
+ | max_dim = 512
+ | quality = 75
+ | while max_dim >= 160:
+ | scale = min(1, max_dim / max(img.width, img.height))
+ | w = max(1, round(img.width * scale))
+ | h = max(1, round(img.height * scale))
+ | resized = img.resize((w, h), PILImage.LANCZOS)
+ | q = quality
+ | while q >= 35:
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=q)
+ | if buf.tell() <= max_bytes:
+ | return buf.getvalue()
+ | q -= 10
+ | max_dim = int(max_dim * 0.75)
+ | buf = BytesIO()
+ | resized.save(buf, format="JPEG", quality=35)
+ | return buf.getvalue()
+ | except Exception:
+ | return image_bytes
+ |
+ | def _image_input_as_base64(self, image_bytes):
+ | return base64.b64encode(image_bytes).decode("utf-8")
+ |
+ | def _read_binary_value(self, value):
+ | if value is None or (isinstance(value, float) and
pd.isna(value)):
+ | return None
+ | if isinstance(value, bytes):
+ | return value
+ | val = str(value).strip()
+ | if not val:
+ | return None
+ | if self._looks_like_html(val):
+ | return self._html_to_image_bytes(val)
+ | if val.startswith("data:"):
+ | _, encoded = val.split(",", 1)
+ | return base64.b64decode(encoded)
+ | if val.startswith("http://") or val.startswith("https://"):
+ | resp = requests.get(val, timeout=120)
+ | resp.raise_for_status()
+ | return resp.content
+ | if os.path.exists(val) and os.path.isfile(val):
+ | with open(val, "rb") as f:
+ | return f.read()
+ | try:
+ | return base64.b64decode(val)
+ | except Exception:
+ | return val.encode("utf-8")
+ |
+ | def _looks_like_html(self, val):
+ | s = val.lstrip()[:200].lower()
+ | if s.startswith("<!doctype html") or s.startswith("<html"):
+ | return True
+ | if "plotly.newplot" in val[:5000].lower() or "plotly.react" in
val[:5000].lower():
+ | return True
+ | if "<img" in s and "base64," in s:
+ | return True
+ | return False
+ |
+ | def _html_to_image_bytes(self, html_string):
+ | match =
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+ | if match:
+ | b64 = match.group(1).replace("\\n", "").replace("\\r",
"").replace(" ", "")
+ | return base64.b64decode(b64)
+ | if "Plotly." in html_string:
+ | try:
+ | import plotly.graph_objects as go
+ | import plotly.io as pio
+ | plotly_match =
re.search(r"Plotly\\.(?:newPlot|react)\\s*\\(\\s*", html_string)
Review Comment:
The Plotly detection regex is written as a Python raw string but still
double-escapes regex metacharacters (e.g. `r"Plotly\\."`, `\\s`, `\\(`). In a
raw string those match literal backslashes, so `Plotly.newPlot(...)` /
`Plotly.react(...)` won't be detected and the Plotly-to-image conversion path
won't run.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]