Re: [PR] feat(huggingFace): add image task family via ImageTaskCodegen [texera]

via GitHub Tue, 16 Jun 2026 22:38:03 -0700


Copilot commented on code in PR #5320:
URL: https://github.com/apache/texera/pull/5320#discussion_r3425811888



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
        |            detail = "<empty response>"
        |        return f"{title} [status={status_code}] response={detail}"
        |
+       |    # 
──────────────────────────────────────────────────────────────────
+       |    # Image-task helpers (used by ImageTaskCodegen and image-related
+       |    # branches of _call_provider).
+       |    # 
──────────────────────────────────────────────────────────────────
+       |
+       |    def _read_image_input(self):
+       |        image_input = str(self.IMAGE_INPUT or "").strip()
+       |        if image_input.startswith("data:"):
+       |            _, encoded = image_input.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if image_input.startswith("http://") or 
image_input.startswith("https://"):
+       |            resp = requests.get(image_input, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if not os.path.exists(image_input):
+       |            raise FileNotFoundError(f"Image file not found at path: 
{image_input}")
+       |        if not os.path.isfile(image_input):
+       |            raise ValueError(f"Image input path is not a file: 
{image_input}")
+       |        with open(image_input, "rb") as image_file:
+       |            return image_file.read()
+       |
+       |    def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+       |        from io import BytesIO
+       |        from PIL import Image as PILImage
+       |        if len(image_bytes) <= max_bytes:
+       |            return image_bytes
+       |        try:
+       |            img = PILImage.open(BytesIO(image_bytes))
+       |            img = img.convert("RGB")
+       |            max_dim = 512
+       |            quality = 75
+       |            while max_dim >= 160:
+       |                scale = min(1, max_dim / max(img.width, img.height))
+       |                w = max(1, round(img.width * scale))
+       |                h = max(1, round(img.height * scale))
+       |                resized = img.resize((w, h), PILImage.LANCZOS)
+       |                q = quality
+       |                while q >= 35:
+       |                    buf = BytesIO()
+       |                    resized.save(buf, format="JPEG", quality=q)
+       |                    if buf.tell() <= max_bytes:
+       |                        return buf.getvalue()
+       |                    q -= 10
+       |                max_dim = int(max_dim * 0.75)
+       |            buf = BytesIO()
+       |            resized.save(buf, format="JPEG", quality=35)
+       |            return buf.getvalue()
+       |        except Exception:
+       |            return image_bytes
+       |
+       |    def _image_input_as_base64(self, image_bytes):
+       |        return base64.b64encode(image_bytes).decode("utf-8")
+       |
+       |    def _read_binary_value(self, value):
+       |        if value is None or (isinstance(value, float) and 
pd.isna(value)):
+       |            return None
+       |        if isinstance(value, bytes):
+       |            return value
+       |        val = str(value).strip()
+       |        if not val:
+       |            return None
+       |        if self._looks_like_html(val):
+       |            return self._html_to_image_bytes(val)
+       |        if val.startswith("data:"):
+       |            _, encoded = val.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if val.startswith("http://") or val.startswith("https://"):
+       |            resp = requests.get(val, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if os.path.exists(val) and os.path.isfile(val):
+       |            with open(val, "rb") as f:
+       |                return f.read()
+       |        try:
+       |            return base64.b64decode(val)
+       |        except Exception:
+       |            return val.encode("utf-8")
+       |
+       |    def _looks_like_html(self, val):
+       |        s = val.lstrip()[:200].lower()
+       |        if s.startswith("<!doctype html") or s.startswith("<html"):
+       |            return True
+       |        if "plotly.newplot" in val[:5000].lower() or "plotly.react" in 
val[:5000].lower():
+       |            return True
+       |        if "<img" in s and "base64," in s:
+       |            return True
+       |        return False
+       |
+       |    def _html_to_image_bytes(self, html_string):
+       |        match = 
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+       |        if match:
+       |            b64 = match.group(1).replace("\\n", "").replace("\\r", 
"").replace(" ", "")
+       |            return base64.b64decode(b64)

Review Comment:
   `_html_to_image_bytes` uses a raw-string regex and then strips 
`"\\n"`/`"\\r"`, which matches literal backslash sequences rather than real 
newlines. This will fail to extract base64 data URLs when the HTML contains 
actual line breaks/whitespace in the base64 payload.



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -266,11 +461,12 @@ object PythonCodegenBase {
        |        # --- resolve all available inference providers for this model 
(tried in order) ---
        |        providers = self._resolve_providers(token)
        |
-       |        # --- validate prompt column exists ---
-       |        assert prompt_col in table.columns, (
-       |            f"Prompt column '{prompt_col}' not found in input table. "
-       |            f"Available columns: {list(table.columns)}"
-       |        )
+       |        # --- validate prompt column exists (skipped for image-only 
tasks) ---
+       |        if task not in image_only_tasks:
+       |            assert prompt_col in table.columns, (
+       |                f"Prompt column '{prompt_col}' not found in input 
table. "
+       |                f"Available columns: {list(table.columns)}"
+       |            )

Review Comment:
   `process_table` now has a fallback when `prompt_col` is missing for 
image+prompt tasks, but the earlier validation still asserts `prompt_col in 
table.columns` for those tasks (`task not in image_only_tasks`). That makes the 
fallback unreachable (unless Python runs with `-O`), and image+prompt tasks 
will fail fast even though the per-row logic can handle a missing prompt column.



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
        |            detail = "<empty response>"
        |        return f"{title} [status={status_code}] response={detail}"
        |
+       |    # 
──────────────────────────────────────────────────────────────────
+       |    # Image-task helpers (used by ImageTaskCodegen and image-related
+       |    # branches of _call_provider).
+       |    # 
──────────────────────────────────────────────────────────────────
+       |
+       |    def _read_image_input(self):
+       |        image_input = str(self.IMAGE_INPUT or "").strip()
+       |        if image_input.startswith("data:"):
+       |            _, encoded = image_input.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if image_input.startswith("http://") or 
image_input.startswith("https://"):
+       |            resp = requests.get(image_input, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if not os.path.exists(image_input):
+       |            raise FileNotFoundError(f"Image file not found at path: 
{image_input}")
+       |        if not os.path.isfile(image_input):
+       |            raise ValueError(f"Image input path is not a file: 
{image_input}")
+       |        with open(image_input, "rb") as image_file:
+       |            return image_file.read()
+       |
+       |    def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+       |        from io import BytesIO
+       |        from PIL import Image as PILImage
+       |        if len(image_bytes) <= max_bytes:
+       |            return image_bytes
+       |        try:
+       |            img = PILImage.open(BytesIO(image_bytes))
+       |            img = img.convert("RGB")
+       |            max_dim = 512
+       |            quality = 75
+       |            while max_dim >= 160:
+       |                scale = min(1, max_dim / max(img.width, img.height))
+       |                w = max(1, round(img.width * scale))
+       |                h = max(1, round(img.height * scale))
+       |                resized = img.resize((w, h), PILImage.LANCZOS)
+       |                q = quality
+       |                while q >= 35:
+       |                    buf = BytesIO()
+       |                    resized.save(buf, format="JPEG", quality=q)
+       |                    if buf.tell() <= max_bytes:
+       |                        return buf.getvalue()
+       |                    q -= 10
+       |                max_dim = int(max_dim * 0.75)
+       |            buf = BytesIO()
+       |            resized.save(buf, format="JPEG", quality=35)
+       |            return buf.getvalue()
+       |        except Exception:
+       |            return image_bytes
+       |
+       |    def _image_input_as_base64(self, image_bytes):
+       |        return base64.b64encode(image_bytes).decode("utf-8")
+       |
+       |    def _read_binary_value(self, value):
+       |        if value is None or (isinstance(value, float) and 
pd.isna(value)):
+       |            return None

Review Comment:
   `_read_binary_value` only treats `float('nan')` as missing via 
`(isinstance(value, float) and pd.isna(value))`. Pandas missing sentinels like 
`pd.NA` / `NaT` (and other scalar NA types) will fall through and be converted 
to bytes (`b'<NA>'`), which is incorrect for image columns.



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -55,11 +55,15 @@ object PythonCodegenBase {
     val systemPrompt = ctx.systemPrompt
     val maxNewTokens = ctx.safeMaxTokens
     val temperature = ctx.safeTemp
+    val imageInput = ctx.imageInput
+    val inputImageColumn = ctx.inputImageColumn
     pyb"""import os
        |import re
        |import json
+       |import base64
        |import requests
        |import pandas as pd
+       |from urllib.parse import urlparse
        |from pytexera import *

Review Comment:
   `from urllib.parse import urlparse` is imported in the generated Python but 
never used (the code re-imports `urlparse as _urlparse` in the few places it 
needs it). This can be dropped to keep the generated script minimal and avoid 
misleading readers about which import is actually used.



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
        |            detail = "<empty response>"
        |        return f"{title} [status={status_code}] response={detail}"
        |
+       |    # 
──────────────────────────────────────────────────────────────────
+       |    # Image-task helpers (used by ImageTaskCodegen and image-related
+       |    # branches of _call_provider).
+       |    # 
──────────────────────────────────────────────────────────────────
+       |
+       |    def _read_image_input(self):
+       |        image_input = str(self.IMAGE_INPUT or "").strip()
+       |        if image_input.startswith("data:"):
+       |            _, encoded = image_input.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if image_input.startswith("http://") or 
image_input.startswith("https://"):
+       |            resp = requests.get(image_input, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if not os.path.exists(image_input):
+       |            raise FileNotFoundError(f"Image file not found at path: 
{image_input}")
+       |        if not os.path.isfile(image_input):
+       |            raise ValueError(f"Image input path is not a file: 
{image_input}")
+       |        with open(image_input, "rb") as image_file:
+       |            return image_file.read()
+       |
+       |    def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+       |        from io import BytesIO
+       |        from PIL import Image as PILImage
+       |        if len(image_bytes) <= max_bytes:
+       |            return image_bytes
+       |        try:
+       |            img = PILImage.open(BytesIO(image_bytes))
+       |            img = img.convert("RGB")
+       |            max_dim = 512
+       |            quality = 75
+       |            while max_dim >= 160:
+       |                scale = min(1, max_dim / max(img.width, img.height))
+       |                w = max(1, round(img.width * scale))
+       |                h = max(1, round(img.height * scale))
+       |                resized = img.resize((w, h), PILImage.LANCZOS)
+       |                q = quality
+       |                while q >= 35:
+       |                    buf = BytesIO()
+       |                    resized.save(buf, format="JPEG", quality=q)
+       |                    if buf.tell() <= max_bytes:
+       |                        return buf.getvalue()
+       |                    q -= 10
+       |                max_dim = int(max_dim * 0.75)
+       |            buf = BytesIO()
+       |            resized.save(buf, format="JPEG", quality=35)
+       |            return buf.getvalue()
+       |        except Exception:
+       |            return image_bytes
+       |
+       |    def _image_input_as_base64(self, image_bytes):
+       |        return base64.b64encode(image_bytes).decode("utf-8")
+       |
+       |    def _read_binary_value(self, value):
+       |        if value is None or (isinstance(value, float) and 
pd.isna(value)):
+       |            return None
+       |        if isinstance(value, bytes):
+       |            return value
+       |        val = str(value).strip()
+       |        if not val:
+       |            return None
+       |        if self._looks_like_html(val):
+       |            return self._html_to_image_bytes(val)
+       |        if val.startswith("data:"):
+       |            _, encoded = val.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if val.startswith("http://") or val.startswith("https://"):
+       |            resp = requests.get(val, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if os.path.exists(val) and os.path.isfile(val):
+       |            with open(val, "rb") as f:
+       |                return f.read()
+       |        try:
+       |            return base64.b64decode(val)
+       |        except Exception:
+       |            return val.encode("utf-8")
+       |
+       |    def _looks_like_html(self, val):
+       |        s = val.lstrip()[:200].lower()
+       |        if s.startswith("<!doctype html") or s.startswith("<html"):
+       |            return True
+       |        if "plotly.newplot" in val[:5000].lower() or "plotly.react" in 
val[:5000].lower():
+       |            return True
+       |        if "<img" in s and "base64," in s:
+       |            return True
+       |        return False
+       |
+       |    def _html_to_image_bytes(self, html_string):
+       |        match = 
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+       |        if match:
+       |            b64 = match.group(1).replace("\\n", "").replace("\\r", 
"").replace(" ", "")
+       |            return base64.b64decode(b64)
+       |        if "Plotly." in html_string:
+       |            try:
+       |                import plotly.graph_objects as go
+       |                import plotly.io as pio
+       |                plotly_match = 
re.search(r"Plotly\\.(?:newPlot|react)\\s*\\(\\s*", html_string)
+       |                if plotly_match:
+       |                    pos = plotly_match.end()
+       |                    if pos < len(html_string) and html_string[pos] in 
('"', "'"):
+       |                        q = html_string[pos]
+       |                        pos += 1
+       |                        while pos < len(html_string) and 
html_string[pos] != q:
+       |                            if html_string[pos] == "\\\\":
+       |                                pos += 1
+       |                            pos += 1
+       |                        pos += 1
+       |                    while pos < len(html_string) and html_string[pos] 
in " ,\\n\\r\\t":
+       |                        pos += 1
+       |                    data_json, pos = 
self._extract_json_arg(html_string, pos)
+       |                    while pos < len(html_string) and html_string[pos] 
in " ,\\n\\r\\t":
+       |                        pos += 1

Review Comment:
   These whitespace-skipping loops check membership in the string 
`" ,\\n\\r\\t"`, which looks for literal backslash characters rather than 
newline/tab characters. That prevents `_html_to_image_bytes` from skipping real 
whitespace/newlines around Plotly JSON arguments, making `_extract_json_arg` 
much more likely to fail on real Plotly HTML.



##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -361,6 +605,186 @@ object PythonCodegenBase {
        |            detail = "<empty response>"
        |        return f"{title} [status={status_code}] response={detail}"
        |
+       |    # 
──────────────────────────────────────────────────────────────────
+       |    # Image-task helpers (used by ImageTaskCodegen and image-related
+       |    # branches of _call_provider).
+       |    # 
──────────────────────────────────────────────────────────────────
+       |
+       |    def _read_image_input(self):
+       |        image_input = str(self.IMAGE_INPUT or "").strip()
+       |        if image_input.startswith("data:"):
+       |            _, encoded = image_input.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if image_input.startswith("http://") or 
image_input.startswith("https://"):
+       |            resp = requests.get(image_input, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if not os.path.exists(image_input):
+       |            raise FileNotFoundError(f"Image file not found at path: 
{image_input}")
+       |        if not os.path.isfile(image_input):
+       |            raise ValueError(f"Image input path is not a file: 
{image_input}")
+       |        with open(image_input, "rb") as image_file:
+       |            return image_file.read()
+       |
+       |    def _compress_image_bytes(self, image_bytes, max_bytes=33000):
+       |        from io import BytesIO
+       |        from PIL import Image as PILImage
+       |        if len(image_bytes) <= max_bytes:
+       |            return image_bytes
+       |        try:
+       |            img = PILImage.open(BytesIO(image_bytes))
+       |            img = img.convert("RGB")
+       |            max_dim = 512
+       |            quality = 75
+       |            while max_dim >= 160:
+       |                scale = min(1, max_dim / max(img.width, img.height))
+       |                w = max(1, round(img.width * scale))
+       |                h = max(1, round(img.height * scale))
+       |                resized = img.resize((w, h), PILImage.LANCZOS)
+       |                q = quality
+       |                while q >= 35:
+       |                    buf = BytesIO()
+       |                    resized.save(buf, format="JPEG", quality=q)
+       |                    if buf.tell() <= max_bytes:
+       |                        return buf.getvalue()
+       |                    q -= 10
+       |                max_dim = int(max_dim * 0.75)
+       |            buf = BytesIO()
+       |            resized.save(buf, format="JPEG", quality=35)
+       |            return buf.getvalue()
+       |        except Exception:
+       |            return image_bytes
+       |
+       |    def _image_input_as_base64(self, image_bytes):
+       |        return base64.b64encode(image_bytes).decode("utf-8")
+       |
+       |    def _read_binary_value(self, value):
+       |        if value is None or (isinstance(value, float) and 
pd.isna(value)):
+       |            return None
+       |        if isinstance(value, bytes):
+       |            return value
+       |        val = str(value).strip()
+       |        if not val:
+       |            return None
+       |        if self._looks_like_html(val):
+       |            return self._html_to_image_bytes(val)
+       |        if val.startswith("data:"):
+       |            _, encoded = val.split(",", 1)
+       |            return base64.b64decode(encoded)
+       |        if val.startswith("http://") or val.startswith("https://"):
+       |            resp = requests.get(val, timeout=120)
+       |            resp.raise_for_status()
+       |            return resp.content
+       |        if os.path.exists(val) and os.path.isfile(val):
+       |            with open(val, "rb") as f:
+       |                return f.read()
+       |        try:
+       |            return base64.b64decode(val)
+       |        except Exception:
+       |            return val.encode("utf-8")
+       |
+       |    def _looks_like_html(self, val):
+       |        s = val.lstrip()[:200].lower()
+       |        if s.startswith("<!doctype html") or s.startswith("<html"):
+       |            return True
+       |        if "plotly.newplot" in val[:5000].lower() or "plotly.react" in 
val[:5000].lower():
+       |            return True
+       |        if "<img" in s and "base64," in s:
+       |            return True
+       |        return False
+       |
+       |    def _html_to_image_bytes(self, html_string):
+       |        match = 
re.search(r"data:image/[^;]+;base64,([A-Za-z0-9+/\\n\\r =]+)", html_string)
+       |        if match:
+       |            b64 = match.group(1).replace("\\n", "").replace("\\r", 
"").replace(" ", "")
+       |            return base64.b64decode(b64)
+       |        if "Plotly." in html_string:
+       |            try:
+       |                import plotly.graph_objects as go
+       |                import plotly.io as pio
+       |                plotly_match = 
re.search(r"Plotly\\.(?:newPlot|react)\\s*\\(\\s*", html_string)

Review Comment:
   The Plotly detection regex is written as a Python raw string but still 
double-escapes regex metacharacters (e.g. `r"Plotly\\."`, `\\s`, `\\(`). In a 
raw string those match literal backslashes, so `Plotly.newPlot(...)` / 
`Plotly.react(...)` won't be detected and the Plotly-to-image conversion path 
won't run.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(huggingFace): add image task family via ImageTaskCodegen [texera]

Reply via email to