Copilot commented on code in PR #5574:
URL: https://github.com/apache/texera/pull/5574#discussion_r3444858228
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -463,12 +491,29 @@ object PythonCodegenBase {
| # --- resolve all available inference providers for this model
(tried in order) ---
| providers = self._resolve_providers(token)
|
- | # --- validate prompt column exists (required for non-image
tasks) ---
- | if task not in image_tasks:
+ | # --- validate prompt column exists (skipped for image tasks
and binary-only audio tasks) ---
+ | if task not in image_tasks and task not in audio_only_tasks:
| assert prompt_col in table.columns, (
| f"Prompt column '{prompt_col}' not found in input
table. "
| f"Available columns: {list(table.columns)}"
| )
+ | if task == "zero-shot-classification":
+ | assert self.CANDIDATE_LABELS and
self.CANDIDATE_LABELS.strip(), (
+ | "Candidate Labels are required for
zero-shot-classification. "
+ | "Provide a comma-separated list of labels."
+ | )
Review Comment:
For zero-shot-classification, this validation only checks that
`CANDIDATE_LABELS` is a non-empty string. Inputs like `",,,"` or whitespace
will pass validation but produce an empty `labels` list at runtime, which will
fail downstream in a less clear way. Validate that at least one non-empty label
exists after splitting/trimming.
##########
amber/src/main/scala/org/apache/texera/web/auth/UserAuthenticator.scala:
##########
@@ -35,7 +35,7 @@ import java.util.Optional
object UserAuthenticator extends Authenticator[JwtContext, SessionUser] with
LazyLogging {
override def authenticate(context: JwtContext): Optional[SessionUser] = {
try {
- Optional.of(JwtParser.claimsToSessionUser(context.getJwtClaims))
+ JwtParser.claimsToOptionalSessionUser(context.getJwtClaims)
Review Comment:
The scaladoc still references `JwtParser.claimsToSessionUser`, but the
implementation now delegates to `claimsToOptionalSessionUser`. Update the
comment so it reflects the new behavior (and the fact that missing/malformed
claims can return `Optional.empty()`).
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -831,12 +960,12 @@ object PythonCodegenBase {
| if not content_type or content_type ==
"application/octet-stream":
| from urllib.parse import urlparse as _urlparse
| ext = os.path.splitext(_urlparse(url).path.lower())[1]
- | mime_map = {".png": "image/png", ".jpg": "image/jpeg",
".jpeg": "image/jpeg", ".gif": "image/gif", ".webp": "image/webp", ".svg":
"image/svg+xml", ".mp4": "video/mp4", ".webm": "video/webm"}
+ | mime_map = {".png": "image/png", ".jpg": "image/jpeg",
".jpeg": "image/jpeg", ".gif": "image/gif", ".webp": "image/webp", ".svg":
"image/svg+xml", ".mp3": "audio/mpeg", ".mpeg": "audio/mpeg", ".wav":
"audio/wav", ".flac": "audio/flac", ".ogg": "audio/ogg", ".oga": "audio/ogg",
".m4a": "audio/mp4", ".mp4": "video/mp4", ".webm": "video/webm"}
Review Comment:
The `.m4a` MIME guess is inconsistent: `_get_audio_content_type` maps `.m4a`
to `audio/m4a`, but `_url_to_data_url` maps `.m4a` to `audio/mp4`. This leads
to different MIME strings depending on which path produces the data URL. Pick
one mapping (ideally consistent with the rest of the codebase) and use it in
both places.
##########
common/workflow-operator/src/main/scala/org/apache/texera/amber/operator/huggingFace/codegen/PythonCodegenBase.scala:
##########
@@ -821,6 +928,28 @@ object PythonCodegenBase {
| return text[start_pos:pos], pos
| return None, start_pos
|
+ | def _get_audio_content_type(self):
+ | audio_input = str(self.AUDIO_INPUT or "").strip().lower()
+ | if audio_input.startswith("data:"):
+ | header = audio_input.split(",", 1)[0]
+ | if ";" in header:
+ | return header[5:header.index(";")]
+ | return header[5:]
+ | extension_map = {
+ | ".mp3": "audio/mpeg",
+ | ".mpeg": "audio/mpeg",
+ | ".wav": "audio/wav",
+ | ".flac": "audio/flac",
+ | ".ogg": "audio/ogg",
+ | ".oga": "audio/ogg",
+ | ".webm": "audio/webm",
+ | ".opus": "audio/webm;codecs=opus",
+ | ".amr": "audio/amr",
+ | ".m4a": "audio/m4a",
+ | }
+ | _, ext = os.path.splitext(audio_input)
+ | return extension_map.get(ext, "audio/mpeg")
Review Comment:
`_get_audio_content_type` uses `os.path.splitext(audio_input)` directly.
When `AUDIO_INPUT` is a URL with query params (common for signed URLs), this
yields an extension like `.mp3?X-Amz-...` and the lookup falls back to
`audio/mpeg` even for non-mpeg audio. Parse the URL path before splitting the
extension (similar to `_url_to_data_url`).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]