mengw15 commented on code in PR #5260: URL: https://github.com/apache/texera/pull/5260#discussion_r3431331250
########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: any[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor(private config: GuiConfigService) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. + */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. + */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); + + const workflow = await this.sendPrompt(`${WORKFLOW_PROMPT}\n${notebookString}`); + const mapping = await this.sendPrompt(MAPPING_PROMPT); + + // Remove ```json blocks and parse + const udfLLMResponse = this.parseJsonResponse(workflow, "workflow"); + + const workflowJSON: WorkflowJSON = { + operators: [], + operatorPositions: {}, + links: [], + commentBoxes: [], + settings: { + dataTransferBatchSize: 400, + }, + }; + + const udfMappingToUUID: Record<string, string> = {}; + + Object.entries(udfLLMResponse.code).forEach(([udfId, udfCode], i) => { + const udfUUID = `PythonUDFV2-operator-${uuidv4()}`; + udfMappingToUUID[udfId] = udfUUID; + + let udfOutputColumns: { attributeName: string; attributeType: string }[] = []; + if (udfLLMResponse.outputs && udfLLMResponse.outputs[udfId]) { + udfOutputColumns = udfLLMResponse.outputs[udfId].map((attr: string) => ({ + attributeName: attr, + attributeType: "binary", + })); + } + + // Add UDF to operators + workflowJSON.operators.push({ + operatorID: udfUUID, + operatorType: "PythonUDFV2", + operatorVersion: "3d69fdcedbb409b47162c4b55406c77e54abe416", Review Comment: This `operatorVersion: "3d69fdcedbb409b47162c4b55406c77e54abe416"` is a hardcoded git hash. Two concerns: 1. The hash isn't a real commit in this repo — `git rev-parse 3d69fdce...` returns nothing, and `git grep` for the hash finds it only here. So whatever commit this points to either is dead, on a fork, or never existed on master. 2. Other places that need an `operatorVersion` read it dynamically from the operator schema (e.g., `workflow-util.service.ts:159` does `operatorSchema.operatorVersion`), which the backend computes via `LogicalOp.getOperatorVersion`. Hardcoding here means the generated workflow's version field will mismatch the live PythonUDFV2 version once master moves, causing `workflow-version.service.ts:226`'s `if (operator1.operatorVersion != operator2.operatorVersion)` check to false-positive every time. Should this be fetched from a `getOperatorSchema("PythonUDFV2")` call (matching how other code paths get it), or treated as a placeholder the backend overwrites on save? ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: any[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor(private config: GuiConfigService) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. + */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. + */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { Review Comment: The PR description says "testing would require mocking a significant amount of logic that will be introduced in later PRs" — this is true for the LLM call itself, but most of `convertNotebookToWorkflow` is deterministic transformation that doesn't touch the LLM: - `parseJsonResponse` (markdown fence stripping) — pure string in, JSON out. - The loop over `udfLLMResponse.code` building operators — pure object construction from a fake LLM response. - `udfMappingToUUID` lookup for edges — pure mapping. - `udfToCell` / `cellToUdf` bidirectional mapping construction — pure inversion logic. A spec that feeds hand-written "LLM response" JSON into the method and asserts on the output `workflowJSON` + `workflowNotebookMapping` would catch several real risks the current code has: - Missing `cell.metadata.uuid` → `String(undefined)` → all such cells share the marker `"undefined"`. - LLM hallucinated UDF id in `edges` → `udfMappingToUUID[unknownId]` is `undefined` → broken link in the output. - Empty `edges` / empty `code` / empty `outputs` arrays. - Markdown fence with leading text before/after (`Here's the JSON: ```json ... ```\nThanks!`). Would also future-proof refactoring. Not a blocker; just noting the test gap is wider than the "LLM mock needed" framing suggests. ########## frontend/src/app/workspace/service/notebook-migration/migration-llm.ts: ########## @@ -0,0 +1,322 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { Injectable } from "@angular/core"; +import { GuiConfigService } from "../../../common/service/gui-config.service"; +import { createOpenAI } from "@ai-sdk/openai"; +import { generateText, type ModelMessage } from "ai"; +import { AppSettings } from "../../../common/app-setting"; +import { v4 as uuidv4 } from "uuid"; +import { + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + WORKFLOW_PROMPT, + MAPPING_PROMPT, +} from "./migration-prompts"; + +interface Cell { + cell_type: string; + metadata: { [key: string]: any }; + source: string; +} + +export interface Notebook { + cells: Cell[]; +} + +interface WorkflowJSON { + operators: any[]; + operatorPositions: Record<string, { x: number; y: number }>; + links: any[]; + commentBoxes: any[]; + settings: { + dataTransferBatchSize: number; + }; +} + +interface CombinedMapping { + operator_to_cell: Record<string, string[]>; + cell_to_operator: Record<string, string[]>; +} + +@Injectable() +export class NotebookMigrationLLM { + private model: any; + private messages: ModelMessage[] = []; + private initialized = false; + + private static readonly DOCUMENTATION: string[] = [ + TEXERA_OVERVIEW, + TUPLE_DOCUMENTATION, + TABLE_DOCUMENTATION, + OPERATOR_DOCUMENTATION, + EXAMPLE_OF_GOOD_CONVERSION, + VISUALIZER_DOCUMENTATION, + UDF_INPUT_PORT_DOCUMENTATION, + EXAMPLE_OF_MULTIPLE_UDF_CONVERSION, + ]; + + constructor(private config: GuiConfigService) {} + + private get enabled(): boolean { + return this.config.env.pythonNotebookMigrationEnabled; + } + + private assertEnabled(): void { + if (!this.enabled) { + throw new Error("Notebook migration feature is disabled"); + } + } + + private parseJsonResponse(raw: string, context: string): any { + // Trim first, then strip optional markdown code fences (```json ... ``` or ``` ... ```) + const cleaned = raw + .trim() + .replace(/^```[a-zA-Z]*\s*/, "") + .replace(/\s*```$/, "") + .trim(); + try { + return JSON.parse(cleaned); + } catch (err) { + throw new Error(`Failed to parse LLM ${context} response as JSON: ${(err as Error).message}`); + } + } + + /** + * Initialize a new LLM session with Texera documentation + */ + public initialize(modelType: string = "gpt-5-mini", apiKey: string = "dummy"): void { + this.assertEnabled(); + this.model = createOpenAI({ + baseURL: new URL(`${AppSettings.getApiEndpoint()}`, document.baseURI).toString(), + // apiKey is required by the library for creating the OpenAI compatible client; + // For security reason, we store the apiKey at the backend, thus the value is dummy here. + apiKey: apiKey, + }).chat(modelType); + + this.messages = [ + ...NotebookMigrationLLM.DOCUMENTATION.map( + (doc): ModelMessage => ({ + role: "system", + content: doc, + }) + ), + ]; + + this.initialized = true; + } + + /** + * Verify the connection to the LLM using the given API key + */ + public async verifyConnection(): Promise<boolean> { + if (!this.enabled) return false; + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + try { + await generateText({ + model: this.model, + messages: [ + { + role: "user", + content: "ping", + }, + ], + maxOutputTokens: 10, + }); + + return true; + } catch (err) { + console.error("API key verification failed:", err); + return false; + } + } + + /** + * Send a prompt and receive a response. + * All prior documentation and conversation is preserved. + */ + private async sendPrompt(prompt: string): Promise<string> { + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + this.messages.push({ + role: "user", + content: prompt, + }); + + const result = await generateText({ + model: this.model, + messages: this.messages, + }); + + this.messages.push({ + role: "assistant", + content: result.text, + }); + + return result.text; + } + + /** + * Send a Jupyter Notebook to be converted into a workflow and mapping. + */ + public async convertNotebookToWorkflow(notebook: Notebook): Promise<string> { + this.assertEnabled(); + if (!this.initialized) { + throw new Error("LLM session not initialized"); + } + + const codeCells = notebook.cells.filter(cell => cell.cell_type === "code"); + const notebookString = codeCells + .map(cell => { + const uuid = String(cell.metadata.uuid); + return `# START ${uuid}\n${cell.source}\n# END ${uuid}`; + }) + .join("\n\n"); + + const workflow = await this.sendPrompt(`${WORKFLOW_PROMPT}\n${notebookString}`); + const mapping = await this.sendPrompt(MAPPING_PROMPT); + + // Remove ```json blocks and parse + const udfLLMResponse = this.parseJsonResponse(workflow, "workflow"); + + const workflowJSON: WorkflowJSON = { + operators: [], + operatorPositions: {}, + links: [], + commentBoxes: [], + settings: { + dataTransferBatchSize: 400, + }, + }; + + const udfMappingToUUID: Record<string, string> = {}; + + Object.entries(udfLLMResponse.code).forEach(([udfId, udfCode], i) => { + const udfUUID = `PythonUDFV2-operator-${uuidv4()}`; + udfMappingToUUID[udfId] = udfUUID; + + let udfOutputColumns: { attributeName: string; attributeType: string }[] = []; + if (udfLLMResponse.outputs && udfLLMResponse.outputs[udfId]) { + udfOutputColumns = udfLLMResponse.outputs[udfId].map((attr: string) => ({ + attributeName: attr, + attributeType: "binary", + })); + } Review Comment: Hardcoding every UDF output column to `attributeType: "binary"` will likely produce semantically wrong workflows for the common case. Texera's `BINARY` is opaque blob storage — downstream operators that read these columns will treat them as bytes, not as tabular values. For most Python UDFs (returning strings, ints, dataframes), this default is incorrect. The LLM is already returning a list of attribute names in `udfLLMResponse.outputs[udfId]`. A few alternatives, from easiest to best: 1. Change the default to `"STRING"` — a more permissive fallback than BINARY for unknown types. 2. Have the LLM return `{name, type}` pairs (the prompt would need a small extension). 3. Leave the type unset and let the operator-schema validation infer or surface it as "user must set". The current default forces users to manually edit every generated UDF's output schema — defeating part of the migration tool's value. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
