Re: [PR] feat(python-notebook-migration): add LLM client for notebook-to-workflow conversion [texera]

via GitHub Tue, 23 Jun 2026 15:53:17 -0700


zyratlo commented on code in PR #5260:
URL: https://github.com/apache/texera/pull/5260#discussion_r3463419759



##########
frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts:
##########
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { NotebookMigrationLLM, Notebook } from "./migration-llm";
+import { GuiConfigService } from "../../../common/service/gui-config.service";
+import { WorkflowUtilService } from 
"../workflow-graph/util/workflow-util.service";
+import { generateText } from "ai";
+import type { Mock } from "vitest";
+
+// The LLM transport and OpenAI client are mocked so the tests exercise only the
+// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping).
+vi.mock("ai", () => ({ generateText: vi.fn() }));
+vi.mock("@ai-sdk/openai", () => ({
+  createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })),
+}));
+
+const mockGenerateText = generateText as unknown as Mock;
+
+describe("NotebookMigrationLLM", () => {
+  let opIdCounter = 0;
+  let stubUtil: WorkflowUtilService;
+
+  // Build a fresh, initialized session with stubbed dependencies. The stubbed
+  // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...).
+  function makeLLM(): NotebookMigrationLLM {
+    const stubConfig = {
+      env: { pythonNotebookMigrationEnabled: true },
+    } as unknown as GuiConfigService;
+
+    stubUtil = {
+      getNewOperatorPredicate: vi.fn((operatorType: string, 
customDisplayName?: string) => ({
+        operatorID: `${operatorType}-${opIdCounter++}`,
+        operatorType,
+        operatorVersion: "test-version",
+        operatorProperties: { workers: 1, defaultEnv: true, envName: "" },
+        inputPorts: [{ portID: "input-0", disallowMultiInputs: false }],
+        outputPorts: [{ portID: "output-0" }],
+        showAdvanced: false,
+        isDisabled: false,
+        customDisplayName,
+        dynamicInputPorts: true,
+        dynamicOutputPorts: true,
+      })),
+    } as unknown as WorkflowUtilService;
+
+    const llm = new NotebookMigrationLLM(stubConfig, stubUtil);
+    llm.initialize();
+    return llm;
+  }
+
+  function codeCell(uuid: string | undefined, source: string) {
+    return { cell_type: "code", metadata: uuid === undefined ? {} : { uuid }, 
source };
+  }
+
+  // Queue the two responses convertNotebookToWorkflow consumes, in order.
+  function mockResponses(workflowResponse: string, mappingResponse: string) {
+    mockGenerateText.mockResolvedValueOnce({ text: workflowResponse 
}).mockResolvedValueOnce({ text: mappingResponse });
+  }
+
+  beforeEach(() => {
+    opIdCounter = 0;
+    mockGenerateText.mockReset();
+  });
+
+  describe("convertNotebookToWorkflow", () => {
+    it("builds operators, links, positions, and a bidirectional mapping", 
async () => {
+      const notebook: Notebook = {
+        cells: [codeCell("CELL1", "print(1)"), codeCell("CELL2", "print(2)")],
+      };
+      mockResponses(
+        JSON.stringify({
+          code: { UDF1: "code1", UDF2: "code2" },
+          edges: [["UDF1", "UDF2"]],
+          outputs: { UDF1: ["a", "b"], UDF2: ["c"] },
+        }),
+        JSON.stringify({ UDF1: ["CELL1"], UDF2: ["CELL2"] })
+      );
+
+      const { workflowJSON, workflowNotebookMapping } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      expect(workflowJSON.operators.map((op: any) => 
op.operatorID)).toEqual(["PythonUDFV2-0", "PythonUDFV2-1"]);
+      expect(workflowJSON.operators[0].operatorProperties).toMatchObject({
+        code: "code1",
+        retainInputColumns: false,
+      });
+      expect(workflowJSON.operatorPositions).toEqual({
+        "PythonUDFV2-0": { x: 140, y: 0 },
+        "PythonUDFV2-1": { x: 280, y: 0 },
+      });
+      expect(workflowJSON.links).toHaveLength(1);
+      expect(workflowJSON.links[0].source).toEqual({ operatorID: 
"PythonUDFV2-0", portID: "output-0" });
+      expect(workflowJSON.links[0].target).toEqual({ operatorID: 
"PythonUDFV2-1", portID: "input-0" });
+      expect(workflowNotebookMapping.operator_to_cell).toEqual({
+        "PythonUDFV2-0": ["CELL1"],
+        "PythonUDFV2-1": ["CELL2"],
+      });
+      expect(workflowNotebookMapping.cell_to_operator).toEqual({
+        CELL1: ["PythonUDFV2-0"],
+        CELL2: ["PythonUDFV2-1"],
+      });
+    });
+
+    // NOTE (C2): the attributeType is currently hardcoded to "binary". If C2 lands as
+    // "LLM returns per-column types", update the expected attributeType values here.
+    it("declares output columns with attributeType binary", async () => {
+      const notebook: Notebook = { cells: [codeCell("CELL1", "x = 1")] };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "code1" }, edges: [], outputs: { UDF1: 
["a", "b"] } }),
+        JSON.stringify({ UDF1: ["CELL1"] })
+      );
+
+      const { workflowJSON } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      
expect(workflowJSON.operators[0].operatorProperties.outputColumns).toEqual([
+        { attributeName: "a", attributeType: "binary" },
+        { attributeName: "b", attributeType: "binary" },
+      ]);
+    });
+
+    it("maps multiple cells onto the same UDF, and one cell onto multiple 
UDFs", async () => {
+      const notebook: Notebook = {
+        cells: [codeCell("CELL1", "a"), codeCell("CELL2", "b")],
+      };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "c1", UDF2: "c2" }, edges: [], outputs: 
{} }),
+        JSON.stringify({ UDF1: ["CELL1", "CELL2"], UDF2: ["CELL1"] })
+      );
+
+      const { workflowNotebookMapping } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      expect(workflowNotebookMapping.operator_to_cell).toEqual({
+        "PythonUDFV2-0": ["CELL1", "CELL2"],
+        "PythonUDFV2-1": ["CELL1"],
+      });
+      expect(workflowNotebookMapping.cell_to_operator).toEqual({
+        CELL1: ["PythonUDFV2-0", "PythonUDFV2-1"],
+        CELL2: ["PythonUDFV2-0"],
+      });
+    });
+
+    it("produces a link with an undefined endpoint when an edge references an 
unknown UDF id", async () => {
+      const notebook: Notebook = { cells: [codeCell("CELL1", "a")] };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "c1" }, edges: [["UDF1", "UDFX"]], 
outputs: {} }),
+        JSON.stringify({ UDF1: ["CELL1"] })
+      );
+
+      const { workflowJSON } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      // Documents current behavior: udfMappingToUUID["UDFX"] is undefined.
+      expect(workflowJSON.links[0].source.operatorID).toBe("PythonUDFV2-0");
+      expect(workflowJSON.links[0].target.operatorID).toBeUndefined();
+    });
+
+    it("handles empty code, edges, and outputs", async () => {
+      const notebook: Notebook = { cells: [] };
+      mockResponses(JSON.stringify({ code: {}, edges: [], outputs: {} }), 
JSON.stringify({}));
+
+      const { workflowJSON, workflowNotebookMapping } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      expect(workflowJSON.operators).toEqual([]);
+      expect(workflowJSON.links).toEqual([]);
+      expect(workflowNotebookMapping.operator_to_cell).toEqual({});
+      expect(workflowNotebookMapping.cell_to_operator).toEqual({});
+    });
+
+    it("emits the 'undefined' cell marker in the prompt when a code cell lacks 
metadata.uuid", async () => {
+      const notebook: Notebook = { cells: [codeCell(undefined, "print(1)")] };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "c1" }, edges: [], outputs: {} }),
+        JSON.stringify({ UDF1: ["CELL1"] })
+      );
+
+      await makeLLM().convertNotebookToWorkflow(notebook);
+
+      // The notebook string (embedded in the workflow prompt) is sent to generateText.
+      // messages is a shared, mutated array, so search every message content rather than
+      // assuming a fixed index.
+      const allPromptContent = mockGenerateText.mock.calls
+        .flatMap(call => call[0].messages.map((m: any) => m.content))
+        .join("\n");
+      expect(allPromptContent).toContain("# START undefined");
+    });

Review Comment:
   Test cases were updated



##########
frontend/src/app/workspace/service/notebook-migration/migration-llm.spec.ts:
##########
@@ -0,0 +1,232 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import { NotebookMigrationLLM, Notebook } from "./migration-llm";
+import { GuiConfigService } from "../../../common/service/gui-config.service";
+import { WorkflowUtilService } from 
"../workflow-graph/util/workflow-util.service";
+import { generateText } from "ai";
+import type { Mock } from "vitest";
+
+// The LLM transport and OpenAI client are mocked so the tests exercise only the
+// deterministic transformation (parsing, operator/edge construction, cell<->operator mapping).
+vi.mock("ai", () => ({ generateText: vi.fn() }));
+vi.mock("@ai-sdk/openai", () => ({
+  createOpenAI: vi.fn(() => ({ chat: vi.fn(() => ({})) })),
+}));
+
+const mockGenerateText = generateText as unknown as Mock;
+
+describe("NotebookMigrationLLM", () => {
+  let opIdCounter = 0;
+  let stubUtil: WorkflowUtilService;
+
+  // Build a fresh, initialized session with stubbed dependencies. The stubbed
+  // getNewOperatorPredicate hands out deterministic ids (PythonUDFV2-0, -1, ...).
+  function makeLLM(): NotebookMigrationLLM {
+    const stubConfig = {
+      env: { pythonNotebookMigrationEnabled: true },
+    } as unknown as GuiConfigService;
+
+    stubUtil = {
+      getNewOperatorPredicate: vi.fn((operatorType: string, 
customDisplayName?: string) => ({
+        operatorID: `${operatorType}-${opIdCounter++}`,
+        operatorType,
+        operatorVersion: "test-version",
+        operatorProperties: { workers: 1, defaultEnv: true, envName: "" },
+        inputPorts: [{ portID: "input-0", disallowMultiInputs: false }],
+        outputPorts: [{ portID: "output-0" }],
+        showAdvanced: false,
+        isDisabled: false,
+        customDisplayName,
+        dynamicInputPorts: true,
+        dynamicOutputPorts: true,
+      })),
+    } as unknown as WorkflowUtilService;
+
+    const llm = new NotebookMigrationLLM(stubConfig, stubUtil);
+    llm.initialize();
+    return llm;
+  }
+
+  function codeCell(uuid: string | undefined, source: string) {
+    return { cell_type: "code", metadata: uuid === undefined ? {} : { uuid }, 
source };
+  }
+
+  // Queue the two responses convertNotebookToWorkflow consumes, in order.
+  function mockResponses(workflowResponse: string, mappingResponse: string) {
+    mockGenerateText.mockResolvedValueOnce({ text: workflowResponse 
}).mockResolvedValueOnce({ text: mappingResponse });
+  }
+
+  beforeEach(() => {
+    opIdCounter = 0;
+    mockGenerateText.mockReset();
+  });
+
+  describe("convertNotebookToWorkflow", () => {
+    it("builds operators, links, positions, and a bidirectional mapping", 
async () => {
+      const notebook: Notebook = {
+        cells: [codeCell("CELL1", "print(1)"), codeCell("CELL2", "print(2)")],
+      };
+      mockResponses(
+        JSON.stringify({
+          code: { UDF1: "code1", UDF2: "code2" },
+          edges: [["UDF1", "UDF2"]],
+          outputs: { UDF1: ["a", "b"], UDF2: ["c"] },
+        }),
+        JSON.stringify({ UDF1: ["CELL1"], UDF2: ["CELL2"] })
+      );
+
+      const { workflowJSON, workflowNotebookMapping } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      expect(workflowJSON.operators.map((op: any) => 
op.operatorID)).toEqual(["PythonUDFV2-0", "PythonUDFV2-1"]);
+      expect(workflowJSON.operators[0].operatorProperties).toMatchObject({
+        code: "code1",
+        retainInputColumns: false,
+      });
+      expect(workflowJSON.operatorPositions).toEqual({
+        "PythonUDFV2-0": { x: 140, y: 0 },
+        "PythonUDFV2-1": { x: 280, y: 0 },
+      });
+      expect(workflowJSON.links).toHaveLength(1);
+      expect(workflowJSON.links[0].source).toEqual({ operatorID: 
"PythonUDFV2-0", portID: "output-0" });
+      expect(workflowJSON.links[0].target).toEqual({ operatorID: 
"PythonUDFV2-1", portID: "input-0" });
+      expect(workflowNotebookMapping.operator_to_cell).toEqual({
+        "PythonUDFV2-0": ["CELL1"],
+        "PythonUDFV2-1": ["CELL2"],
+      });
+      expect(workflowNotebookMapping.cell_to_operator).toEqual({
+        CELL1: ["PythonUDFV2-0"],
+        CELL2: ["PythonUDFV2-1"],
+      });
+    });
+
+    // NOTE (C2): the attributeType is currently hardcoded to "binary". If C2 lands as
+    // "LLM returns per-column types", update the expected attributeType values here.
+    it("declares output columns with attributeType binary", async () => {
+      const notebook: Notebook = { cells: [codeCell("CELL1", "x = 1")] };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "code1" }, edges: [], outputs: { UDF1: 
["a", "b"] } }),
+        JSON.stringify({ UDF1: ["CELL1"] })
+      );
+
+      const { workflowJSON } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      
expect(workflowJSON.operators[0].operatorProperties.outputColumns).toEqual([
+        { attributeName: "a", attributeType: "binary" },
+        { attributeName: "b", attributeType: "binary" },
+      ]);
+    });
+
+    it("maps multiple cells onto the same UDF, and one cell onto multiple 
UDFs", async () => {
+      const notebook: Notebook = {
+        cells: [codeCell("CELL1", "a"), codeCell("CELL2", "b")],
+      };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "c1", UDF2: "c2" }, edges: [], outputs: 
{} }),
+        JSON.stringify({ UDF1: ["CELL1", "CELL2"], UDF2: ["CELL1"] })
+      );
+
+      const { workflowNotebookMapping } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      expect(workflowNotebookMapping.operator_to_cell).toEqual({
+        "PythonUDFV2-0": ["CELL1", "CELL2"],
+        "PythonUDFV2-1": ["CELL1"],
+      });
+      expect(workflowNotebookMapping.cell_to_operator).toEqual({
+        CELL1: ["PythonUDFV2-0", "PythonUDFV2-1"],
+        CELL2: ["PythonUDFV2-0"],
+      });
+    });
+
+    it("produces a link with an undefined endpoint when an edge references an 
unknown UDF id", async () => {
+      const notebook: Notebook = { cells: [codeCell("CELL1", "a")] };
+      mockResponses(
+        JSON.stringify({ code: { UDF1: "c1" }, edges: [["UDF1", "UDFX"]], 
outputs: {} }),
+        JSON.stringify({ UDF1: ["CELL1"] })
+      );
+
+      const { workflowJSON } = JSON.parse(await 
makeLLM().convertNotebookToWorkflow(notebook));
+
+      // Documents current behavior: udfMappingToUUID["UDFX"] is undefined.
+      expect(workflowJSON.links[0].source.operatorID).toBe("PythonUDFV2-0");
+      expect(workflowJSON.links[0].target.operatorID).toBeUndefined();
+    });

Review Comment:
   Test cases were updated



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(python-notebook-migration): add LLM client for notebook-to-workflow conversion [texera]

Reply via email to