This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-5925-ad908b74857b86f4bc6087b61fcbf9a54f738edb in repository https://gitbox.apache.org/repos/asf/texera.git
commit e17de6fd622daa340594dd268b6d74632cfb08c8 Author: Xinyuan Lin <[email protected]> AuthorDate: Fri Jun 26 01:35:47 2026 -0700 test(workflow-operator): add unit test coverage for Sklearn Naive Bayes descriptors (#5925) ### What changes were proposed in this PR? Pin behavior of the four previously-untested Sklearn Naive Bayes classifier descriptors in `common/workflow-operator`. No production-code changes. | Spec | Source class | Tests | | --- | --- | --- | | `SklearnBernoulliNaiveBayesOpDescSpec` | `SklearnBernoulliNaiveBayesOpDesc` | 5 | | `SklearnComplementNaiveBayesOpDescSpec` | `SklearnComplementNaiveBayesOpDesc` | 5 | | `SklearnGaussianNaiveBayesOpDescSpec` | `SklearnGaussianNaiveBayesOpDesc` | 5 | | `SklearnMultinomialNaiveBayesOpDescSpec` | `SklearnMultinomialNaiveBayesOpDesc` | 5 | **Behavior pinned** | Surface | Contract | | --- | --- | | `operatorInfo` | exact model name + `Sklearn <name> Operator` description; Sklearn group; training/testing input ports + one blocking output | | field defaults | `countVectorizer`/`tfidfTransformer` `false`; `target`/`text` `null` | | `getOutputSchemas` | `model_name` (STRING) + `model` (BINARY) keyed by the declared output port | | `generatePythonCode` | imports and instantiates the matching sklearn estimator (e.g. `BernoulliNB`) via `make_pipeline` | | Round-trip | config fields preserved through the polymorphic `LogicalOp` base, with the correct `operatorType` discriminator | ### Any related issues, documentation, discussions? Part of the ongoing `workflow-operator` unit-test coverage effort. ### How was this PR tested? - `sbt "WorkflowOperator/testOnly *SklearnBernoulliNaiveBayesOpDescSpec *SklearnComplementNaiveBayesOpDescSpec *SklearnGaussianNaiveBayesOpDescSpec *SklearnMultinomialNaiveBayesOpDescSpec"` — 20 tests, all green - `sbt "WorkflowOperator/Test/scalafmtCheck"` and `sbt "WorkflowOperator/scalafixAll --check"` — clean - CI to confirm ### Was this PR authored or co-authored using generative AI tooling? 
Generated-by: Claude Code (Opus 4.8 [1M context]) --- .../SklearnBernoulliNaiveBayesOpDescSpec.scala | 81 ++++++++++++++++++++++ .../SklearnComplementNaiveBayesOpDescSpec.scala | 81 ++++++++++++++++++++++ .../SklearnGaussianNaiveBayesOpDescSpec.scala | 81 ++++++++++++++++++++++ .../SklearnMultinomialNaiveBayesOpDescSpec.scala | 81 ++++++++++++++++++++++ 4 files changed, 324 insertions(+) diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnBernoulliNaiveBayesOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnBernoulliNaiveBayesOpDescSpec.scala new file mode 100644 index 0000000000..b34d9b2e1b --- /dev/null +++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnBernoulliNaiveBayesOpDescSpec.scala @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+ * Pins the observable behavior of [[SklearnBernoulliNaiveBayesOpDesc]]: operator metadata,
+ * initial field values, output schema, generated Python code, and a JSON round-trip
+ * read back through the [[LogicalOp]] base type.
+ */
+class SklearnBernoulliNaiveBayesOpDescSpec extends AnyFlatSpec with Matchers {
+
+  "SklearnBernoulliNaiveBayesOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val descriptor = new SklearnBernoulliNaiveBayesOpDesc
+    val opInfo = descriptor.operatorInfo
+
+    opInfo.userFriendlyName shouldBe "Bernoulli Naive Bayes"
+    opInfo.operatorDescription shouldBe "Sklearn Bernoulli Naive Bayes Operator"
+    opInfo.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+
+    val inputNames = opInfo.inputPorts.map(port => port.displayName)
+    inputNames shouldBe List("training", "testing")
+
+    // Exactly one output port is expected, and it must be flagged as blocking.
+    val outputs = opInfo.outputPorts
+    outputs should have length 1
+    outputs.head.blocking shouldBe true
+  }
+
+  "SklearnBernoulliNaiveBayesOpDesc" should "default its config fields" in {
+    val fresh = new SklearnBernoulliNaiveBayesOpDesc
+
+    // The two Boolean flags start out false.
+    fresh.countVectorizer shouldBe false
+    fresh.tfidfTransformer shouldBe false
+
+    // `target` and `text` start out null.
+    fresh.target shouldBe null
+    fresh.text shouldBe null
+  }
+
+  "SklearnBernoulliNaiveBayesOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val descriptor = new SklearnBernoulliNaiveBayesOpDesc
+
+    // Look the schema up under the id of the first declared output port.
+    val schemasByPort = descriptor.getOutputSchemas(Map.empty)
+    val declaredPortId = descriptor.operatorInfo.outputPorts.head.id
+    val outputSchema = schemasByPort(declaredPortId)
+
+    outputSchema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    outputSchema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnBernoulliNaiveBayesOpDesc.generatePythonCode" should
+    "import and instantiate the BernoulliNB estimator" in {
+    val descriptor = new SklearnBernoulliNaiveBayesOpDesc
+    descriptor.target = "y"
+    val pythonCode = descriptor.generatePythonCode()
+
+    // Every fragment below must appear somewhere in the generated script.
+    Seq(
+      "from sklearn.naive_bayes import BernoulliNB",
+      "make_pipeline",
+      "Bernoulli Naive Bayes"
+    ).foreach(fragment => pythonCode should include(fragment))
+  }
+
+  "SklearnBernoulliNaiveBayesOpDesc" should
+    "round-trip its config fields through the polymorphic base" in {
+    val original = new SklearnBernoulliNaiveBayesOpDesc
+    original.target = "label"
+    original.countVectorizer = true
+
+    val serialized = objectMapper.writeValueAsString(original)
+    serialized should include("\"operatorType\":\"SklearnBernoulliNaiveBayes\"")
+
+    // Read back as the base LogicalOp type; the concrete class must be recovered.
+    val restored = objectMapper.readValue(serialized, classOf[LogicalOp])
+    restored shouldBe a[SklearnBernoulliNaiveBayesOpDesc]
+
+    // The cast cannot fail here: the runtime type was asserted on the previous line.
+    val typed = restored.asInstanceOf[SklearnBernoulliNaiveBayesOpDesc]
+    typed.target shouldBe "label"
+    typed.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnComplementNaiveBayesOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnComplementNaiveBayesOpDescSpec.scala
new file mode 100644
index 0000000000..0d50eb4577
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnComplementNaiveBayesOpDescSpec.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+ * Pins the observable behavior of [[SklearnComplementNaiveBayesOpDesc]]: operator metadata,
+ * initial field values, output schema, generated Python code, and a JSON round-trip
+ * read back through the [[LogicalOp]] base type.
+ */
+class SklearnComplementNaiveBayesOpDescSpec extends AnyFlatSpec with Matchers {
+
+  "SklearnComplementNaiveBayesOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val descriptor = new SklearnComplementNaiveBayesOpDesc
+    val opInfo = descriptor.operatorInfo
+
+    opInfo.userFriendlyName shouldBe "Complement Naive Bayes"
+    opInfo.operatorDescription shouldBe "Sklearn Complement Naive Bayes Operator"
+    opInfo.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+
+    val inputNames = opInfo.inputPorts.map(port => port.displayName)
+    inputNames shouldBe List("training", "testing")
+
+    // Exactly one output port is expected, and it must be flagged as blocking.
+    val outputs = opInfo.outputPorts
+    outputs should have length 1
+    outputs.head.blocking shouldBe true
+  }
+
+  "SklearnComplementNaiveBayesOpDesc" should "default its config fields" in {
+    val fresh = new SklearnComplementNaiveBayesOpDesc
+
+    // The two Boolean flags start out false.
+    fresh.countVectorizer shouldBe false
+    fresh.tfidfTransformer shouldBe false
+
+    // `target` and `text` start out null.
+    fresh.target shouldBe null
+    fresh.text shouldBe null
+  }
+
+  "SklearnComplementNaiveBayesOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val descriptor = new SklearnComplementNaiveBayesOpDesc
+
+    // Look the schema up under the id of the first declared output port.
+    val schemasByPort = descriptor.getOutputSchemas(Map.empty)
+    val declaredPortId = descriptor.operatorInfo.outputPorts.head.id
+    val outputSchema = schemasByPort(declaredPortId)
+
+    outputSchema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    outputSchema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnComplementNaiveBayesOpDesc.generatePythonCode" should
+    "import and instantiate the ComplementNB estimator" in {
+    val descriptor = new SklearnComplementNaiveBayesOpDesc
+    descriptor.target = "y"
+    val pythonCode = descriptor.generatePythonCode()
+
+    // Every fragment below must appear somewhere in the generated script.
+    Seq(
+      "from sklearn.naive_bayes import ComplementNB",
+      "make_pipeline",
+      "Complement Naive Bayes"
+    ).foreach(fragment => pythonCode should include(fragment))
+  }
+
+  "SklearnComplementNaiveBayesOpDesc" should
+    "round-trip its config fields through the polymorphic base" in {
+    val original = new SklearnComplementNaiveBayesOpDesc
+    original.target = "label"
+    original.countVectorizer = true
+
+    val serialized = objectMapper.writeValueAsString(original)
+    serialized should include("\"operatorType\":\"SklearnComplementNaiveBayes\"")
+
+    // Read back as the base LogicalOp type; the concrete class must be recovered.
+    val restored = objectMapper.readValue(serialized, classOf[LogicalOp])
+    restored shouldBe a[SklearnComplementNaiveBayesOpDesc]
+
+    // The cast cannot fail here: the runtime type was asserted on the previous line.
+    val typed = restored.asInstanceOf[SklearnComplementNaiveBayesOpDesc]
+    typed.target shouldBe "label"
+    typed.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnGaussianNaiveBayesOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnGaussianNaiveBayesOpDescSpec.scala
new file mode 100644
index 0000000000..9c25894dc1
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnGaussianNaiveBayesOpDescSpec.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+ * Pins the observable behavior of [[SklearnGaussianNaiveBayesOpDesc]]: operator metadata,
+ * initial field values, output schema, generated Python code, and a JSON round-trip
+ * read back through the [[LogicalOp]] base type.
+ */
+class SklearnGaussianNaiveBayesOpDescSpec extends AnyFlatSpec with Matchers {
+
+  "SklearnGaussianNaiveBayesOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val descriptor = new SklearnGaussianNaiveBayesOpDesc
+    val opInfo = descriptor.operatorInfo
+
+    opInfo.userFriendlyName shouldBe "Gaussian Naive Bayes"
+    opInfo.operatorDescription shouldBe "Sklearn Gaussian Naive Bayes Operator"
+    opInfo.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+
+    val inputNames = opInfo.inputPorts.map(port => port.displayName)
+    inputNames shouldBe List("training", "testing")
+
+    // Exactly one output port is expected, and it must be flagged as blocking.
+    val outputs = opInfo.outputPorts
+    outputs should have length 1
+    outputs.head.blocking shouldBe true
+  }
+
+  "SklearnGaussianNaiveBayesOpDesc" should "default its config fields" in {
+    val fresh = new SklearnGaussianNaiveBayesOpDesc
+
+    // The two Boolean flags start out false.
+    fresh.countVectorizer shouldBe false
+    fresh.tfidfTransformer shouldBe false
+
+    // `target` and `text` start out null.
+    fresh.target shouldBe null
+    fresh.text shouldBe null
+  }
+
+  "SklearnGaussianNaiveBayesOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val descriptor = new SklearnGaussianNaiveBayesOpDesc
+
+    // Look the schema up under the id of the first declared output port.
+    val schemasByPort = descriptor.getOutputSchemas(Map.empty)
+    val declaredPortId = descriptor.operatorInfo.outputPorts.head.id
+    val outputSchema = schemasByPort(declaredPortId)
+
+    outputSchema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    outputSchema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnGaussianNaiveBayesOpDesc.generatePythonCode" should
+    "import and instantiate the GaussianNB estimator" in {
+    val descriptor = new SklearnGaussianNaiveBayesOpDesc
+    descriptor.target = "y"
+    val pythonCode = descriptor.generatePythonCode()
+
+    // Every fragment below must appear somewhere in the generated script.
+    Seq(
+      "from sklearn.naive_bayes import GaussianNB",
+      "make_pipeline",
+      "Gaussian Naive Bayes"
+    ).foreach(fragment => pythonCode should include(fragment))
+  }
+
+  "SklearnGaussianNaiveBayesOpDesc" should
+    "round-trip its config fields through the polymorphic base" in {
+    val original = new SklearnGaussianNaiveBayesOpDesc
+    original.target = "label"
+    original.countVectorizer = true
+
+    val serialized = objectMapper.writeValueAsString(original)
+    serialized should include("\"operatorType\":\"SklearnGaussianNaiveBayes\"")
+
+    // Read back as the base LogicalOp type; the concrete class must be recovered.
+    val restored = objectMapper.readValue(serialized, classOf[LogicalOp])
+    restored shouldBe a[SklearnGaussianNaiveBayesOpDesc]
+
+    // The cast cannot fail here: the runtime type was asserted on the previous line.
+    val typed = restored.asInstanceOf[SklearnGaussianNaiveBayesOpDesc]
+    typed.target shouldBe "label"
+    typed.countVectorizer shouldBe true
+  }
+}
diff --git a/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnMultinomialNaiveBayesOpDescSpec.scala b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnMultinomialNaiveBayesOpDescSpec.scala
new file mode 100644
index 0000000000..3a6b4debfd
--- /dev/null
+++ b/common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/sklearn/SklearnMultinomialNaiveBayesOpDescSpec.scala
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.texera.amber.operator.sklearn
+
+import org.apache.texera.amber.core.tuple.AttributeType
+import org.apache.texera.amber.operator.LogicalOp
+import org.apache.texera.amber.operator.metadata.OperatorGroupConstants
+import org.apache.texera.amber.util.JSONUtils.objectMapper
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.should.Matchers
+
+/**
+ * Pins the observable behavior of [[SklearnMultinomialNaiveBayesOpDesc]]: operator metadata,
+ * initial field values, output schema, generated Python code, and a JSON round-trip
+ * read back through the [[LogicalOp]] base type.
+ */
+class SklearnMultinomialNaiveBayesOpDescSpec extends AnyFlatSpec with Matchers {
+
+  "SklearnMultinomialNaiveBayesOpDesc.operatorInfo" should
+    "advertise the model name, Sklearn group, and the training/testing port shape" in {
+    val descriptor = new SklearnMultinomialNaiveBayesOpDesc
+    val opInfo = descriptor.operatorInfo
+
+    opInfo.userFriendlyName shouldBe "Multinomial Naive Bayes"
+    opInfo.operatorDescription shouldBe "Sklearn Multinomial Naive Bayes Operator"
+    opInfo.operatorGroupName shouldBe OperatorGroupConstants.SKLEARN_GROUP
+
+    val inputNames = opInfo.inputPorts.map(port => port.displayName)
+    inputNames shouldBe List("training", "testing")
+
+    // Exactly one output port is expected, and it must be flagged as blocking.
+    val outputs = opInfo.outputPorts
+    outputs should have length 1
+    outputs.head.blocking shouldBe true
+  }
+
+  "SklearnMultinomialNaiveBayesOpDesc" should "default its config fields" in {
+    val fresh = new SklearnMultinomialNaiveBayesOpDesc
+
+    // The two Boolean flags start out false.
+    fresh.countVectorizer shouldBe false
+    fresh.tfidfTransformer shouldBe false
+
+    // `target` and `text` start out null.
+    fresh.target shouldBe null
+    fresh.text shouldBe null
+  }
+
+  "SklearnMultinomialNaiveBayesOpDesc.getOutputSchemas" should
+    "emit the model_name/model schema keyed by the declared output port" in {
+    val descriptor = new SklearnMultinomialNaiveBayesOpDesc
+
+    // Look the schema up under the id of the first declared output port.
+    val schemasByPort = descriptor.getOutputSchemas(Map.empty)
+    val declaredPortId = descriptor.operatorInfo.outputPorts.head.id
+    val outputSchema = schemasByPort(declaredPortId)
+
+    outputSchema.getAttribute("model_name").getType shouldBe AttributeType.STRING
+    outputSchema.getAttribute("model").getType shouldBe AttributeType.BINARY
+  }
+
+  "SklearnMultinomialNaiveBayesOpDesc.generatePythonCode" should
+    "import and instantiate the MultinomialNB estimator" in {
+    val descriptor = new SklearnMultinomialNaiveBayesOpDesc
+    descriptor.target = "y"
+    val pythonCode = descriptor.generatePythonCode()
+
+    // Every fragment below must appear somewhere in the generated script.
+    Seq(
+      "from sklearn.naive_bayes import MultinomialNB",
+      "make_pipeline",
+      "Multinomial Naive Bayes"
+    ).foreach(fragment => pythonCode should include(fragment))
+  }
+
+  "SklearnMultinomialNaiveBayesOpDesc" should
+    "round-trip its config fields through the polymorphic base" in {
+    val original = new SklearnMultinomialNaiveBayesOpDesc
+    original.target = "label"
+    original.countVectorizer = true
+
+    val serialized = objectMapper.writeValueAsString(original)
+    serialized should include("\"operatorType\":\"SklearnMultinomialNaiveBayes\"")
+
+    // Read back as the base LogicalOp type; the concrete class must be recovered.
+    val restored = objectMapper.readValue(serialized, classOf[LogicalOp])
+    restored shouldBe a[SklearnMultinomialNaiveBayesOpDesc]
+
+    // The cast cannot fail here: the runtime type was asserted on the previous line.
+    val typed = restored.asInstanceOf[SklearnMultinomialNaiveBayesOpDesc]
+    typed.target shouldBe "label"
+    typed.countVectorizer shouldBe true
+  }
+}
