Copilot commented on code in PR #5656: URL: https://github.com/apache/texera/pull/5656#discussion_r3406832362
########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/randomksampling/RandomKSamplingOpExecSpec.scala: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.randomksampling + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec + +class RandomKSamplingOpExecSpec extends AnyFlatSpec { + + // --------------------------------------------------------------------------- + // Fixture builders + // --------------------------------------------------------------------------- + + private val attr = new Attribute("v", AttributeType.INTEGER) + private val schema: Schema = Schema().add(attr) + private def tuple(v: Int): Tuple = + Tuple.builder(schema).add(attr, Integer.valueOf(v)).build() + + private def descJson(percentage: Int): String = { + val desc = new RandomKSamplingOpDesc + desc.percentage = percentage + objectMapper.writeValueAsString(desc) + } + + /** Run `count` tuples through `exec` and return how many it emitted. */ + private def emittedCount(exec: RandomKSamplingOpExec, count: Int): Int = + (1 to count).count(i => exec.processTuple(tuple(i), port = 0).nonEmpty) + + // --------------------------------------------------------------------------- + // Boundary cases — 0% and 100% + // --------------------------------------------------------------------------- + // + // The predicate is `(desc.percentage / 100.0) >= rand.nextDouble()`. + // `Random.nextDouble()` returns a value in `[0.0, 1.0)`. + // + // - At 100% (`1.0`), `1.0 >= rand.nextDouble()` always holds → accept all. + // - At 0% (`0.0`), `0.0 >= rand.nextDouble()` holds iff `nextDouble()` + // returns `0.0`. The probability of that is 1 / 2^53 ≈ 10^-16 — for + // practical purposes, reject all. + + "RandomKSamplingOpExec with percentage = 100" should "accept every tuple" in { + val exec = new RandomKSamplingOpExec(descJson(percentage = 100), idx = 0, workerCount = 7) + assert(emittedCount(exec, 1000) == 1000) + } + + "RandomKSamplingOpExec with percentage = 0" should "reject every tuple" in { + // Edge case (`rand.nextDouble() == 0.0` would let one through) is + // astronomically improbable — running 1000 draws with a fixed seed + // either always passes or always fails. The latter is what the + // implementation produces for percentage 0. + val exec = new RandomKSamplingOpExec(descJson(percentage = 0), idx = 0, workerCount = 7) + assert(emittedCount(exec, 1000) == 0) + } + + // --------------------------------------------------------------------------- + // Determinism — seed = workerCount, so the same (workerCount, + // percentage, input-count) produces the same emission count across runs. + // --------------------------------------------------------------------------- + + "RandomKSamplingOpExec with the same workerCount and percentage" should + "produce the same emission count across two fresh instances (deterministic seed)" in { + val a = new RandomKSamplingOpExec(descJson(percentage = 50), idx = 0, workerCount = 13) + val b = new RandomKSamplingOpExec(descJson(percentage = 50), idx = 1, workerCount = 13) + val countA = emittedCount(a, 200) + val countB = emittedCount(b, 200) + assert(countA == countB, s"deterministic seed should give equal counts, got $countA vs $countB") + } + + it should "yield approximately the requested fraction over a large sample" in { + // At 50% over 2000 tuples, the expected emission count is ~1000. + // For a binomial(2000, 0.5), 3σ is ~67 — allow a ±150 band so the + // case is well clear of stochastic flakiness while still catching + // gross deviations (e.g. percentage being ignored). + val exec = new RandomKSamplingOpExec(descJson(percentage = 50), idx = 0, workerCount = 1) + val n = emittedCount(exec, 2000) + assert(n >= 850 && n <= 1150, s"expected ~1000 emissions at 50%%, got $n") + } + + // --------------------------------------------------------------------------- + // Different worker seeds — different streams + // --------------------------------------------------------------------------- + + "RandomKSamplingOpExec with different workerCount values" should + "draw different sequences (the seed is workerCount)" in { + // Two executors with the same percentage but different workerCount + // should not produce IDENTICAL emission sequences over a meaningful + // sample — the seed is workerCount, so the streams diverge. + val a = new RandomKSamplingOpExec(descJson(percentage = 50), idx = 0, workerCount = 1) + val b = new RandomKSamplingOpExec(descJson(percentage = 50), idx = 0, workerCount = 2) + val emissionsA = (1 to 100).map(i => exec_emit(a, i)) + val emissionsB = (1 to 100).map(i => exec_emit(b, i)) + assert( + emissionsA != emissionsB, + "different workerCount seeds must produce different emission sequences" + ) + } + + private def exec_emit(exec: RandomKSamplingOpExec, i: Int): Boolean = + exec.processTuple(tuple(i), port = 0).nonEmpty Review Comment: Helper method name `exec_emit` is inconsistent with the prevailing camelCase style used elsewhere in the test suite (and in this file, e.g., `emittedCount`). Renaming it improves readability and keeps naming consistent. ########## common/workflow-operator/src/test/scala/org/apache/texera/amber/operator/substringSearch/SubstringSearchOpExecSpec.scala: ########## @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.texera.amber.operator.substringSearch + +import org.apache.texera.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import org.apache.texera.amber.util.JSONUtils.objectMapper +import org.scalatest.flatspec.AnyFlatSpec + +class SubstringSearchOpExecSpec extends AnyFlatSpec { + + // --------------------------------------------------------------------------- + // Fixture builders + // --------------------------------------------------------------------------- + + private val attr = new Attribute("body", AttributeType.STRING) + private val schema: Schema = Schema().add(attr) + private def tuple(text: String): Tuple = + Tuple.builder(schema).add(attr, text).build() + + private def descJson(substring: String, isCaseSensitive: Boolean = false): String = { + val desc = new SubstringSearchOpDesc + desc.attribute = "body" + desc.substring = substring + desc.isCaseSensitive = isCaseSensitive + objectMapper.writeValueAsString(desc) + } + + // --------------------------------------------------------------------------- + // Substring detection — match / no-match + // --------------------------------------------------------------------------- + + "SubstringSearchOpExec" should "yield the input tuple when the substring is present" in { + val exec = new SubstringSearchOpExec(descJson(substring = "hello")) + val t = tuple("hello world") + assert(exec.processTuple(t, port = 0).toList == List(t)) + } + + it should "yield nothing when the substring is absent" in { + val exec = new SubstringSearchOpExec(descJson(substring = "missing")) + assert(exec.processTuple(tuple("hello world"), port = 0).toList.isEmpty) + } + + it should + "match when the substring sits anywhere in the value (start / middle / end)" in { + val exec = new SubstringSearchOpExec(descJson(substring = "abc")) + assert(exec.processTuple(tuple("abc xx"), port = 0).toList.nonEmpty) + assert(exec.processTuple(tuple("xx abc xx"), port = 0).toList.nonEmpty) + assert(exec.processTuple(tuple("xx abc"), port = 0).toList.nonEmpty) + } + + // --------------------------------------------------------------------------- + // Case sensitivity + // --------------------------------------------------------------------------- + + "SubstringSearchOpExec with isCaseSensitive = true" should + "match case-sensitively (case mismatch is rejected)" in { + val exec = new SubstringSearchOpExec(descJson(substring = "HELLO", isCaseSensitive = true)) + assert(exec.processTuple(tuple("hello world"), port = 0).toList.isEmpty) + } + + it should "yield the tuple when the case matches under case-sensitive mode" in { + val exec = new SubstringSearchOpExec(descJson(substring = "HELLO", isCaseSensitive = true)) + val t = tuple("Say HELLO loudly") + assert(exec.processTuple(t, port = 0).toList == List(t)) + } + + "SubstringSearchOpExec with isCaseSensitive = false" should + "match case-insensitively via lowercased equality (production lowercases both sides)" in { + val exec = new SubstringSearchOpExec(descJson(substring = "HELLO", isCaseSensitive = false)) Review Comment: The test description says "lowercased equality", but the production implementation lowercases both sides and then uses `String.contains` (substring semantics), not equality. Updating the wording will avoid misleading future readers about the contract being pinned. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
