[ https://issues.apache.org/jira/browse/FLINK-1901?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14696369#comment-14696369 ]
ASF GitHub Bot commented on FLINK-1901: --------------------------------------- Github user ChengXiangLi commented on a diff in the pull request: https://github.com/apache/flink/pull/949#discussion_r37046195 --- Diff: flink-tests/src/test/scala/org/apache/flink/api/scala/operators/SampleITCase.scala --- @@ -0,0 +1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.flink.api.scala.operators + +import java.util.{List => JavaList, Random} + +import org.apache.flink.api.scala._ +import org.apache.flink.api.scala.util.CollectionDataSets +import org.apache.flink.test.util.MultipleProgramsTestBase.TestExecutionMode +import org.apache.flink.test.util.{MultipleProgramsTestBase, TestBaseUtils} +import org.junit.Assert._ +import org.junit.runner.RunWith +import org.junit.runners.Parameterized +import org.junit.{Before, After, Test} + +import scala.collection.JavaConverters._ + +@RunWith(classOf[Parameterized]) +class SampleITCase(mode: TestExecutionMode) extends MultipleProgramsTestBase(mode) { + private val RNG: Random = new Random + + private var result: JavaList[String] = null; + + @Before + def initiate { + ExecutionEnvironment.getExecutionEnvironment.setParallelism(5) + } + + @After + def after() = { + TestBaseUtils.containsResultAsText(result, getSourceStrings) + } + + @Test + @throws(classOf[Exception]) + def testSamplerWithFractionWithoutReplacement { + verifySamplerWithFractionWithoutReplacement(0d) + verifySamplerWithFractionWithoutReplacement(0.2d) + verifySamplerWithFractionWithoutReplacement(1.0d) + } + + @Test + @throws(classOf[Exception]) + def testSamplerWithFractionWithReplacement { + verifySamplerWithFractionWithReplacement(0d) + verifySamplerWithFractionWithReplacement(0.2d) + verifySamplerWithFractionWithReplacement(1.0d) + verifySamplerWithFractionWithReplacement(2.0d) + } + + @Test + @throws(classOf[Exception]) + def testSamplerWithSizeWithoutReplacement { + verifySamplerWithFixedSizeWithoutReplacement(0) + verifySamplerWithFixedSizeWithoutReplacement(2) + verifySamplerWithFixedSizeWithoutReplacement(21) + } + + @Test + @throws(classOf[Exception]) + def testSamplerWithSizeWithReplacement { + verifySamplerWithFixedSizeWithReplacement(0) + verifySamplerWithFixedSizeWithReplacement(2) + verifySamplerWithFixedSizeWithReplacement(21) + } + + @throws(classOf[Exception]) + private def 
verifySamplerWithFractionWithoutReplacement(fraction: Double) { + verifySamplerWithFractionWithoutReplacement(fraction, RNG.nextLong) + } + + @throws(classOf[Exception]) + private def verifySamplerWithFractionWithoutReplacement(fraction: Double, seed: Long) { + verifySamplerWithFraction(false, fraction, seed) + } + + @throws(classOf[Exception]) + private def verifySamplerWithFractionWithReplacement(fraction: Double) { + verifySamplerWithFractionWithReplacement(fraction, RNG.nextLong) + } + + @throws(classOf[Exception]) + private def verifySamplerWithFractionWithReplacement(fraction: Double, seed: Long) { + verifySamplerWithFraction(true, fraction, seed) + } + + @throws(classOf[Exception]) + private def verifySamplerWithFraction(withReplacement: Boolean, fraction: Double, seed: Long) { + val ds = getSourceDataSet() + val sampled = ds.sample(withReplacement, fraction, seed) + result = sampled.collect.asJava --- End diff -- The validity of the sample result is verified in the after() method for each test. As the source data is very small, verifying the fraction does not make much sense, so I didn't verify the fraction's validity here; it is, however, verified at the Sampler level in RandomSamplerTest. > Create sample operator for Dataset > ---------------------------------- > > Key: FLINK-1901 > URL: https://issues.apache.org/jira/browse/FLINK-1901 > Project: Flink > Issue Type: Improvement > Components: Core > Reporter: Theodore Vasiloudis > Assignee: Chengxiang Li > > In order to be able to implement Stochastic Gradient Descent and a number of > other machine learning algorithms we need to have a way to take a random > sample from a Dataset. > We need to be able to sample with or without replacement from the Dataset, > choose the relative or exact size of the sample, set a seed for > reproducibility, and support sampling within iterations. -- This message was sent by Atlassian JIRA (v6.3.4#6332)