Repository: spark Updated Branches: refs/heads/master 6de6a9772 -> 672032d0a
[SPARK-13020][SQL][TEST] fix random generator for map type when we generate map, we first randomly pick a length, then create a seq of key value pair with the expected length, and finally call `toMap`. However, `toMap` will remove all duplicated keys, which makes the actual map size much less than we expected. This PR fixes this problem by put keys in a set first, to guarantee we have enough keys to build a map with expected length. Author: Wenchen Fan <[email protected]> Closes #10930 from cloud-fan/random-generator. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/672032d0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/672032d0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/672032d0 Branch: refs/heads/master Commit: 672032d0ab1e43bc5a25cecdb1b96dfd35c39778 Parents: 6de6a97 Author: Wenchen Fan <[email protected]> Authored: Wed Feb 3 08:26:35 2016 +0800 Committer: Yin Huai <[email protected]> Committed: Wed Feb 3 08:26:35 2016 +0800 ---------------------------------------------------------------------- .../apache/spark/sql/RandomDataGenerator.scala | 18 ++++++++++++++---- .../spark/sql/RandomDataGeneratorSuite.scala | 11 +++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/672032d0/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala index 55efea8..7c173cb 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGenerator.scala @@ -47,9 +47,9 @@ object RandomDataGenerator { */ private val PROBABILITY_OF_NULL: Float = 0.1f - private val MAX_STR_LEN: Int = 1024 - private val MAX_ARR_SIZE: Int = 128 - private val MAX_MAP_SIZE: Int = 128 + final val MAX_STR_LEN: Int = 1024 + final val MAX_ARR_SIZE: Int = 128 + final val MAX_MAP_SIZE: Int = 128 /** * Helper function for constructing a biased random number generator which returns "interesting" @@ -208,7 +208,17 @@ object RandomDataGenerator { forType(valueType, nullable = valueContainsNull, rand) ) yield { () => { - Seq.fill(rand.nextInt(MAX_MAP_SIZE))((keyGenerator(), valueGenerator())).toMap + val length = rand.nextInt(MAX_MAP_SIZE) + val keys = scala.collection.mutable.HashSet(Seq.fill(length)(keyGenerator()): _*) + // In case the number of different keys is not enough, set a max iteration to avoid + // infinite loop. + var count = 0 + while (keys.size < length && count < MAX_MAP_SIZE) { + keys += keyGenerator() + count += 1 + } + val values = Seq.fill(keys.size)(valueGenerator()) + keys.zip(values).toMap } } } http://git-wip-us.apache.org/repos/asf/spark/blob/672032d0/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala index b8ccdf7..9fba792 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/RandomDataGeneratorSuite.scala @@ -95,4 +95,15 @@ class RandomDataGeneratorSuite extends SparkFunSuite { } } + test("check size of generated map") { + val mapType = MapType(IntegerType, IntegerType) + for (seed <- 1 to 1000) { + val generator = RandomDataGenerator.forType( + mapType, nullable = false, rand = new Random(seed)).get + val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]]) + val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE + val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements) + assert(deviation.toDouble / expectedTotalElements < 2e-1) + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
