spark git commit: [SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large
Repository: spark Updated Branches: refs/heads/master cd81fc9e8 -> 021dafc6a [SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large jira: https://issues.apache.org/jira/browse/SPARK-12026 The issue is valid as features.toArray.view.zipWithIndex.slice(startCol, endCol) becomes slower as startCol gets larger. I tested on local and the change can improve the performance and the running time was stable. Author: Yuhao Yang. Closes #10146 from hhbyyh/chiSq. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/021dafc6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/021dafc6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/021dafc6 Branch: refs/heads/master Commit: 021dafc6a05a31dc22c9f9110dedb47a1f913087 Parents: cd81fc9 Author: Yuhao Yang Authored: Wed Jan 13 17:43:27 2016 -0800 Committer: Joseph K. Bradley Committed: Wed Jan 13 17:43:27 2016 -0800 -- .../scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/021dafc6/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index f22f2df..4a3fb064 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -109,7 +109,9 @@ private[stat] object ChiSqTest extends Logging { } i += 1 distinctLabels += label - features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) => + val brzFeatures = features.toBreeze + (startCol until endCol).map { col => + val feature = brzFeatures(col) allDistinctFeatures(col) += feature (col, feature, label) } @@ -122,7 +124,7 @@ private[stat] object ChiSqTest extends Logging { 
pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap } val numLabels = labels.size - pairCounts.keys.groupBy(_._1).map { case (col, keys) => + pairCounts.keys.groupBy(_._1).foreach { case (col, keys) => val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap val numRows = features.size val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels)) - To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org
spark git commit: [SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large
Repository: spark Updated Branches: refs/heads/branch-1.6 26f13faa9 -> a490787da [SPARK-12026][MLLIB] ChiSqTest gets slower and slower over time when number of features is large jira: https://issues.apache.org/jira/browse/SPARK-12026 The issue is valid as features.toArray.view.zipWithIndex.slice(startCol, endCol) becomes slower as startCol gets larger. I tested on local and the change can improve the performance and the running time was stable. Author: Yuhao Yang. Closes #10146 from hhbyyh/chiSq. (cherry picked from commit 021dafc6a05a31dc22c9f9110dedb47a1f913087) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a490787d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a490787d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a490787d Branch: refs/heads/branch-1.6 Commit: a490787daa5ec11a5e30bc0df31f81edd54ccc6a Parents: 26f13fa Author: Yuhao Yang Authored: Wed Jan 13 17:43:27 2016 -0800 Committer: Joseph K. 
Bradley Committed: Wed Jan 13 17:43:38 2016 -0800 -- .../scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a490787d/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala index 23c8d7c..1c583a4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/ChiSqTest.scala @@ -109,7 +109,9 @@ private[stat] object ChiSqTest extends Logging { } i += 1 distinctLabels += label - features.toArray.view.zipWithIndex.slice(startCol, endCol).map { case (feature, col) => + val brzFeatures = features.toBreeze + (startCol until endCol).map { col => + val feature = brzFeatures(col) allDistinctFeatures(col) += feature (col, feature, label) } @@ -122,7 +124,7 @@ private[stat] object ChiSqTest extends Logging { pairCounts.keys.filter(_._1 == startCol).map(_._3).toArray.distinct.zipWithIndex.toMap } val numLabels = labels.size - pairCounts.keys.groupBy(_._1).map { case (col, keys) => + pairCounts.keys.groupBy(_._1).foreach { case (col, keys) => val features = keys.map(_._2).toArray.distinct.zipWithIndex.toMap val numRows = features.size val contingency = new BDM(numRows, numLabels, new Array[Double](numRows * numLabels)) - To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org