Repository: mahout Updated Branches: refs/heads/master 260753fdb -> 5197ac9e8
MAHOUT-1638: H2O bindings fail at drmParallelizeWithRowLabels(...) closes apache/mahout#99 and MAHOUT-1493-h2o closes apache/mahout#72 Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/5197ac9e Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/5197ac9e Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/5197ac9e Branch: refs/heads/master Commit: 5197ac9e8393763dd4ed58f5f1e9f167e75d3078 Parents: 260753f Author: Andrew Palumbo <[email protected]> Authored: Fri Apr 3 12:43:04 2015 -0400 Committer: Andrew Palumbo <[email protected]> Committed: Fri Apr 3 12:43:04 2015 -0400 ---------------------------------------------------------------------- CHANGELOG | 2 ++ .../org/apache/mahout/h2obindings/H2OHdfs.java | 2 +- .../apache/mahout/h2obindings/H2OHelper.java | 37 +++++++++++++++++++- .../mahout/h2obindings/ops/MapBlockHelper.scala | 2 +- .../classifier/naivebayes/NBH2OTestSuite.scala | 26 ++++++++++++++ .../stats/ClassifierStatsH2OTestSuite.scala | 26 ++++++++++++++ 6 files changed, 92 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index fcf2bc3..ba96f26 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.10.0 - unreleased + MAHOUT-1638: H2O bindings fail at drmParallelizeWithRowLabels(...) (Anand Avati via apalumbo) + MAHOUT-1667: Hadoop 1 and 2 profile in POM (sslavic) MAHOUT-1564: Naive Bayes Classifier for New Text Documents (apalumbo) http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java ---------------------------------------------------------------------- diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java index f21ebe0..56b3745 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHdfs.java @@ -167,7 +167,7 @@ public class H2OHdfs { } if (reader.getKeyClass() == Text.class) { - labels = frame.anyVec().makeZero(); + labels = H2OHelper.makeEmptyStrVec(frame.anyVec()); labelwriter = labels.open(); } http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java ---------------------------------------------------------------------- diff --git a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java index 2ede8cf..859e5b4 100644 --- a/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java +++ b/h2o/src/main/java/org/apache/mahout/h2obindings/H2OHelper.java @@ -36,6 +36,11 @@ import java.util.HashMap; import org.apache.mahout.h2obindings.drm.H2ODrm; +// for makeEmptyStrVec +import water.Key; +import water.DKV; +import water.fvec.CStrChunk; + /** * Collection of helper methods for H2O backend. */ @@ -323,7 +328,7 @@ public class H2OHelper { Map<String,Integer> map = m.getRowLabelBindings(); if (map != null) { // label vector must be similarly partitioned like the Frame - labels = frame.anyVec().makeZero(); + labels = makeEmptyStrVec(frame.anyVec()); Vec.Writer writer = labels.open(); Map<Integer,String> rmap = reverseMap(map); for (int r = 0; r < m.rowSize(); r++) { @@ -389,6 +394,36 @@ public class H2OHelper { return new Frame(vecs); } + + /** + * The following two methods: vecChunkLen and makeEmptyStrVec + * are h2o-0.1.25 specific. + */ + public static Vec makeEmptyStrVec(final Vec template) { + final int nChunks = template.nChunks(); + Key<Vec> key = template.group().addVec(); + final Vec emptystr = new Vec(key, template._espc, null, Vec.T_NUM); + + new MRTask() { + @Override protected void setupLocal() { + for (int i = 0; i < nChunks; i++) { + Key k = emptystr.chunkKey(i); + int chklen = vecChunkLen(template, i); + int stridx[] = new int[chklen]; + byte b[] = new byte[1]; b[0] = 0; + for (int j = 0; j < chklen; j++) stridx[j] = -1; + if (k.home()) DKV.put(k, new CStrChunk(1, b, chklen, stridx), _fs); + } + if (emptystr._key.home()) DKV.put(emptystr._key, emptystr, _fs); + } + }.doAllNodes(); + return emptystr; + } + + public static int vecChunkLen(Vec template, int chunk) { + return (int) (template._espc[chunk + 1] - template._espc[chunk]); + } + /** * Create an empty (zero-filled) H2O DRM. * http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala ---------------------------------------------------------------------- diff --git a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala index 0384826..f69a844 100644 --- a/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala +++ b/h2o/src/main/scala/org/apache/mahout/h2obindings/ops/MapBlockHelper.scala @@ -37,7 +37,7 @@ object MapBlockHelper { case `s` => { val arr = new Array[String](in.rowSize) val vstr = new ValueString - for (i <- 0 to in.rowSize) { + for (i <- 0 to (in.rowSize - 1)) { arr(i) = labels.atStr(vstr, i + startlong).toString } arr http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/h2o/src/test/scala/org/apache/mahout/classifier/naivebayes/NBH2OTestSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/classifier/naivebayes/NBH2OTestSuite.scala b/h2o/src/test/scala/org/apache/mahout/classifier/naivebayes/NBH2OTestSuite.scala new file mode 100644 index 0000000..8759e3e --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/classifier/naivebayes/NBH2OTestSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.classifier.naivebayes + +import org.apache.mahout.math._ +import org.apache.mahout.math.scalabindings.RLikeOps._ +import org.apache.mahout.math.scalabindings._ +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.apache.mahout.test.MahoutSuite +import org.scalatest.FunSuite + +class NBH2OTestSuite extends FunSuite with MahoutSuite with DistributedH2OSuite with NBTestBase http://git-wip-us.apache.org/repos/asf/mahout/blob/5197ac9e/h2o/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsH2OTestSuite.scala ---------------------------------------------------------------------- diff --git a/h2o/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsH2OTestSuite.scala b/h2o/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsH2OTestSuite.scala new file mode 100644 index 0000000..909a8fa --- /dev/null +++ b/h2o/src/test/scala/org/apache/mahout/classifier/stats/ClassifierStatsH2OTestSuite.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.classifier.stats + +import org.apache.mahout.h2obindings.test.DistributedH2OSuite +import org.apache.mahout.test.MahoutSuite +import org.scalatest.FunSuite + +class ClassifierStatsH2OTestSuite extends FunSuite with MahoutSuite with DistributedH2OSuite with ClassifierStatsTestBase + +
