MAHOUT-1745: Purge deprecated ConcatVectorsJob from codebase
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/f2151ee5 Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/f2151ee5 Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/f2151ee5 Branch: refs/heads/mahout-0.10.x Commit: f2151ee5eaebd24ba7c75012c2aea84671cb0443 Parents: 5d0c603 Author: Andrew Palumbo <[email protected]> Authored: Fri Jul 31 19:05:59 2015 -0400 Committer: Andrew Palumbo <[email protected]> Committed: Fri Jul 31 19:05:59 2015 -0400 ---------------------------------------------------------------------- CHANGELOG | 2 + .../mahout/utils/ConcatenateVectorsJob.java | 118 ------------------- .../mahout/utils/ConcatenateVectorsReducer.java | 102 ---------------- .../mahout/utils/TestConcatenateVectorsJob.java | 99 ---------------- src/conf/driver.classes.default.props | 1 - 5 files changed, 2 insertions(+), 320 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/mahout/blob/f2151ee5/CHANGELOG ---------------------------------------------------------------------- diff --git a/CHANGELOG b/CHANGELOG index 8457242..61ef3a6 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ Mahout Change Log Release 0.10.2 - unreleased + MAHOUT-1745: Purge deprecated ConcatVectorsJob from codebase (apalumbo) + MAHOUT-1757: Small fix in spca formula (smarthi) MAHOUT-1756: Missing +=: and *=: operators on vectors (smarthi) http://git-wip-us.apache.org/repos/asf/mahout/blob/f2151ee5/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java deleted file mode 100644 index 33d09a0..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsJob.java +++ 
/dev/null @@ -1,118 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.mahout.utils; - -import java.io.IOException; - -import com.google.common.base.Preconditions; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.util.ToolRunner; -import org.apache.mahout.common.AbstractJob; -import org.apache.mahout.common.commandline.DefaultOptionCreator; -import org.apache.mahout.math.VectorWritable; - -/* - * Map-reduce job to combine two matrices A and B to (a1,a2,...aN,b1,b2,...bN) - * Technically works on Vector files, so will also concatenate two vectors. - * If either input is a NamedVector, the output has the name: A.name has precedence over B.name. - * Concatenation or per-member combinations given a function object. - * - * Uses clever hack which requires different matrices to have a different number of columns. 
- * Courtesy of Jake Mannix, https://issues.apache.org/jira/browse/MAHOUT-884 - * If vectors are same length, this will not concatenate them in the right order - * - * @deprecated as of 0.10.0 - * - * TODO: generalize to multiple matrices, should the teeming masses so desire - */ -@Deprecated -public class ConcatenateVectorsJob extends AbstractJob { - - static final String MATRIXA_DIMS = "mahout.concatenatevectors.matrixA_dims"; - static final String MATRIXB_DIMS = "mahout.concatenatevectors.matrixB_dims"; - - private ConcatenateVectorsJob() {} - - public static void main(String[] args) throws Exception { - ToolRunner.run(new ConcatenateVectorsJob(), args); - } - - @Override - public int run(String[] args) throws Exception { - addOption("matrixA", "ma", "A (left) matrix directory", true); - addOption("matrixB", "mb", "B (right) matrix directory", true); - addOutputOption(); - DefaultOptionCreator.overwriteOption().create(); - - if (parseArguments(args) == null) { - return -1; - } - - Path pathA = new Path(getOption("matrixA")); - Path pathB = new Path(getOption("matrixB")); - Path pathOutput = getOutputPath(); - - Configuration configuration = getConf(); - FileSystem fs = FileSystem.get(configuration); - - Class<? extends Writable> keyClassA = getKeyClass(pathA, fs); - Class<? 
extends Writable> keyClassB = getKeyClass(pathB, fs); - - Preconditions.checkArgument(keyClassA.equals(keyClassB), "All SequenceFiles must use same key class"); - - int dimA = getDimensions(pathA); - int dimB = getDimensions(pathB); - - String nameA = getOption("matrixA"); - String nameB = getOption("matrixB"); - - Job concatenate = prepareJob( - new Path(nameA + "," + nameB), pathOutput, Mapper.class, keyClassA, VectorWritable.class, - ConcatenateVectorsReducer.class, keyClassA, VectorWritable.class); - - configuration = concatenate.getConfiguration(); - configuration.set(MATRIXA_DIMS, Integer.toString(dimA)); - configuration.set(MATRIXB_DIMS, Integer.toString(dimB)); - // TODO: add reducer as combiner - need a system that can exercise combiners - - boolean succeeded = concatenate.waitForCompletion(true); - if (!succeeded) { - return -1; - } - return 0; - } - - private Class<? extends Writable> getKeyClass(Path path, FileSystem fs) throws IOException { - // this works for both part* and a directory/ with part*. 
- Path pathPattern = new Path(path, "part*"); - FileStatus[] paths = fs.globStatus(pathPattern); - Preconditions.checkArgument(paths.length > 0, path.getName() + " is a file, should be a directory"); - - Path file = paths[0].getPath(); - try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, fs.getConf())){ - return reader.getKeyClass().asSubclass(Writable.class); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/f2151ee5/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java ---------------------------------------------------------------------- diff --git a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java b/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java deleted file mode 100644 index 0cf12ae..0000000 --- a/integration/src/main/java/org/apache/mahout/utils/ConcatenateVectorsReducer.java +++ /dev/null @@ -1,102 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.mahout.utils; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.math.NamedVector; -import org.apache.mahout.math.SequentialAccessSparseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; - -import com.google.common.base.Preconditions; - -/* - * Moded combiner/reducer. If vector comes in as length A or length B, concatenated. - * If it is length A + B, combiner has already concatenated. - * - * @deprecated as of 0.10.0. - * - */ -@Deprecated -public class ConcatenateVectorsReducer extends Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable> { - - int dimsA = 0; - int dimsB = 0; - - public ConcatenateVectorsReducer() { - - } - - public void setup(Context context) throws java.io.IOException, InterruptedException { - Configuration configuration = context.getConfiguration(); - - dimsA = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXA_DIMS)[0]); - dimsB = Integer.valueOf(configuration.getStrings(ConcatenateVectorsJob.MATRIXB_DIMS)[0]); - } - - public void reduce(IntWritable row, Iterable<VectorWritable> vectorWritableIterable, - Context ctx) throws java.io.IOException ,InterruptedException { - Vector vA = null; - Vector vB = null; - Vector vOut = null; - boolean isNamed = false; - String name = null; - - for (VectorWritable vw: vectorWritableIterable) { - Vector v = vw.get(); - if (v instanceof NamedVector) { - name = ((NamedVector) v).getName(); - isNamed = true; - } - - if (v.size() == dimsA) { - vA = v; - } else if (v.size() == dimsB) { - vB = v; - } else if (v.size() == dimsA + dimsB) { - vOut = v; - break; - } - } - - Preconditions.checkArgument((vA != null || vB != null) || (vOut != null)); - - if (vOut == null) { - vOut = new SequentialAccessSparseVector(dimsA + dimsB); - if (isNamed) { - vOut = new NamedVector(vOut, name); - } - } - - if (vA != null) { 
- appendVector(vOut, vA, 0); - } - - if (vB != null) { - appendVector(vOut, vB, dimsA); - } - ctx.write(row, new VectorWritable(vOut)); - } - - private void appendVector(Vector vOut, Vector vIn, int offset) { - for (Vector.Element element : vIn.nonZeroes()) { - vOut.set(element.index() + offset, element.get()); - } - } -} http://git-wip-us.apache.org/repos/asf/mahout/blob/f2151ee5/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java ---------------------------------------------------------------------- diff --git a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java b/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java deleted file mode 100644 index a4e2bfc..0000000 --- a/integration/src/test/java/org/apache/mahout/utils/TestConcatenateVectorsJob.java +++ /dev/null @@ -1,99 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.mahout.utils; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.mahout.common.DummyRecordWriter; -import org.apache.mahout.common.MahoutTestCase; -import org.apache.mahout.math.DenseVector; -import org.apache.mahout.math.Vector; -import org.apache.mahout.math.VectorWritable; -import org.junit.Test; - -/** - * Code stolen from TestAffinityMatrixJob. Like TAMJ, it tests the Mappers/Reducers but not test the job - */ -@Deprecated -public class TestConcatenateVectorsJob extends MahoutTestCase { - - private static final double [][] DATA_A = { - {0,1,2,3,4}, - {}, - {0,1,2,3,4} - }; - private static final double [][] DATA_B = { - {}, - {5,6,7}, - {5,6,7} - }; - - @Test - public void testConcatenateVectorsReducer() throws Exception { - - Configuration configuration = getConfiguration(); - configuration.set(ConcatenateVectorsJob.MATRIXA_DIMS, "5"); - configuration.set(ConcatenateVectorsJob.MATRIXB_DIMS, "3"); - - // Yes, all of this generic rigmarole is needed, and woe betide he who changes it - ConcatenateVectorsReducer reducer = new ConcatenateVectorsReducer(); - - DummyRecordWriter<IntWritable, VectorWritable> recordWriter = new DummyRecordWriter<>(); - - Reducer<IntWritable, VectorWritable, IntWritable, VectorWritable>.Context reduceContext = - DummyRecordWriter.build(reducer, configuration, recordWriter, IntWritable.class, VectorWritable.class); - - reducer.setup(reduceContext); - - for(int i = 0; i < 3; i++) { - double[] values = DATA_A[i]; - List<VectorWritable> vwList = new ArrayList<>(); - if (values.length > 0) { - Vector v = new DenseVector(values); - VectorWritable vw = new VectorWritable(); - vw.set(v); - vwList.add(vw); - } - values = DATA_B[i]; - if (values.length > 0) { - Vector v = new DenseVector(values); - VectorWritable vw = new VectorWritable(); - vw.set(v); - 
vwList.add(vw); - - } - IntWritable row = new IntWritable(i); - - reducer.reduce(row, vwList, reduceContext); - } - - for (IntWritable row : recordWriter.getKeys()) { - List<VectorWritable> list = recordWriter.getValue(row); - Vector v = list.get(0).get(); - assertEquals(8, v.size()); - for (Vector.Element element : v.nonZeroes()) { - assertEquals(element.index(), v.get(element.index()), 0.001); - } - } - } - -} http://git-wip-us.apache.org/repos/asf/mahout/blob/f2151ee5/src/conf/driver.classes.default.props ---------------------------------------------------------------------- diff --git a/src/conf/driver.classes.default.props b/src/conf/driver.classes.default.props index d6a5ddb..69a9ba5 100644 --- a/src/conf/driver.classes.default.props +++ b/src/conf/driver.classes.default.props @@ -14,7 +14,6 @@ org.apache.mahout.vectorizer.EncodedVectorsFromSequenceFiles = seq2encoded: Enco org.apache.mahout.text.WikipediaToSequenceFile = seqwiki : Wikipedia xml dump to sequence file org.apache.mahout.text.SequenceFilesFromMailArchives = seqmailarchives : Creates SequenceFile from a directory containing gzipped mail archives org.apache.mahout.text.SequenceFilesFromLuceneStorageDriver = lucene2seq : Generate Text SequenceFiles from a Lucene index -org.apache.mahout.utils.ConcatenateVectorsJob = concatmatrices : Concatenates 2 matrices of same cardinality into a single matrix org.apache.mahout.clustering.streaming.tools.ResplitSequenceFiles = resplit : Splits a set of SequenceFiles into a number of equal splits org.apache.mahout.clustering.streaming.tools.ClusterQualitySummarizer = qualcluster : Runs clustering experiments and summarizes results in a CSV org.apache.mahout.classifier.df.tools.Describe = describe : Describe the fields and target variable in a data set
