Author: srowen
Date: Thu Jun 23 21:05:11 2011
New Revision: 1139072
URL: http://svn.apache.org/viewvc?rev=1139072&view=rev
Log:
MAHOUT-708: Update to Hadoop 0.20.203.0, which only required better logic to
ignore the new _SUCCESS files. The result still works with Hadoop 0.20.2.
Modified:
mahout/trunk/core/pom.xml
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/eval/ParallelFactorizationEvaluator.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterable.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterator.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterable.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterator.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileIterable.java
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileValueIterable.java
mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/OutputUtils.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
mahout/trunk/pom.xml
Modified: mahout/trunk/core/pom.xml
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/pom.xml?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
--- mahout/trunk/core/pom.xml (original)
+++ mahout/trunk/core/pom.xml Thu Jun 23 21:05:11 2011
@@ -143,6 +143,14 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ </dependency>
<dependency>
<groupId>org.slf4j</groupId>
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/eval/ParallelFactorizationEvaluator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/eval/ParallelFactorizationEvaluator.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/eval/ParallelFactorizationEvaluator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/als/eval/ParallelFactorizationEvaluator.java
Thu Jun 23 21:05:11 2011
@@ -38,6 +38,7 @@ import org.apache.mahout.cf.taste.impl.c
import org.apache.mahout.common.AbstractJob;
import org.apache.mahout.common.IntPairWritable;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable;
@@ -108,7 +109,10 @@ public class ParallelFactorizationEvalua
protected double computeRmse(Path errors) {
RunningAverage average = new FullRunningAverage();
for (Pair<DoubleWritable,NullWritable> entry :
- new SequenceFileDirIterable<DoubleWritable, NullWritable>(errors,
PathType.LIST, getConf())) {
+ new SequenceFileDirIterable<DoubleWritable, NullWritable>(errors,
+
PathType.LIST,
+
PathFilters.logsCRCFilter(),
+ getConf())) {
DoubleWritable error = entry.getFirst();
average.addDatum(error.get() * error.get());
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterable.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterable.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterable.java
Thu Jun 23 21:05:11 2011
@@ -76,7 +76,7 @@ public final class SequenceFileDirIterab
try {
return new SequenceFileDirIterator<K, V>(path, pathType, filter,
ordering, reuseKeyValueInstances, conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new IllegalStateException(path.toString(), ioe);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterator.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirIterator.java
Thu Jun 23 21:05:11 2011
@@ -70,7 +70,7 @@ public final class SequenceFileDirIterat
try {
return new
SequenceFileIterator<K,V>(from.getPath(), reuseKeyValueInstances, conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new
IllegalStateException(from.getPath().toString(), ioe);
}
}
});
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterable.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterable.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterable.java
Thu Jun 23 21:05:11 2011
@@ -75,7 +75,7 @@ public final class SequenceFileDirValueI
try {
return new SequenceFileDirValueIterator<V>(path, pathType, filter,
ordering, reuseKeyValueInstances, conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new IllegalStateException(path.toString(), ioe);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterator.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterator.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileDirValueIterator.java
Thu Jun 23 21:05:11 2011
@@ -66,7 +66,7 @@ public final class SequenceFileDirValueI
try {
return new
SequenceFileValueIterator<V>(from.getPath(), reuseKeyValueInstances, conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new
IllegalStateException(from.getPath().toString(), ioe);
}
}
});
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileIterable.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileIterable.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileIterable.java
Thu Jun 23 21:05:11 2011
@@ -60,7 +60,7 @@ public final class SequenceFileIterable<
try {
return new SequenceFileIterator<K, V>(path, reuseKeyValueInstances,
conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new IllegalStateException(path.toString(), ioe);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileValueIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileValueIterable.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileValueIterable.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/common/iterator/sequencefile/SequenceFileValueIterable.java
Thu Jun 23 21:05:11 2011
@@ -59,7 +59,7 @@ public final class SequenceFileValueIter
try {
return new SequenceFileValueIterator<V>(path, reuseKeyValueInstances,
conf);
} catch (IOException ioe) {
- throw new IllegalStateException(ioe);
+ throw new IllegalStateException(path.toString(), ioe);
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/OutputUtils.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/OutputUtils.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/OutputUtils.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/ga/watchmaker/OutputUtils.java
Thu Jun 23 21:05:11 2011
@@ -28,6 +28,7 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import
org.apache.mahout.common.iterator.sequencefile.SequenceFileValueIterable;
/** Utility Class that deals with the output. */
@@ -45,18 +46,13 @@ public final class OutputUtils {
* @return {@code Path} array
*/
public static Path[] listOutputFiles(FileSystem fs, Path outpath) throws
IOException {
- FileStatus[] status = fs.listStatus(outpath);
Collection<Path> outpaths = Lists.newArrayList();
- for (FileStatus s : status) {
+ for (FileStatus s : fs.listStatus(outpath, PathFilters.logsCRCFilter())) {
if (!s.isDir()) {
outpaths.add(s.getPath());
}
}
-
- Path[] outfiles = new Path[outpaths.size()];
- outpaths.toArray(outfiles);
-
- return outfiles;
+ return outpaths.toArray(new Path[outpaths.size()]);
}
/**
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/DistributedRowMatrix.java
Thu Jun 23 21:05:11 2011
@@ -28,6 +28,7 @@ import org.apache.hadoop.io.WritableComp
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.common.iterator.sequencefile.PathType;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterator;
import org.apache.mahout.math.CardinalityException;
@@ -133,7 +134,11 @@ public class DistributedRowMatrix implem
try {
return Iterators.transform(
new SequenceFileDirIterator<IntWritable,VectorWritable>(new
Path(rowPath, "*"),
-
PathType.GLOB, null, null, true, conf),
+
PathType.GLOB,
+
PathFilters.logsCRCFilter(),
+ null,
+ true,
+ conf),
new Function<Pair<IntWritable,VectorWritable>,MatrixSlice>() {
@Override
public MatrixSlice apply(Pair<IntWritable, VectorWritable> from) {
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/TestDistributedRowMatrix.java
Thu Jun 23 21:05:11 2011
@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.mahout.clustering.ClusteringTestUtils;
import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.MatrixSlice;
import org.apache.mahout.math.RandomAccessSparseVector;
@@ -235,8 +236,8 @@ public final class TestDistributedRowMat
Path outputTempPath = outputStatuses[0].getPath();
Path inputVectorPath = new Path(outputTempPath,
TimesSquaredJob.INPUT_VECTOR);
Path outputVectorPath = new Path(outputTempPath,
TimesSquaredJob.OUTPUT_VECTOR_FILENAME);
- assertEquals(1, fs.listStatus(inputVectorPath).length);
- assertEquals(1, fs.listStatus(outputVectorPath).length);
+ assertEquals(1, fs.listStatus(inputVectorPath,
PathFilters.logsCRCFilter()).length);
+ assertEquals(1, fs.listStatus(outputVectorPath,
PathFilters.logsCRCFilter()).length);
assertEquals(0.0, result1.getDistanceSquared(result2), EPSILON);
}
@@ -272,8 +273,8 @@ public final class TestDistributedRowMat
Path outputTempPath = outputStatuses[0].getPath();
Path inputVectorPath = new Path(outputTempPath,
TimesSquaredJob.INPUT_VECTOR);
Path outputVectorPath = new Path(outputTempPath,
TimesSquaredJob.OUTPUT_VECTOR_FILENAME);
- assertEquals(1, fs.listStatus(inputVectorPath).length);
- assertEquals(1, fs.listStatus(outputVectorPath).length);
+ assertEquals(1, fs.listStatus(inputVectorPath,
PathFilters.logsCRCFilter()).length);
+ assertEquals(1, fs.listStatus(outputVectorPath,
PathFilters.logsCRCFilter()).length);
assertEquals(0.0, result1.getDistanceSquared(result2), EPSILON);
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/vectorizer/DocumentProcessorTest.java
Thu Jun 23 21:05:11 2011
@@ -26,13 +26,15 @@ import org.apache.hadoop.io.SequenceFile
import org.apache.hadoop.io.Text;
import org.apache.mahout.common.MahoutTestCase;
import org.apache.mahout.common.StringTuple;
+import org.apache.mahout.common.iterator.sequencefile.PathFilters;
import org.junit.Test;
import java.util.Arrays;
/**
- * Tests tokenizing of <Text documentId, Text text> {@link SequenceFile}s by
the {@link DocumentProcessor} into
- * <Text documentId, StringTuple tokens> sequence files
+ * Tests tokenizing of {@link SequenceFile}s containing document ID and text
(both as {@link Text})
+ * by the {@link DocumentProcessor} into {@link SequenceFile}s of document ID
and tokens (as
+ * {@link StringTuple}).
*/
public class DocumentProcessorTest extends MahoutTestCase {
@@ -58,7 +60,7 @@ public class DocumentProcessorTest exten
DocumentProcessor.tokenizeDocuments(input, DefaultAnalyzer.class, output,
configuration);
- FileStatus[] statuses = fs.listStatus(output);
+ FileStatus[] statuses = fs.listStatus(output, PathFilters.logsCRCFilter());
assertEquals(1, statuses.length);
Path filePath = statuses[0].getPath();
SequenceFile.Reader reader = new SequenceFile.Reader(fs, filePath,
configuration);
Modified: mahout/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/mahout/trunk/pom.xml?rev=1139072&r1=1139071&r2=1139072&view=diff
==============================================================================
--- mahout/trunk/pom.xml (original)
+++ mahout/trunk/pom.xml Thu Jun 23 21:05:11 2011
@@ -191,7 +191,7 @@
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
- <version>0.20.2</version>
+ <version>0.20.203.0</version>
<exclusions>
<exclusion>
<groupId>net.sf.kosmosfs</groupId>
@@ -259,6 +259,16 @@
</exclusion>
</exclusions>
</dependency>
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-core-asl</artifactId>
+ <version>1.8.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codehaus.jackson</groupId>
+ <artifactId>jackson-mapper-asl</artifactId>
+ <version>1.8.2</version>
+ </dependency>
<dependency>
<groupId>commons-dbcp</groupId>