Author: drew
Date: Sat Oct 2 00:40:46 2010
New Revision: 1003720
URL: http://svn.apache.org/viewvc?rev=1003720&view=rev
Log:
MAHOUT-451: Simple utility to split bayes input into training/test sets. Now
uses hadoop filesystem api
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java?rev=1003720&r1=1003719&r2=1003720&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
Sat Oct 2 00:40:46 2010
@@ -19,8 +19,6 @@ package org.apache.mahout.classifier.bay
import java.io.BufferedReader;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
@@ -36,6 +34,10 @@ import org.apache.commons.cli2.builder.A
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.common.RandomUtils;
@@ -116,9 +118,11 @@ public class SplitBayesInput {
private int testRandomSelectionPct = -1;
private Charset charset = Charset.forName("UTF-8");
- private File inputDirectory;
- private File trainingOutputDirectory;
- private File testOutputDirectory;
+ private Configuration conf;
+ private FileSystem fs;
+ private Path inputDirectory;
+ private Path trainingOutputDirectory;
+ private Path testOutputDirectory;
private SplitCallback callback;
@@ -129,6 +133,11 @@ public class SplitBayesInput {
}
}
+ public SplitBayesInput() throws IOException {
+ conf = new Configuration();
+ fs = FileSystem.get(conf);
+ }
+
/** Configure this instance based on the command-line arguments contained
within provided array.
* Calls {@link #validate()} to ensure consistency of configuration.
*
@@ -196,9 +205,9 @@ public class SplitBayesInput {
return false;
}
- inputDirectory = new File((String) cmdLine.getValue(inputDirOpt));
- trainingOutputDirectory = new File((String)
cmdLine.getValue(trainingOutputDirOpt));
- testOutputDirectory = new File((String)
cmdLine.getValue(testOutputDirOpt));
+ inputDirectory = new Path((String) cmdLine.getValue(inputDirOpt));
+ trainingOutputDirectory = new Path((String)
cmdLine.getValue(trainingOutputDirOpt));
+ testOutputDirectory = new Path((String)
cmdLine.getValue(testOutputDirOpt));
charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
@@ -228,8 +237,8 @@ public class SplitBayesInput {
setTestRandomSelectionPct(Integer.parseInt((String)
cmdLine.getValue(randomSelectionPctOpt)));
}
- trainingOutputDirectory.mkdirs();
- testOutputDirectory.mkdirs();
+ fs.mkdirs(trainingOutputDirectory);
+ fs.mkdirs(testOutputDirectory);
} catch (OptionException e) {
log.error("Command-line option Exception", e);
@@ -256,34 +265,41 @@ public class SplitBayesInput {
* @param inputDir
* @throws IOException
*/
- public void splitDirectory(File inputDir) throws IOException {
- if (!inputDir.isDirectory()) {
- throw new IOException(inputDir + " does not exist, or is not a
directory");
+ public void splitDirectory(Path inputDir) throws IOException {
+ if (fs.getFileStatus(inputDir) == null) {
+ throw new IOException(inputDir + " does not exist");
}
-
+ else if (!fs.getFileStatus(inputDir).isDir()) {
+ throw new IOException(inputDir + " is not a directory");
+ }
+
// input dir contains one file per category.
- File[] inputFiles = inputDir.listFiles();
- for (File inputFile : inputFiles) {
- if (inputFile.isFile()) {
- splitFile(inputFile);
+ FileStatus[] fileStats = fs.listStatus(inputDir);
+ for (FileStatus inputFile : fileStats) {
+ if (!inputFile.isDir()) {
+ splitFile(inputFile.getPath());
}
}
}
+
/** Perform a split on the specified input file. Results will be written to
files of the same name in the specified
* training and test output directories. The {@link #validate()} method
is called prior to executing the split.
*/
- public void splitFile(File inputFile) throws IOException {
- if (!inputFile.isFile()) {
- throw new IOException(inputFile + " does not exist, or is not a file");
+ public void splitFile(Path inputFile) throws IOException {
+ if (fs.getFileStatus(inputFile) == null) {
+ throw new IOException(inputFile + " does not exist");
+ }
+ else if (fs.getFileStatus(inputFile).isDir()) {
+ throw new IOException(inputFile + " is a directory");
}
validate();
- File testOutputFile = new File(testOutputDirectory, inputFile.getName());
- File trainingOutputFile = new File(trainingOutputDirectory,
inputFile.getName());
+ Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
+ Path trainingOutputFile = new Path(trainingOutputDirectory,
inputFile.getName());
- int lineCount = countLines(inputFile, charset);
+ int lineCount = countLines(fs, inputFile, charset);
log.info("{} has {} lines", inputFile.getName(), lineCount);
@@ -333,9 +349,10 @@ public class SplitBayesInput {
new Object[] {inputFile, testSplitSize, lineCount -
testSplitSize});
}
}
- BufferedReader reader = new BufferedReader(new InputStreamReader(new
FileInputStream(inputFile), charset));
- Writer trainingWriter = new OutputStreamWriter(new
FileOutputStream(trainingOutputFile), charset);
- Writer testWriter = new OutputStreamWriter(new
FileOutputStream(testOutputFile), charset);
+
+ BufferedReader reader = new BufferedReader(new
InputStreamReader(fs.open(inputFile), charset));
+ Writer trainingWriter = new
OutputStreamWriter(fs.create(trainingOutputFile), charset);
+ Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile),
charset);
int pos = 0;
int trainCount = 0;
@@ -429,33 +446,33 @@ public class SplitBayesInput {
this.charset = charset;
}
- public File getInputDirectory() {
+ public Path getInputDirectory() {
return inputDirectory;
}
/** Set the directory from which input data will be read when the the
{@link #splitDirectory()} method is invoked
*/
- public void setInputDirectory(File inputDir) {
+ public void setInputDirectory(Path inputDir) {
this.inputDirectory = inputDir;
}
- public File getTrainingOutputDirectory() {
+ public Path getTrainingOutputDirectory() {
return trainingOutputDirectory;
}
/** Set the directory to which training data will be written.
*/
- public void setTrainingOutputDirectory(File trainingOutputDir) {
+ public void setTrainingOutputDirectory(Path trainingOutputDir) {
this.trainingOutputDirectory = trainingOutputDir;
}
- public File getTestOutputDirectory() {
+ public Path getTestOutputDirectory() {
return testOutputDirectory;
}
/** Set the directory to which test data will be written.
*/
- public void setTestOutputDirectory(File testOutputDir) {
+ public void setTestOutputDirectory(Path testOutputDir) {
this.testOutputDirectory = testOutputDir;
}
@@ -552,12 +569,18 @@ public class SplitBayesInput {
throw new IllegalArgumentException("no test output directory was
specified");
}
- if (!trainingOutputDirectory.isDirectory()) {
- throw new IOException(inputDirectory + " does not exist, or is not a
directory");
+ if (fs.getFileStatus(trainingOutputDirectory) == null) {
+ throw new IOException(trainingOutputDirectory + " does not exist");
+ }
+ else if (!fs.getFileStatus(trainingOutputDirectory).isDir()) {
+ throw new IOException(trainingOutputDirectory + " is not a directory");
}
- if (!testOutputDirectory.isDirectory()) {
- throw new IOException(inputDirectory + " does not exist, or is not a
directory");
+ if (fs.getFileStatus(testOutputDirectory) == null) {
+ throw new IOException(testOutputDirectory + " does not exist");
+ }
+ else if (!fs.getFileStatus(testOutputDirectory).isDir()) {
+ throw new IOException(testOutputDirectory + " is not a directory");
}
}
@@ -574,8 +597,8 @@ public class SplitBayesInput {
* @throws IOException
* if there is a problem opening or reading the file.
*/
- public static int countLines(File inputFile, Charset charset) throws
IOException {
- BufferedReader countReader = new BufferedReader(new InputStreamReader(new
FileInputStream(inputFile), charset));
+ public static int countLines(FileSystem fs, Path inputFile, Charset charset)
throws IOException {
+ BufferedReader countReader = new BufferedReader(new
InputStreamReader(fs.open(inputFile), charset));
int lineCount = 0;
while (countReader.readLine() != null) {
lineCount++;
@@ -587,7 +610,7 @@ public class SplitBayesInput {
/** Used to pass information back to a caller once a file has been split
without the need for a data object */
public interface SplitCallback {
- void splitComplete(File inputFile, int lineCount, int trainCount, int
testCount, int testSplitStart);
+ void splitComplete(Path inputFile, int lineCount, int trainCount, int
testCount, int testSplitStart);
}
}
Modified:
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java?rev=1003720&r1=1003719&r2=1003720&view=diff
==============================================================================
---
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
(original)
+++
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
Sat Oct 2 00:40:46 2010
@@ -25,6 +25,9 @@ import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.mahout.classifier.ClassifierData;
import org.apache.mahout.common.IOUtils;
import org.apache.mahout.examples.MahoutTestCase;
@@ -36,24 +39,29 @@ public final class SplitBayesInputTest e
private OpenObjectIntHashMap<String> countMap;
private Charset charset;
- private File tempInputFile;
- private File tempTrainingDirectory;
- private File tempTestDirectory;
- private File tempInputDirectory;
+ private FileSystem fs;
+ private Configuration conf;
+ private Path tempInputFile;
+ private Path tempTrainingDirectory;
+ private Path tempTestDirectory;
+ private Path tempInputDirectory;
private SplitBayesInput si;
@Override
@Before
public void setUp() throws Exception {
+ conf = new Configuration();
+ fs = FileSystem.get(conf);
+
super.setUp();
countMap = new OpenObjectIntHashMap<String>();
charset = Charset.forName("UTF-8");
- tempInputFile = getTestTempFile("bayesinputfile");
- tempTrainingDirectory = getTestTempDir("bayestrain");
- tempTestDirectory = getTestTempDir("bayestest");
- tempInputDirectory = getTestTempDir("bayesinputdir");
+ tempInputFile = getTestTempFilePath("bayesinputfile");
+ tempTrainingDirectory = getTestTempDirPath("bayestrain");
+ tempTestDirectory = getTestTempDirPath("bayestest");
+ tempInputDirectory = getTestTempDirPath("bayesinputdir");
si = new SplitBayesInput();
si.setTrainingOutputDirectory(tempTrainingDirectory);
@@ -71,9 +79,10 @@ public final class SplitBayesInputTest e
if (writer != null) {
IOUtils.quietClose(writer);
}
+
writer = new BufferedWriter(
new OutputStreamWriter(
- new FileOutputStream(new File(tempInputDirectory,
currentLabel)), Charset.forName("UTF-8")));
+ fs.create(new Path(tempInputDirectory, currentLabel)),
Charset.forName("UTF-8")));
}
countMap.adjustOrPutValue(currentLabel, 1, 1);
writer.write(currentLabel + '\t' + entry[1] + '\n');
@@ -83,7 +92,7 @@ public final class SplitBayesInputTest e
private void writeSingleInputFile() throws IOException {
BufferedWriter writer = new BufferedWriter(
- new OutputStreamWriter(new FileOutputStream(tempInputFile),
Charset.forName("UTF-8")));
+ new OutputStreamWriter(fs.create(tempInputFile),
Charset.forName("UTF-8")));
for (String[] entry : ClassifierData.DATA) {
writer.write(entry[0] + '\t' + entry[1] + '\n');
}
@@ -99,9 +108,9 @@ public final class SplitBayesInputTest e
si.setTestSplitSize(testSplitSize);
si.setCallback(new SplitBayesInput.SplitCallback() {
@Override
- public void splitComplete(File inputFile, int lineCount, int
trainCount, int testCount, int testSplitStart) {
+ public void splitComplete(Path inputFile, int lineCount, int
trainCount, int testCount, int testSplitStart) {
int trainingLines = countMap.get(inputFile.getName()) -
testSplitSize;
- assertSplit(inputFile, charset, testSplitSize, trainingLines,
tempTrainingDirectory, tempTestDirectory);
+ assertSplit(fs, inputFile, charset, testSplitSize, trainingLines,
tempTrainingDirectory, tempTestDirectory);
}
});
@@ -207,8 +216,8 @@ public final class SplitBayesInputTest e
}
@Override
- public void splitComplete(File inputFile, int lineCount, int trainCount,
int testCount, int testSplitStart) {
- assertSplit(tempInputFile, charset, testSplitSize, trainingLines,
tempTrainingDirectory, tempTestDirectory);
+ public void splitComplete(Path inputFile, int lineCount, int trainCount,
int testCount, int testSplitStart) {
+ assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines,
tempTrainingDirectory, tempTestDirectory);
}
}
@@ -223,21 +232,22 @@ public final class SplitBayesInputTest e
}
}
- private static void assertSplit(File tempInputFile,
+ private static void assertSplit(FileSystem fs,
+ Path tempInputFile,
Charset charset,
int testSplitSize,
int trainingLines,
- File tempTrainingDirectory,
- File tempTestDirectory) {
+ Path tempTrainingDirectory,
+ Path tempTestDirectory) {
try {
- File testFile = new File(tempTestDirectory, tempInputFile.getName());
- assertTrue("test file exists", testFile.isFile());
- assertEquals("test line count", testSplitSize,
SplitBayesInput.countLines(testFile, charset));
-
- File trainingFile = new File(tempTrainingDirectory,
tempInputFile.getName());
- assertTrue("training file exists", trainingFile.isFile());
- assertEquals("training line count", trainingLines,
SplitBayesInput.countLines(trainingFile, charset));
+ Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
+ //assertTrue("test file exists", testFile.isFile());
+ assertEquals("test line count", testSplitSize,
SplitBayesInput.countLines(fs, testFile, charset));
+
+ Path trainingFile = new Path(tempTrainingDirectory,
tempInputFile.getName());
+ //assertTrue("training file exists", trainingFile.isFile());
+ assertEquals("training line count", trainingLines,
SplitBayesInput.countLines(fs, trainingFile, charset));
} catch (IOException ioe) {
fail(ioe.toString());
}