Author: drew
Date: Sat Oct  2 00:40:46 2010
New Revision: 1003720

URL: http://svn.apache.org/viewvc?rev=1003720&view=rev
Log:
MAHOUT-451: Simple utility to split bayes input into training/test sets. Now 
uses hadoop filesystem api

Modified:
    
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
    
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java?rev=1003720&r1=1003719&r2=1003720&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/SplitBayesInput.java
 Sat Oct  2 00:40:46 2010
@@ -19,8 +19,6 @@ package org.apache.mahout.classifier.bay
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
@@ -36,6 +34,10 @@ import org.apache.commons.cli2.builder.A
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.mahout.common.CommandLineUtil;
 import org.apache.mahout.common.IOUtils;
 import org.apache.mahout.common.RandomUtils;
@@ -116,9 +118,11 @@ public class SplitBayesInput {
   private int testRandomSelectionPct = -1;
   private Charset charset = Charset.forName("UTF-8");
   
-  private File inputDirectory;
-  private File trainingOutputDirectory;
-  private File testOutputDirectory;
+  private Configuration conf;
+  private FileSystem fs; 
+  private Path inputDirectory;
+  private Path trainingOutputDirectory;
+  private Path testOutputDirectory;
   
   private SplitCallback callback;
   
@@ -129,6 +133,11 @@ public class SplitBayesInput {
     }
   }
   
+  public SplitBayesInput() throws IOException {
+    conf = new Configuration();
+    fs = FileSystem.get(conf);
+  }
+  
   /** Configure this instance based on the command-line arguments contained 
within provided array. 
    * Calls {@link #validate()} to ensure consistency of configuration.
    * 
@@ -196,9 +205,9 @@ public class SplitBayesInput {
         return false;
       }
       
-      inputDirectory = new File((String) cmdLine.getValue(inputDirOpt));
-      trainingOutputDirectory = new File((String) 
cmdLine.getValue(trainingOutputDirOpt));
-      testOutputDirectory = new File((String) 
cmdLine.getValue(testOutputDirOpt));
+      inputDirectory = new Path((String) cmdLine.getValue(inputDirOpt));
+      trainingOutputDirectory = new Path((String) 
cmdLine.getValue(trainingOutputDirOpt));
+      testOutputDirectory = new Path((String) 
cmdLine.getValue(testOutputDirOpt));
      
       charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
 
@@ -228,8 +237,8 @@ public class SplitBayesInput {
         setTestRandomSelectionPct(Integer.parseInt((String) 
cmdLine.getValue(randomSelectionPctOpt)));
       }
 
-      trainingOutputDirectory.mkdirs();
-      testOutputDirectory.mkdirs();
+      fs.mkdirs(trainingOutputDirectory);
+      fs.mkdirs(testOutputDirectory);
      
     } catch (OptionException e) {
       log.error("Command-line option Exception", e);
@@ -256,34 +265,41 @@ public class SplitBayesInput {
    * @param inputDir
    * @throws IOException
    */
-  public void splitDirectory(File inputDir) throws IOException {
-    if (!inputDir.isDirectory()) {
-      throw new IOException(inputDir + " does not exist, or is not a 
directory");
+  public void splitDirectory(Path inputDir) throws IOException {
+    if (fs.getFileStatus(inputDir) == null) {
+      throw new IOException(inputDir + " does not exist");
     }
-    
+    else if (!fs.getFileStatus(inputDir).isDir()) {
+      throw new IOException(inputDir + " is not a directory");
+    }
+
     // input dir contains one file per category.
-    File[] inputFiles = inputDir.listFiles();
-    for (File inputFile : inputFiles) {
-      if (inputFile.isFile()) {
-        splitFile(inputFile);
+    FileStatus[] fileStats = fs.listStatus(inputDir);
+    for (FileStatus inputFile : fileStats) {
+      if (!inputFile.isDir()) {
+        splitFile(inputFile.getPath());
       }
     }
   }
+  
 
   /** Perform a split on the specified input file. Results will be written to 
files of the same name in the specified 
    *  training and test output directories. The {@link #validate()} method 
is called prior to executing the split.
    */
-  public void splitFile(File inputFile) throws IOException {
-    if (!inputFile.isFile()) {
-      throw new IOException(inputFile + " does not exist, or is not a file");
+  public void splitFile(Path inputFile) throws IOException {
+    if (fs.getFileStatus(inputFile) == null) {
+      throw new IOException(inputFile + " does not exist");
+    }
+    else if (fs.getFileStatus(inputFile).isDir()) {
+      throw new IOException(inputFile + " is a directory");
     }
     
     validate();
     
-    File testOutputFile = new File(testOutputDirectory, inputFile.getName());
-    File trainingOutputFile = new File(trainingOutputDirectory, 
inputFile.getName());
+    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
+    Path trainingOutputFile = new Path(trainingOutputDirectory, 
inputFile.getName());
     
-    int lineCount = countLines(inputFile, charset);
+    int lineCount = countLines(fs, inputFile, charset);
     
     log.info("{} has {} lines", inputFile.getName(), lineCount);
     
@@ -333,9 +349,10 @@ public class SplitBayesInput {
                  new Object[] {inputFile, testSplitSize, lineCount - 
testSplitSize});
       }
     }
-    BufferedReader reader = new BufferedReader(new InputStreamReader(new 
FileInputStream(inputFile), charset));
-    Writer trainingWriter = new OutputStreamWriter(new 
FileOutputStream(trainingOutputFile), charset);
-    Writer testWriter     = new OutputStreamWriter(new 
FileOutputStream(testOutputFile), charset);
+    
+    BufferedReader reader = new BufferedReader(new 
InputStreamReader(fs.open(inputFile), charset));
+    Writer trainingWriter = new 
OutputStreamWriter(fs.create(trainingOutputFile), charset);
+    Writer testWriter     = new OutputStreamWriter(fs.create(testOutputFile), 
charset);
 
     int pos = 0;
     int trainCount = 0;
@@ -429,33 +446,33 @@ public class SplitBayesInput {
     this.charset = charset;
   }
 
-  public File getInputDirectory() {
+  public Path getInputDirectory() {
     return inputDirectory;
   }
 
   /** Set the directory from which input data will be read when the the 
{@link #splitDirectory()} method is invoked
    */
-  public void setInputDirectory(File inputDir) {
+  public void setInputDirectory(Path inputDir) {
     this.inputDirectory = inputDir;
   }
 
-  public File getTrainingOutputDirectory() {
+  public Path getTrainingOutputDirectory() {
     return trainingOutputDirectory;
   }
 
   /** Set the directory to which training data will be written.
    */
-  public void setTrainingOutputDirectory(File trainingOutputDir) {
+  public void setTrainingOutputDirectory(Path trainingOutputDir) {
     this.trainingOutputDirectory = trainingOutputDir;
   }
 
-  public File getTestOutputDirectory() {
+  public Path getTestOutputDirectory() {
     return testOutputDirectory;
   }
 
   /** Set the directory to which test data will be written.
    */
-  public void setTestOutputDirectory(File testOutputDir) {
+  public void setTestOutputDirectory(Path testOutputDir) {
     this.testOutputDirectory = testOutputDir;
   }
 
@@ -552,12 +569,18 @@ public class SplitBayesInput {
       throw new IllegalArgumentException("no test output directory was 
specified");
     }
 
-    if (!trainingOutputDirectory.isDirectory()) {
-      throw new IOException(inputDirectory + " does not exist, or is not a 
directory");
+    if (fs.getFileStatus(trainingOutputDirectory) == null) {
+      throw new IOException(trainingOutputDirectory + " does not exist");
+    }
+    else if (!fs.getFileStatus(trainingOutputDirectory).isDir()) {
+      throw new IOException(trainingOutputDirectory + " is not a directory");
     }
 
-    if (!testOutputDirectory.isDirectory()) {
-      throw new IOException(inputDirectory + " does not exist, or is not a 
directory");
+    if (fs.getFileStatus(testOutputDirectory) == null) {
+      throw new IOException(testOutputDirectory + " does not exist");
+    }
+    else if (!fs.getFileStatus(testOutputDirectory).isDir()) {
+      throw new IOException(testOutputDirectory + " is not a directory");
     }
   }
   
@@ -574,8 +597,8 @@ public class SplitBayesInput {
    * @throws IOException 
    *   if there is a problem opening or reading the file.
    */
-  public static int countLines(File inputFile, Charset charset) throws 
IOException {
-    BufferedReader countReader = new BufferedReader(new InputStreamReader(new 
FileInputStream(inputFile), charset));
+  public static int countLines(FileSystem fs, Path inputFile, Charset charset) 
throws IOException {
+    BufferedReader countReader = new BufferedReader(new 
InputStreamReader(fs.open(inputFile), charset));
     int lineCount = 0;
     while (countReader.readLine() != null) {
       lineCount++;
@@ -587,7 +610,7 @@ public class SplitBayesInput {
   
   /** Used to pass information back to a caller once a file has been split 
without the need for a data object */
   public interface SplitCallback {
-    void splitComplete(File inputFile, int lineCount, int trainCount, int 
testCount, int testSplitStart);
+    void splitComplete(Path inputFile, int lineCount, int trainCount, int 
testCount, int testSplitStart);
   }
 
 }

Modified: 
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java?rev=1003720&r1=1003719&r2=1003720&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
 (original)
+++ 
mahout/trunk/examples/src/test/java/org/apache/mahout/classifier/bayes/SplitBayesInputTest.java
 Sat Oct  2 00:40:46 2010
@@ -25,6 +25,9 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.mahout.classifier.ClassifierData;
 import org.apache.mahout.common.IOUtils;
 import org.apache.mahout.examples.MahoutTestCase;
@@ -36,24 +39,29 @@ public final class SplitBayesInputTest e
 
   private OpenObjectIntHashMap<String> countMap;
   private Charset charset;
-  private File tempInputFile;
-  private File tempTrainingDirectory;
-  private File tempTestDirectory;
-  private File tempInputDirectory;
+  private FileSystem fs;
+  private Configuration conf;
+  private Path tempInputFile;
+  private Path tempTrainingDirectory;
+  private Path tempTestDirectory;
+  private Path tempInputDirectory;
   private SplitBayesInput si;
     
   @Override
   @Before
   public void setUp() throws Exception {
+    conf = new Configuration();
+    fs   = FileSystem.get(conf);
+    
     super.setUp();
   
     countMap = new OpenObjectIntHashMap<String>();
     
     charset = Charset.forName("UTF-8");
-    tempInputFile = getTestTempFile("bayesinputfile");
-    tempTrainingDirectory = getTestTempDir("bayestrain");
-    tempTestDirectory = getTestTempDir("bayestest");
-    tempInputDirectory = getTestTempDir("bayesinputdir");
+    tempInputFile = getTestTempFilePath("bayesinputfile");
+    tempTrainingDirectory = getTestTempDirPath("bayestrain");
+    tempTestDirectory = getTestTempDirPath("bayestest");
+    tempInputDirectory = getTestTempDirPath("bayesinputdir");
     
     si = new SplitBayesInput();
     si.setTrainingOutputDirectory(tempTrainingDirectory);
@@ -71,9 +79,10 @@ public final class SplitBayesInputTest e
         if (writer != null) {
           IOUtils.quietClose(writer);
         }
+        
         writer = new BufferedWriter(
             new OutputStreamWriter(
-                new FileOutputStream(new File(tempInputDirectory, 
currentLabel)), Charset.forName("UTF-8")));
+                fs.create(new Path(tempInputDirectory, currentLabel)), 
Charset.forName("UTF-8")));
       }
       countMap.adjustOrPutValue(currentLabel, 1, 1);
       writer.write(currentLabel + '\t' + entry[1] + '\n');
@@ -83,7 +92,7 @@ public final class SplitBayesInputTest e
 
   private void writeSingleInputFile() throws IOException {
     BufferedWriter writer = new BufferedWriter(
-        new OutputStreamWriter(new FileOutputStream(tempInputFile), 
Charset.forName("UTF-8")));
+        new OutputStreamWriter(fs.create(tempInputFile), 
Charset.forName("UTF-8")));
     for (String[] entry : ClassifierData.DATA) {
       writer.write(entry[0] + '\t' + entry[1] + '\n');
     }
@@ -99,9 +108,9 @@ public final class SplitBayesInputTest e
     si.setTestSplitSize(testSplitSize);
     si.setCallback(new SplitBayesInput.SplitCallback() {
           @Override
-          public void splitComplete(File inputFile, int lineCount, int 
trainCount, int testCount, int testSplitStart) {
+          public void splitComplete(Path inputFile, int lineCount, int 
trainCount, int testCount, int testSplitStart) {
             int trainingLines = countMap.get(inputFile.getName()) - 
testSplitSize;
-            assertSplit(inputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
+            assertSplit(fs, inputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
           }
     });
     
@@ -207,8 +216,8 @@ public final class SplitBayesInputTest e
     }
     
     @Override
-    public void splitComplete(File inputFile, int lineCount, int trainCount, 
int testCount, int testSplitStart) {
-      assertSplit(tempInputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
+    public void splitComplete(Path inputFile, int lineCount, int trainCount, 
int testCount, int testSplitStart) {
+      assertSplit(fs, tempInputFile, charset, testSplitSize, trainingLines, 
tempTrainingDirectory, tempTestDirectory);
     }
   }
   
@@ -223,21 +232,22 @@ public final class SplitBayesInputTest e
     } 
   }
   
-  private static void assertSplit(File tempInputFile,
+  private static void assertSplit(FileSystem fs,
+                                  Path tempInputFile,
                                   Charset charset,
                                   int testSplitSize,
                                   int trainingLines,
-                                  File tempTrainingDirectory,
-                                  File tempTestDirectory) {
+                                  Path tempTrainingDirectory,
+                                  Path tempTestDirectory) {
 
     try {
-      File testFile = new File(tempTestDirectory, tempInputFile.getName());
-      assertTrue("test file exists", testFile.isFile());
-      assertEquals("test line count", testSplitSize, 
SplitBayesInput.countLines(testFile, charset));
-
-      File trainingFile = new File(tempTrainingDirectory, 
tempInputFile.getName());
-      assertTrue("training file exists", trainingFile.isFile());
-      assertEquals("training line count", trainingLines, 
SplitBayesInput.countLines(trainingFile, charset));
+      Path testFile = new Path(tempTestDirectory, tempInputFile.getName());
+      //assertTrue("test file exists", testFile.isFile());
+      assertEquals("test line count", testSplitSize, 
SplitBayesInput.countLines(fs, testFile, charset));
+
+      Path trainingFile = new Path(tempTrainingDirectory, 
tempInputFile.getName());
+      //assertTrue("training file exists", trainingFile.isFile());
+      assertEquals("training line count", trainingLines, 
SplitBayesInput.countLines(fs, trainingFile, charset));
     } catch (IOException ioe) {
       fail(ioe.toString());
     }


Reply via email to