Author: srowen
Date: Wed Sep 28 19:37:45 2011
New Revision: 1177027
URL: http://svn.apache.org/viewvc?rev=1177027&view=rev
Log:
MAHOUT-799 remove CSV filter that wasn't working
Removed:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromCsvFilter.java
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/PrefixAdditionFilter.java
Wed Sep 28 19:37:45 2011
@@ -26,6 +26,7 @@ import org.apache.mahout.common.iterator
import java.io.IOException;
import java.io.InputStream;
+import java.nio.charset.Charset;
import java.util.Map;
/**
@@ -37,8 +38,9 @@ public final class PrefixAdditionFilter
String keyPrefix,
Map<String, String> options,
ChunkedWriter writer,
+ Charset charset,
FileSystem fs) {
- super(conf, keyPrefix, options, writer, fs);
+ super(conf, keyPrefix, options, writer, charset, fs);
}
@Override
@@ -47,7 +49,8 @@ public final class PrefixAdditionFilter
ChunkedWriter writer = getWriter();
if (fst.isDir()) {
String dirPath = getPrefix() + Path.SEPARATOR + current.getName() +
Path.SEPARATOR + fst.getPath().getName();
- fs.listStatus(fst.getPath(), new PrefixAdditionFilter(getConf(),
dirPath, getOptions(), writer, fs));
+ fs.listStatus(fst.getPath(),
+ new PrefixAdditionFilter(getConf(), dirPath, getOptions(),
writer, getCharset(), fs));
} else {
InputStream in = null;
try {
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
Wed Sep 28 19:37:45 2011
@@ -19,7 +19,7 @@ package org.apache.mahout.text;
import java.io.IOException;
import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
+import java.nio.charset.Charset;
import java.util.Map;
import com.google.common.collect.Maps;
@@ -45,42 +45,10 @@ public class SequenceFilesFromDirectory
private static final String PREFIX_ADDITION_FILTER =
PrefixAdditionFilter.class.getName();
private static final String[] CHUNK_SIZE_OPTION = {"chunkSize", "chunk"};
- static final String[] FILE_FILTER_CLASS_OPTION =
{"fileFilterClass","filter"};
+ private static final String[] FILE_FILTER_CLASS_OPTION =
{"fileFilterClass","filter"};
private static final String[] KEY_PREFIX_OPTION = {"keyPrefix", "prefix"};
- static final String[] CHARSET_OPTION = {"charset", "c"};
+ private static final String[] CHARSET_OPTION = {"charset", "c"};
- public static void run(Configuration conf,
- String keyPrefix,
- Map<String, String> options,
- Path input,
- Path output)
- throws InstantiationException, IllegalAccessException,
InvocationTargetException, IOException,
- NoSuchMethodException, ClassNotFoundException {
- FileSystem fs = FileSystem.get(input.toUri(), conf);
- ChunkedWriter writer = new ChunkedWriter(conf,
Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);
-
- try {
- SequenceFilesFromDirectoryFilter pathFilter;
- String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
- if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
- pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options,
writer, fs);
- } else {
- Class<? extends SequenceFilesFromDirectoryFilter> pathFilterClass =
-
Class.forName(fileFilterClassName).asSubclass(SequenceFilesFromDirectoryFilter.class);
- Constructor<? extends SequenceFilesFromDirectoryFilter> constructor =
- pathFilterClass.getConstructor(Configuration.class,
- String.class,
- Map.class,
- ChunkedWriter.class,
- FileSystem.class);
- pathFilter = constructor.newInstance(conf, keyPrefix, options, writer,
fs);
- }
- fs.listStatus(input, pathFilter);
- } finally {
- Closeables.closeQuietly(writer);
- }
- }
-
public static void main(String[] args) throws Exception {
ToolRunner.run(new SequenceFilesFromDirectory(), args);
}
@@ -89,9 +57,7 @@ public class SequenceFilesFromDirectory
* callback main after processing hadoop parameters
*/
@Override
- public int run(String[] args)
- throws IOException, ClassNotFoundException, InstantiationException,
IllegalAccessException, NoSuchMethodException,
- InvocationTargetException {
+ public int run(String[] args) throws Exception {
addOptions();
if (parseArguments(args) == null) {
@@ -107,7 +73,32 @@ public class SequenceFilesFromDirectory
}
String keyPrefix = getOption(KEY_PREFIX_OPTION[0]);
- run(getConf(), keyPrefix, options, input, output);
+ Charset charset = Charset.forName(getOption(CHARSET_OPTION[0]));
+ Configuration conf = getConf();
+ FileSystem fs = FileSystem.get(input.toUri(), conf);
+ ChunkedWriter writer = new ChunkedWriter(conf,
Integer.parseInt(options.get(CHUNK_SIZE_OPTION[0])), output);
+
+ try {
+ SequenceFilesFromDirectoryFilter pathFilter;
+ String fileFilterClassName = options.get(FILE_FILTER_CLASS_OPTION[0]);
+ if (PrefixAdditionFilter.class.getName().equals(fileFilterClassName)) {
+ pathFilter = new PrefixAdditionFilter(conf, keyPrefix, options,
writer, charset, fs);
+ } else {
+ Class<? extends SequenceFilesFromDirectoryFilter> pathFilterClass =
+
Class.forName(fileFilterClassName).asSubclass(SequenceFilesFromDirectoryFilter.class);
+ Constructor<? extends SequenceFilesFromDirectoryFilter> constructor =
+ pathFilterClass.getConstructor(Configuration.class,
+ String.class,
+ Map.class,
+ ChunkedWriter.class,
+ Charset.class,
+ FileSystem.class);
+ pathFilter = constructor.newInstance(conf, keyPrefix, options, writer,
charset, fs);
+ }
+ fs.listStatus(input, pathFilter);
+ } finally {
+ Closeables.closeQuietly(writer);
+ }
return 0;
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectoryFilter.java
Wed Sep 28 19:37:45 2011
@@ -32,7 +32,7 @@ import java.util.Map;
/**
* Implement this interface if you wish to extend SequenceFilesFromDirectory
with your own parsing logic.
*/
-public abstract class SequenceFilesFromDirectoryFilter extends
SequenceFilesFromDirectory implements PathFilter {
+public abstract class SequenceFilesFromDirectoryFilter implements PathFilter {
private static final Logger log =
LoggerFactory.getLogger(SequenceFilesFromDirectoryFilter.class);
private final String prefix;
@@ -40,26 +40,20 @@ public abstract class SequenceFilesFromD
private final Charset charset;
private final FileSystem fs;
private final Map<String, String> options;
-
- protected SequenceFilesFromDirectoryFilter() {
- this.prefix = null;
- this.writer = null;
- this.charset = null;
- this.fs = null;
- this.options = null;
- }
+ private final Configuration conf;
protected SequenceFilesFromDirectoryFilter(Configuration conf,
String keyPrefix,
Map<String, String> options,
ChunkedWriter writer,
+ Charset charset,
FileSystem fs) {
this.prefix = keyPrefix;
this.writer = writer;
- this.charset =
Charset.forName(options.get(SequenceFilesFromDirectory.CHARSET_OPTION[0]));
+ this.charset = charset;
this.fs = fs;
this.options = options;
- setConf(conf);
+ this.conf = conf;
}
protected final String getPrefix() {
@@ -81,6 +75,10 @@ public abstract class SequenceFilesFromD
protected final Map<String, String> getOptions() {
return options;
}
+
+ protected final Configuration getConf() {
+ return conf;
+ }
@Override
public final boolean accept(Path current) {
Modified:
mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java?rev=1177027&r1=1177026&r2=1177027&view=diff
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
(original)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/text/TestSequenceFilesFromDirectory.java
Wed Sep 28 19:37:45 2011
@@ -43,10 +43,6 @@ public final class TestSequenceFilesFrom
{"test3", "This is the third text."}
};
- private enum ParserType {
- TEXT, CSV
- }
-
/**
* Story converting text files to SequenceFile
*/
@@ -66,46 +62,15 @@ public final class TestSequenceFilesFrom
// prepare input files
createFilesFromArrays(conf, inputDir, DATA1);
- String prefix = "UID";
- SequenceFilesFromDirectory.main(new String[] {"--input",
- inputDir.toString(), "--output", outputDir.toString(), "--chunkSize",
- "64", "--charset",
- Charsets.UTF_8.name(), "--keyPrefix", prefix});
+ SequenceFilesFromDirectory.main(new String[] {
+ "--input", inputDir.toString(),
+ "--output", outputDir.toString(),
+ "--chunkSize", "64",
+ "--charset", Charsets.UTF_8.name(),
+ "--keyPrefix", "UID"});
// check output chunk files
- checkChunkFiles(conf, outputDir, DATA1, prefix, ParserType.TEXT);
- }
-
- /**
- * Story converting a TSV file to SequenceFile
- */
- @Test
- public void testSequnceFileFromDirectoryTsv() throws Exception {
- Configuration conf = new Configuration();
- FileSystem fs = FileSystem.get(conf);
-
- // create
- Path tmpDir = this.getTestTempDirPath();
- Path inputDir = new Path(tmpDir, "inputDir");
- fs.mkdirs(inputDir);
- Path outputDir = new Path(tmpDir, "outputDir");
-
- // prepare input TSV file
- createTsvFilesFromArrays(conf, inputDir, DATA1);
-
- // convert it to SequenceFile
- String prefix = "UID";
- int chunkSizeInMB = 64;
- int keyColumn = 0;
- int valueColumn = 1;
- SequenceFilesFromCsvFilter.main(new String[] {"--input",
inputDir.toString(),
- "--output", outputDir.toString(), "--charset", Charsets.UTF_8.name(),
- "--chunkSize", Integer.toString(chunkSizeInMB), "--keyPrefix", prefix,
- "--keyColumn", Integer.toString(keyColumn), "--valueColumn",
- Integer.toString(valueColumn)});
-
- // check output chunk files
- checkChunkFiles(conf, outputDir, DATA1, prefix, ParserType.CSV);
+ checkChunkFiles(conf, outputDir, DATA1, "UID");
}
private static void createFilesFromArrays(Configuration conf, Path inputDir,
String[][] data) throws IOException {
@@ -120,23 +85,10 @@ public final class TestSequenceFilesFrom
}
}
- private static void createTsvFilesFromArrays(Configuration conf, Path
inputDir, String[][] data) throws IOException {
- FileSystem fs = FileSystem.get(conf);
- OutputStreamWriter writer = new OutputStreamWriter(fs.create(new
Path(inputDir, "inputTsvFile")));
- try {
- for (String[] aData : data) {
- writer.write(aData[0] + '\t' + aData[1] + '\n');
- }
- } finally {
- Closeables.closeQuietly(writer);
- }
- }
-
private static void checkChunkFiles(Configuration conf,
Path outputDir,
String[][] data,
- String prefix,
- ParserType inputType) throws IOException
{
+ String prefix) throws IOException {
FileSystem fs = FileSystem.get(conf);
// output exists?
@@ -147,11 +99,7 @@ public final class TestSequenceFilesFrom
Map<String,String> fileToData = Maps.newHashMap();
for (String[] aData : data) {
- if (inputType == ParserType.CSV) {
- fileToData.put(prefix + aData[0], aData[1]);
- } else {
- fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
- }
+ fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
}
// read a chunk to check content