Author: gsingers
Date: Tue Jun 9 19:46:14 2009
New Revision: 783116
URL: http://svn.apache.org/viewvc?rev=783116&view=rev
Log:
Add the ability to limit output to a maximum number of chunks via a new optional numChunks (-n) option
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL:
http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=783116&r1=783115&r2=783116&view=diff
==============================================================================
---
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
(original)
+++
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Tue Jun 9 19:46:14 2009
@@ -57,7 +57,11 @@
Option chunkSizeOpt =
obuilder.withLongName("chunkSize").withRequired(true).withArgument(
abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).
withDescription("The Size of the chunk, in
megabytes").withShortName("c").create();
- Group group =
gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(chunkSizeOpt).create();
+ Option numChunksOpt =
obuilder.withLongName("numChunks").withRequired(false).withArgument(
+
abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()).
+ withDescription("The maximum number of chunks to create. If
specified, program will only create a subset of the
chunks").withShortName("n").create();
+ Group group =
gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(numChunksOpt).create();
+
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
@@ -66,6 +70,11 @@
String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
int chunkSize = 1024 * 1024 * Integer.parseInt((String)
cmdLine.getValue(chunkSizeOpt));
+ int numChunks = Integer.MAX_VALUE;
+ if (cmdLine.hasOption(numChunksOpt)){
+ numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
+ }
+
BufferedReader dumpReader = new BufferedReader(new InputStreamReader(
new FileInputStream(dumpFilePath), "UTF-8"));
@@ -128,7 +137,9 @@
chunkWriter.write(content.toString(), 0, content.length());
chunkWriter.close();
-
+ if (filenumber >= numChunks){
+ break;
+ }
content = new StringBuilder();
content.append(header);
}