Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Feb 13 15:14:18 2012
@@ -20,22 +20,20 @@ package org.apache.mahout.utils.vectors;
 import com.google.common.base.Charsets;
 import com.google.common.io.Closeables;
 import com.google.common.io.Files;
-import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.Option;
 import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.commons.cli2.commandline.Parser;
 import org.apache.commons.cli2.util.HelpFormatter;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
-import org.apache.mahout.common.CommandLineUtil;
+import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.clustering.WeightedPropertyVectorWritable;
+import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
 import org.apache.mahout.math.NamedVector;
@@ -45,6 +43,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
+import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.util.HashSet;
@@ -56,211 +55,215 @@ import java.util.Set;
  * out the results using {@link Vector#asFormatString()} to either the console or to a
  * file.
  */
-public final class VectorDumper {
+public final class VectorDumper extends AbstractJob {
 
   private static final Logger log = LoggerFactory.getLogger(VectorDumper.class);
 
   private VectorDumper() {
   }
 
-  public static void main(String[] args) throws Exception {
-    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
-    ArgumentBuilder abuilder = new ArgumentBuilder();
-    GroupBuilder gbuilder = new GroupBuilder();
-
-    Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
-      abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The Sequence File containing the Vectors").withShortName("s").create();
-    Option vectorAsKeyOpt = obuilder.withLongName("useKey").withRequired(false).withDescription(
-      "If the Key is a vector, then dump that instead").withShortName("u").create();
-    Option printKeyOpt = obuilder.withLongName("printKey").withRequired(false).withDescription(
-      "Print out the key as well, delimited by a tab (or the value if useKey is true)").withShortName("p")
-      .create();
-    Option outputOpt = obuilder.withLongName("output").withRequired(false).withArgument(
-      abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The output file. If not specified, dumps to the console").withShortName("o").create();
-    Option dictOpt = obuilder.withLongName("dictionary").withRequired(false).withArgument(
-      abuilder.withName("dictionary").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The dictionary file. ").withShortName("d").create();
-    Option dictTypeOpt = obuilder.withLongName("dictionaryType").withRequired(false).withArgument(
-      abuilder.withName("dictionaryType").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The dictionary file type (text|sequencefile)").withShortName("dt").create();
-    Option csvOpt = obuilder.withLongName("csv").withRequired(false).withDescription(
-      "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries")
-      .withShortName("c").create();
-    Option namesAsCommentsOpt = obuilder.withLongName("namesAsComments").withRequired(false).withDescription(
-      "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name")
-      .withShortName("n").create();
-    Option sortVectorsOpt = obuilder.withLongName("sortVectors").withRequired(false).withDescription(
-      "Sort output key/value pairs of the vector entries in abs magnitude descending order")
-      .withShortName("sort").create();
-    Option sizeOpt = obuilder.withLongName("sizeOnly").withRequired(false).
-      withDescription("Dump only the size of the vector").withShortName("sz").create();
-    Option numItemsOpt = obuilder.withLongName("numItems").withRequired(false).withArgument(
-      abuilder.withName("n").withMinimum(1).withMaximum(1).create()).
-      withDescription("Output at most <n> vecors").withShortName("n").create();
-    Option numIndexesPerVectorOpt = obuilder.withLongName("vectorSize").withShortName("vs")
-      .withRequired(false).withArgument(abuilder.withName("vs").withMinimum(1)
-      .withMaximum(1).create())
-      .withDescription("Truncate vectors to <vs> length when dumping (most useful when in"
-      + " conjunction with -sort").create();
-    Option filtersOpt = obuilder.withLongName("filter").withRequired(false).withArgument(
-      abuilder.withName("filter").withMinimum(1).withMaximum(100).create()).
-      withDescription("Only dump out those vectors whose name matches the filter." +
-      " Multiple items may be specified by repeating the argument.").withShortName("fi").create();
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
-      .create();
-
-    Group group = gbuilder.withName("Options").withOption(seqOpt).withOption(outputOpt)
-      .withOption(dictTypeOpt).withOption(dictOpt).withOption(csvOpt)
-      .withOption(vectorAsKeyOpt).withOption(printKeyOpt).withOption(sortVectorsOpt)
-      .withOption(filtersOpt).withOption(helpOpt).withOption(numItemsOpt)
-      .withOption(sizeOpt).withOption(numIndexesPerVectorOpt).create();
+  @Override
+  public int run(String[] args) throws Exception {
+    int result = 0;
+    /**
+     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
+     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
+     "The Sequence File containing the Vectors").withShortName("s").create();
+     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
+     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
+     .withDescription("The directory containing Sequence File of Vectors")
+     .withShortName("d").create();
+     */
+    addInputOption();
+    addOutputOption();
+    addOption("useKey", "u", "If the Key is a vector than dump that instead", false);
+    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true", false);
+    addOption("dictionary", "d", "The dictionary file.", false);
+    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
+    addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries", false);
+    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false);
+    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false);
+    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false);
+    addOption("quiet", "q", "Print only file contents", false);
+    addOption("sizeOnly", "sz", "Dump only the size of the vector", false);
+    addOption("numItems", "ni", "Output at most <n> vecors", false);
+    addOption("vectorSize", "vs", "Truncate vectors to <vs> length when dumping (most useful when in"
+            + " conjunction with -sort", false);
+    addOption(buildOption("filter", "fi", "Only dump out those vectors whose name matches the filter."
+            + " Multiple items may be specified by repeating the argument.", true, 1, 100, false, null));
 
-    try {
-      Parser parser = new Parser();
-      parser.setGroup(group);
-      CommandLine cmdLine = parser.parse(args);
-
-      if (cmdLine.hasOption(helpOpt)) {
-        CommandLineUtil.printHelpWithGenericOptions(group);
-        return;
-      }
-
-      if (cmdLine.hasOption(seqOpt)) {
-        Configuration conf = new Configuration();
-        Path pathPattern = new Path(cmdLine.getValue(seqOpt).toString());
-        FileSystem fs = FileSystem.get(conf);
-        FileStatus[] inputPaths = fs.globStatus(pathPattern);
-
-        String dictionaryType = "text";
-        if (cmdLine.hasOption(dictTypeOpt)) {
-          dictionaryType = cmdLine.getValue(dictTypeOpt).toString();
-        }
+    if (parseArguments(args) == null) {
+      return -1;
+    }
+
+    Path[] pathArr = null;
+    Configuration conf = new Configuration();
+    FileSystem fs = FileSystem.get(conf);
+    Path input = getInputPath();
+    FileStatus fileStatus = fs.getFileStatus(input);
+    if (fileStatus.isDir()){
+      pathArr = FileUtil.stat2Paths(fs.listStatus(input, new OutputFilesFilter()));
+    } else {
+      FileStatus[] inputPaths = fs.globStatus(input);
+      pathArr = new Path[inputPaths.length];
+      int i = 0;
+      for (FileStatus fstatus : inputPaths) {
+        pathArr[i++] = fstatus.getPath();
+      }
+    }
 
-        boolean sortVectors = cmdLine.hasOption(sortVectorsOpt);
-        log.info("Sort? " + sortVectors);
-        String[] dictionary = null;
-        if (cmdLine.hasOption(dictOpt)) {
-          if ("text".equals(dictionaryType)) {
-            dictionary = VectorHelper.loadTermDictionary(new File(cmdLine.getValue(dictOpt).toString()));
-          } else if ("sequencefile".equals(dictionaryType)) {
-            dictionary = VectorHelper.loadTermDictionary(conf, cmdLine.getValue(dictOpt).toString());
-          } else {
-            throw new OptionException(dictTypeOpt);
+    String dictionaryType = getOption("dictionaryType", "text");
+
+    boolean sortVectors = hasOption("sortVectors");
+    boolean quiet = hasOption("quiet");
+    if (quiet == false){
+      log.info("Sort? " + sortVectors);
+    }
+
+    String[] dictionary = null;
+    if (hasOption("dictionary")) {
+      String dictFile = getOption("dictionary");
+      if ("text".equals(dictionaryType)) {
+        dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
+      } else if ("sequencefile".equals(dictionaryType)) {
+        dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
+      } else {
+        //TODO: support Lucene's FST as a dictionary type
+        throw new IOException("Invalid dictionary type: " + dictionaryType);
+      }
+    }
+
+    Set<String> filters;
+    if (hasOption("filter")) {
+      filters = new HashSet<String>(getOptions("filter"));
+    } else {
+      filters = null;
+    }
+
+    boolean useCSV = hasOption("csv");
+
+    boolean sizeOnly = hasOption("sizeOnly");
+    boolean nameOnly = hasOption("nameOnly");
+    boolean namesAsComments = hasOption("namesAsComments");
+    boolean transposeKeyValue = hasOption("vectorAsKey");
+    Writer writer;
+    boolean shouldClose;
+    File output = getOutputFile();
+    if (output != null) {
+      shouldClose = true;
+      writer = Files.newWriter(output, Charsets.UTF_8);
+    } else {
+      shouldClose = false;
+      writer = new OutputStreamWriter(System.out);
+    }
+    try {
+      boolean printKey = hasOption("printKey");
+      if (useCSV && dictionary != null) {
+        writer.write("#");
+        for (int j = 0; j < dictionary.length; j++) {
+          writer.write(dictionary[j]);
+          if (j < dictionary.length - 1) {
+            writer.write(',');
           }
         }
-
-        Set<String> filters;
-        if (cmdLine.hasOption(filtersOpt)) {
-          filters = new HashSet<String>(cmdLine.getValues(filtersOpt));
-        } else {
-          filters = null;
+        writer.write('\n');
+      }
+      Long numItems = null;
+      if (hasOption("numItems")) {
+        numItems = Long.parseLong(getOption("numItems"));
+        if (quiet){
+          writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
         }
-        boolean useCSV = cmdLine.hasOption(csvOpt);
-
-        boolean sizeOnly = cmdLine.hasOption(sizeOpt);
-        boolean namesAsComments = cmdLine.hasOption(namesAsCommentsOpt);
-        boolean transposeKeyValue = cmdLine.hasOption(vectorAsKeyOpt);
-        Writer writer;
-        boolean shouldClose;
-        if (cmdLine.hasOption(outputOpt)) {
-          shouldClose = true;
-          writer = Files.newWriter(new File(cmdLine.getValue(outputOpt).toString()), Charsets.UTF_8);
-        } else {
-          shouldClose = false;
-          writer = new OutputStreamWriter(System.out);
+      }
+      int maxIndexesPerVector = hasOption("numIndexesPerVector")
+              ? Integer.parseInt(getOption("numIndexesPerVector").toString())
+              : Integer.MAX_VALUE;
+      long itemCount = 0;
+      int fileCount = 0;
+      for (Path path : pathArr) {
+        if (numItems != null && numItems <= itemCount) {
+          break;
         }
-        try {
-          boolean printKey = cmdLine.hasOption(printKeyOpt);
-          if (useCSV && dictionary != null) {
-            writer.write("#");
-            for (int j = 0; j < dictionary.length; j++) {
-              writer.write(dictionary[j]);
-              if (j < dictionary.length - 1) {
-                writer.write(',');
-              }
-            }
-            writer.write('\n');
+        if (quiet) {
+          log.info("Processing file '{}' ({}/{})",
+                  new Object[]{path, ++fileCount, pathArr.length});
+        }
+        SequenceFileIterable<Writable, Writable> iterable =
+                new SequenceFileIterable<Writable, Writable>(path, true, conf);
+        Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
+        long i = 0;
+        while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
+          Pair<Writable, Writable> record = iterator.next();
+          Writable keyWritable = record.getFirst();
+          Writable valueWritable = record.getSecond();
+          if (printKey) {
+            Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
+            writer.write(notTheVectorWritable.toString());
+            writer.write('\t');
           }
-          Long numItems = null;
-          if (cmdLine.hasOption(numItemsOpt)) {
-            numItems = Long.parseLong(cmdLine.getValue(numItemsOpt).toString());
-            writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
+          Vector vector = null;
+          try {
+            VectorWritable vectorWritable;
+            vector = ((VectorWritable)
+                    (transposeKeyValue ? keyWritable : valueWritable)).get();
+          } catch (ClassCastException e) {
+            if ((transposeKeyValue ? keyWritable : valueWritable)
+                    instanceof WeightedPropertyVectorWritable)
+              vector =
+                  ((WeightedPropertyVectorWritable)
+                          (transposeKeyValue ? keyWritable : valueWritable)).getVector();
+            else
+              throw e;
           }
-          int maxIndexesPerVector = cmdLine.hasOption(numIndexesPerVectorOpt)
-            ? Integer.parseInt(cmdLine.getValue(numIndexesPerVectorOpt).toString())
-            : Integer.MAX_VALUE;
-          long itemCount = 0;
-          int fileCount = 0;
-          for (FileStatus stat : inputPaths) {
-            if (numItems != null && numItems <= itemCount) {
-              break;
-            }
-            Path path = stat.getPath();
-            log.info("Processing file '{}' ({}/{})",
-                     new Object[]{path, ++fileCount, inputPaths.length});
-            SequenceFileIterable<Writable, Writable> iterable =
-                new SequenceFileIterable<Writable, Writable>(path, true, conf);
-            Iterator<Pair<Writable,Writable>> iterator = iterable.iterator();
-            long i = 0;
-            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
-              Pair<Writable, Writable> record = iterator.next();
-              Writable keyWritable = record.getFirst();
-              Writable valueWritable = record.getSecond();
-              if (printKey) {
-                Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
-                writer.write(notTheVectorWritable.toString());
-                writer.write('\t');
-              }
-              VectorWritable vectorWritable =
-                  (VectorWritable) (transposeKeyValue ? keyWritable : valueWritable);
-              Vector vector = vectorWritable.get();
-              if (filters != null
+          if (filters != null
                   && vector instanceof NamedVector
-                  && !filters.contains(((NamedVector)vector).getName())){
-                //we are filtering out this item, skip
-                continue;
-              }
-              if (sizeOnly) {
-                if (vector instanceof NamedVector) {
-                  writer.write(((NamedVector) vector).getName());
-                  writer.write(":");
-                } else {
-                  writer.write(String.valueOf(i++));
-                  writer.write(":");
-                }
-                writer.write(String.valueOf(vector.size()));
-                writer.write('\n');
-              } else {
-                String fmtStr;
-                if (useCSV) {
-                  fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
-                } else {
-                  fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
+                  && !filters.contains(((NamedVector) vector).getName())) {
+            //we are filtering out this item, skip
+            continue;
+          }
+          if (sizeOnly) {
+            if (vector instanceof NamedVector) {
+              writer.write(((NamedVector) vector).getName());
+              writer.write(":");
+            } else {
+              writer.write(String.valueOf(i++));
+              writer.write(":");
+            }
+            writer.write(String.valueOf(vector.size()));
+            writer.write('\n');
+          } else if (nameOnly) {
+            if (vector instanceof NamedVector) {
+              writer.write(((NamedVector) vector).getName());
+              writer.write('\n');
+            }
+          } else {
+            String fmtStr;
+            if (useCSV) {
+              fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
+            } else {
+              fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                     sortVectors);
-                }
-                writer.write(fmtStr);
-                writer.write('\n');
-              }
-              itemCount++;
             }
+            writer.write(fmtStr);
+            writer.write('\n');
           }
-          writer.flush();
-        } finally {
-          if (shouldClose) {
-            Closeables.closeQuietly(writer);
-          }
+          itemCount++;
         }
-      }
-
-    } catch (OptionException e) {
-      log.error("Exception", e);
-      printHelp(group);
+      writer.flush();
+    } finally {
+      if (shouldClose) {
+        Closeables.closeQuietly(writer);
+      }
     }
+    return result;
+
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new VectorDumper(), args);
   }
 
   private static void printHelp(Group group) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1243556&r1=1243555&r2=1243556&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Feb 13 15:14:18 2012
@@ -111,13 +111,22 @@ public final class VectorHelper {
 
   public static List<Pair<String, Double>> toWeightedTerms(Collection<Pair<Integer, Double>> entries,
                                                            final String[] dictionary) {
-    return Lists.newArrayList(Collections2.transform(entries,
-      new Function<Pair<Integer, Double>, Pair<String, Double>>() {
-        @Override
-        public Pair<String, Double> apply(Pair<Integer, Double> p) {
-          return Pair.of(dictionary[p.getFirst()], p.getSecond());
-        }
-      }));
+    if (dictionary != null)
+      return Lists.newArrayList(Collections2.transform(entries,
+        new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+          @Override
+          public Pair<String, Double> apply(Pair<Integer, Double> p) {
+            return Pair.of(dictionary[p.getFirst()], p.getSecond());
+          }
+        }));
+    else
+      return Lists.newArrayList(Collections2.transform(entries,
+        new Function<Pair<Integer, Double>, Pair<String, Double>>() {
+          @Override
+          public Pair<String, Double> apply(Pair<Integer, Double> p) {
+            return Pair.of(Integer.toString(p.getFirst()), p.getSecond());
+          }
+        }));
   }
 
   public static String vectorToJson(Vector vector, String[] dictionary, int maxEntries, boolean sort) {
