Author: srowen
Date: Fri Nov 4 23:49:38 2011
New Revision: 1197822
URL: http://svn.apache.org/viewvc?rev=1197822&view=rev
Log:
MAHOUT-838 add MatrixDumper
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java?rev=1197822&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/MatrixDumper.java
Fri Nov 4 23:49:38 2011
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.math.Matrix;
+import org.apache.mahout.math.MatrixWritable;
+
+/**
+ * Export a Matrix in various text formats:
+ * * CSV file
+ *
+ * Input format: Hadoop SequenceFile with Text key and MatrixWritable value, 1
pair
+ * TODO:
+ * Needs class for key value- should not hard-code to Text.
+ * Options for row and column headers- stats software can be picky.
+ * Assumes only one matrix in a file.
+ */
+public final class MatrixDumper extends AbstractJob {
+
+ private MatrixDumper() { }
+
+ public static void main(String[] args) throws Exception {
+ ToolRunner.run(new MatrixDumper(), args);
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+
+ addInputOption();
+ addOption("output", "o", "Output path", null); // AbstractJob output
feature requires param
+ Map<String, String> parsedArgs = parseArguments(args);
+ if (parsedArgs == null) {
+ return -1;
+ }
+ String outputFile = parsedArgs.containsKey("--output") ?
parsedArgs.get("--output") : null;
+ exportCSV(getInputPath(), outputFile, false);
+ return 0;
+ }
+
+ private static void exportCSV(Path inputPath, String outputFile, boolean
doLabels) throws IOException {
+ MatrixWritable mw = new MatrixWritable();
+ Text key = new Text();
+ readSeqFile(inputPath, key, mw);
+ PrintStream ps = getPrintStream(outputFile);
+ Matrix m = mw.get();
+ String[] columnLabels = getLabels(m.numCols(), m.getColumnLabelBindings(),
"col");
+ String[] rowLabels = getLabels(m.numRows(), m.getRowLabelBindings(),
"row");
+ if (doLabels) {
+ ps.print("rowid,");
+ ps.print(columnLabels[0]);
+ for(int c = 1; c < m.numCols(); c++) {
+ ps.print(',' + columnLabels[c]);
+ }
+ ps.println();
+ }
+ for(int r = 0; r < m.numRows(); r++) {
+ if (doLabels) {
+ ps.print(rowLabels[0] + ',');
+ }
+ ps.print(Double.toString(m.getQuick(r,0)));
+ for(int c = 1; c < m.numCols(); c++) {
+ ps.print(",");
+ ps.print(Double.toString(m.getQuick(r,c)));
+ }
+ ps.println();
+ }
+ if (ps != System.out) {
+ ps.close();
+ }
+ }
+
+ private static void readSeqFile(Path inputPath, Text key, MatrixWritable m)
throws IOException {
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, inputPath, conf);
+ reader.getMetadata();
+ reader.next(key, m);
+ }
+
+ private static PrintStream getPrintStream(String outputPath) throws
IOException {
+ if (outputPath != null) {
+ File outputFile = new File(outputPath);
+ if (outputFile.exists()) {
+ outputFile.delete();
+ }
+ outputFile.createNewFile();
+ OutputStream os = new FileOutputStream(outputFile);
+ return new PrintStream(os);
+ } else {
+ return System.out;
+ }
+ }
+
+ /**
+ * return the label set, sorted by matrix order
+ * if there are no labels, fabricate them using the starter string
+ * @param length
+ */
+ private static String[] getLabels(int length, Map<String,Integer> labels,
String start) {
+ if (labels != null) {
+ return sortLabels(labels);
+ } else {
+ String[] sorted = new String[length];
+ for(int i = 1; i <= length; i++) {
+ sorted[i] = start + i;
+ }
+ return sorted;
+ }
+ }
+
+ private static String[] sortLabels(Map<String,Integer> labels) {
+ String[] sorted = new String[labels.keySet().size()];
+ for(String label: labels.keySet()) {
+ Integer index = labels.get(label);
+ sorted[index] = label;
+ }
+ return sorted;
+ }
+
+}