Mahout SVD matrix factorization: converting the input matrix into the format of Mahout's distributed SVD solver
Purpose

The code below converts a matrix from CSV format:

<from row>,<to col>,<value>\n

into the input format of Mahout's SVD solver. For example, the 3x3 matrix:

 0    1.0  2.1
 3.0  4.0  5.0
-5.0  6.2  0

is given as input in a CSV file as:

1,0,3.0
2,0,-5.0
0,1,1.0
1,1,4.0
2,1,6.2
0,2,2.1
1,2,5.0

NOTE: I ASSUME THE MATRIX IS SORTED BY COLUMN ORDER.

(Note also that the parsing code below subtracts 1 from each index, i.e., it expects 1-based row and column indices in the CSV file; the listing above shows the resulting 0-based indices.)

This code is based on code by Danny Leshem, ContextIn.

Command line arguments:
args[0] - path to the CSV input file
args[1] - cardinality of the matrix (the length of each column vector)
args[2] - path to the resulting Mahout SVD input file

Method: the code goes over the CSV file and, for each matrix column, creates a SequentialAccessSparseVector that contains all the non-zero row entries for that column. It then appends the column vector to the output sequence file.

Compilation: copy the Java code below into a file named Convert2SVD.java and add both the Mahout and Hadoop jars to your IDE project path. Alternatively, a command line option for compilation is given below.

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.StringTokenizer;

import org.apache.mahout.math.SequentialAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;

/**
 * Code for converting CSV format to Mahout's SVD format.
 * @author Danny Bickson, CMU
 * Note: I ASSUME THE CSV FILE IS SORTED BY THE COLUMN (NAMELY THE SECOND FIELD).
 */
public class Convert2SVD {

    public static int Cardinality;

    /**
     * @param args[0] - input csv file
     * @param args[1] - cardinality (length of each column vector)
     * @param args[2] - output file for svd
     */
    public static void main(String[] args) {

        try {
            Cardinality = Integer.parseInt(args[1]);
            final Configuration conf = new Configuration();
            final FileSystem fs = FileSystem.get(conf);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
                    new Path(args[2]), IntWritable.class, VectorWritable.class, CompressionType.BLOCK);

            final IntWritable key = new IntWritable();
            final VectorWritable value = new VectorWritable();

            String thisLine;
            BufferedReader br = new BufferedReader(new FileReader(args[0]));
            Vector vector = null;
            int from = -1, to = -1;
            int last_to = -1;
            float val = 0;
            int total = 0;   // number of columns written
            int nnz = 0;     // number of non-zero entries read
            int e = 0;       // number of non-zero entries written
            int max_to = 0;
            int max_from = 0;

            while ((thisLine = br.readLine()) != null) {

                StringTokenizer st = new StringTokenizer(thisLine, ",");
                while (st.hasMoreTokens()) {
                    from = Integer.parseInt(st.nextToken()) - 1; // convert from 1-based to 0-based
                    to = Integer.parseInt(st.nextToken()) - 1;   // convert from 1-based to 0-based
                    val = Float.parseFloat(st.nextToken());
                    if (max_from < from) max_from = from;
                    if (max_to < to) max_to = to;
                    if (from < 0 || to < 0 || to >= Cardinality || val == 0.0)
                        throw new NumberFormatException("wrong data: from: " + from + " to: " + to + " val: " + val);
                }

                // we are working on an existing column: flush it before starting the next one
                if (last_to != to && last_to != -1) {
                    value.set(vector);
                    writer.append(key, value); // write the previous column vector
                    e += vector.getNumNondefaultElements();
                }
                // a new column is observed: open a new vector for it
                if (last_to != to) {
                    vector = new SequentialAccessSparseVector(Cardinality);
                    key.set(to);
                    total++;
                }

                vector.set(from, val);
                nnz++;

                if (nnz % 1000000 == 0) {
                    System.out.println("Col" + total + " nnz: " + nnz);
                }
                last_to = to;

            } // end while

            value.set(vector);
            writer.append(key, value); // write the last column
            e += vector.getNumNondefaultElements();
            total++;

            writer.close();
            System.out.println("Wrote a total of " + total + " cols, nnz: " + nnz);
            if (e != nnz)
                System.err.println("Bug: missing entries! We only wrote " + e);

            System.out.println("Highest column: " + max_to + " highest row: " + max_from);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
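To try the converter on the tiny 3x3 example from above before moving to a large dataset, you can use a small driver like the following. This is a minimal sketch of mine, not part of the original post; the class name Convert2SVDTest and the file names example.csv and example.seq are made up for illustration. Note the 1-based indices, since Convert2SVD subtracts 1 from each index it reads.

import java.io.FileWriter;
import java.io.PrintWriter;

/** Hypothetical smoke test (not in the original post) for Convert2SVD on the 3x3 example. */
public class Convert2SVDTest {
    public static void main(String[] args) throws Exception {
        // Write the 3x3 example matrix, sorted by the column (second) field,
        // using 1-based indices because Convert2SVD converts them to 0-based.
        PrintWriter out = new PrintWriter(new FileWriter("example.csv"));
        out.println("2,1,3.0");
        out.println("3,1,-5.0");
        out.println("1,2,1.0");
        out.println("2,2,4.0");
        out.println("3,2,6.2");
        out.println("1,3,2.1");
        out.println("2,3,5.0");
        out.close();

        // Cardinality 3 (each column vector has 3 entries), output file example.seq.
        Convert2SVD.main(new String[]{"example.csv", "3", "example.seq"});
    }
}

Running it should report 3 columns written with 7 non-zero entries, matching the example matrix.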
A second option to compile this file is to create a Makefile with the following in it:

all:
	javac -cp /mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/core-3.1.1.jar:/mnt/bigbrofs/usr7/bickson/mahout-0.4/taste-web/target/mahout-taste-webapp-0.5-SNAPSHOT/WEB-INF/lib/mahout-core-0.5-SNAPSHOT.jar:/mnt/bigbrofs/usr7/bickson/mahout-0.4/taste-web/target/mahout-taste-webapp-0.5-SNAPSHOT/WEB-INF/lib/mahout-math-0.5-SNAPSHOT.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/commons-cli-1.2.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/hadoop-0.20.2-core.jar *.java

Note that you will have to change the locations of the jars to point to where your jars are stored.

Example of running this conversion on the Netflix data:

java -cp .:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/core-3.1.1.jar:/mnt/bigbrofs/usr7/bickson/mahout-0.4/taste-web/target/mahout-taste-webapp-0.5-SNAPSHOT/WEB-INF/lib/mahout-core-0.5-SNAPSHOT.jar:/mnt/bigbrofs/usr7/bickson/mahout-0.4/taste-web/target/mahout-taste-webapp-0.5-SNAPSHOT/WEB-INF/lib/mahout-math-0.5-SNAPSHOT.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/commons-cli-1.2.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/hadoop-0.20.2-core.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/commons-logging-1.0.4.jar:/mnt/bigbrofs/usr7/bickson/hadoop-0.20.2/lib/commons-logging-api-1.0.4.jar Convert2SVD ../../netflixe.csv 17770 netflixe.seq

Aug 23, 2011 1:16:06 PM org.apache.hadoop.util.NativeCodeLoader <clinit>
WARNING: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Aug 23, 2011 1:16:06 PM org.apache.hadoop.io.compress.CodecPool getCompressor
INFO: Got brand-new compressor
Row241 nnz: 1000000
Row381 nnz: 2000000
Row571 nnz: 3000000
Row789 nnz: 4000000
Row1046 nnz: 5000000
Row1216 nnz: 6000000
Row1441 nnz: 7000000
...
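To verify what was written, you can read the resulting sequence file back and print each column vector. This is a minimal sketch, not part of the original post; the class name DumpSVDInput is made up, and the sketch assumes the same Hadoop 0.20 / Mahout 0.4-0.5 era APIs used above (SequenceFile.Reader, VectorWritable, Vector.iterateNonZero()).

import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;

/** Hypothetical helper (not in the original post): dumps a sequence file written by Convert2SVD. */
public class DumpSVDInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // args[0] - path to the sequence file written by Convert2SVD
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
        IntWritable key = new IntWritable();
        VectorWritable value = new VectorWritable();
        while (reader.next(key, value)) {
            Vector column = value.get();
            System.out.print("column " + key.get() + ":");
            // print (row, value) pairs for the non-zero entries of this column
            Iterator<Vector.Element> it = column.iterateNonZero();
            while (it.hasNext()) {
                Vector.Element el = it.next();
                System.out.print(" (" + el.index() + "," + el.get() + ")");
            }
            System.out.println();
        }
        reader.close();
    }
}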
Command line arguments for Mahout's SVD solver itself are documented at https://cwiki.apache.org/MAHOUT/dimensional-reduction.html.

NOTE: You may also want to check out GraphLab's collaborative filtering library: http://graphlab.org/pmf.html. GraphLab has an SVD solver that is 100% input-compatible with Mahout's, with performance gains of up to 50x. I have written Java code to convert Mahout sequence files into GraphLab's format and back. Email me and I will send you the code.
