Author: srowen
Date: Fri Jun 22 11:58:13 2012
New Revision: 1352857

URL: http://svn.apache.org/viewvc?rev=1352857&view=rev
Log:
MAHOUT-985: ignore ARFF instance weights (the trailing "{n}" after an instance) and handle missing values ("?") correctly

Modified:
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFIterator.java
 Fri Jun 22 11:58:13 2012
@@ -32,6 +32,7 @@ final class ARFFIterator extends Abstrac
   // This pattern will make sure a , inside a string is not a point for split.
   // Ex: "Arizona" , "0:08 PM, PDT" , 110 will be split considering "0:08 PM, 
PDT" as one string
   private static final Pattern COMMA_PATTERN = 
Pattern.compile(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)");
+  private static final Pattern WORDS_WITHOUT_SPARSE = 
Pattern.compile("([\\w[^{]])*");
 
   private final BufferedReader reader;
   private final ARFFModel model;
@@ -60,7 +61,7 @@ final class ARFFIterator extends Abstrac
     }
     Vector result;
     if (line.startsWith(ARFFModel.ARFF_SPARSE)) {
-      line = line.substring(1, line.length() - 1);
+      line = line.substring(1, line.indexOf(ARFFModel.ARFF_SPARSE_END));
       String[] splits = COMMA_PATTERN.split(line);
       result = new RandomAccessSparseVector(model.getLabelSize());
       for (String split : splits) {
@@ -68,13 +69,19 @@ final class ARFFIterator extends Abstrac
         int idIndex = split.indexOf(' ');
         int idx = Integer.parseInt(split.substring(0, idIndex).trim());
         String data = split.substring(idIndex).trim();
-        result.setQuick(idx, model.getValue(data, idx));
+        if (!"?".equals(data)) {
+          result.setQuick(idx, model.getValue(data, idx));
+        }
       }
     } else {
       result = new DenseVector(model.getLabelSize());
       String[] splits = COMMA_PATTERN.split(line);
       for (int i = 0; i < splits.length; i++) {
-        result.setQuick(i, model.getValue(splits[i], i));
+        String split = splits[i];
+        split = split.trim();
+        if (WORDS_WITHOUT_SPARSE.matcher(split).matches() && 
!"?".equals(split)) {
+          result.setQuick(i, model.getValue(split, i));
+        }
       }
     }
     //result.setLabelBindings(labelBindings);

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/arff/ARFFModel.java
 Fri Jun 22 11:58:13 2012
@@ -28,6 +28,7 @@ import java.util.Map;
  */
 public interface ARFFModel {
   String ARFF_SPARSE = "{"; //indicates the vector is sparse
+  String ARFF_SPARSE_END = "}";
   String ARFF_COMMENT = "%";
   String ATTRIBUTE = "@attribute";
   String DATA = "@data";

Modified: 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java?rev=1352857&r1=1352856&r2=1352857&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
 (original)
+++ 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/arff/ARFFVectorIterableTest.java
 Fri Jun 22 11:58:13 2012
@@ -75,24 +75,36 @@ public final class ARFFVectorIterableTes
   public void testDense() throws Exception {
     ARFFModel model = new MapBackedARFFModel();
     Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_DENSE_ARFF, 
model);
+    Vector firstVector = iterable.iterator().next();
+    assertEquals(1.0, firstVector.get(0), 0);
+    assertEquals(65.0, firstVector.get(1), 0);
+    assertEquals(1.0, firstVector.get(3), 0);
+    assertEquals(1.0, firstVector.get(4), 0);
+    
     int count = 0;
     for (Vector vector : iterable) {
       assertTrue("Vector is not dense", vector instanceof DenseVector);
       count++;
     }
-    assertEquals(10, count);
+    assertEquals(5, count);
   }
 
   @Test
   public void testSparse() throws Exception {
     ARFFModel model = new MapBackedARFFModel();
     Iterable<Vector> iterable = new ARFFVectorIterable(SAMPLE_SPARSE_ARFF, 
model);
+    
+    Vector firstVector = iterable.iterator().next();
+    assertEquals(23.1, firstVector.get(1), 0);
+    assertEquals(3.23, firstVector.get(2), 0);
+    assertEquals(1.2, firstVector.get(3), 0);
+    
     int count = 0;
     for (Vector vector : iterable) {
       assertTrue("Vector is not dense", vector instanceof 
RandomAccessSparseVector);
       count++;
     }
-    assertEquals(10, count);
+    assertEquals(9, count);
   }
 
   @Test
@@ -212,17 +224,19 @@ public final class ARFFVectorIterableTes
   }
   
   private static final String SAMPLE_DENSE_ARFF = "   % Comments\n" + "   % 
\n" + "   % Comments go here"
-                                                  + "   % \n" + "   @RELATION 
Mahout\n" + '\n'
-                                                  + "   @ATTRIBUTE foo  
NUMERIC\n"
-                                                  + "   @ATTRIBUTE bar   
NUMERIC\n"
-                                                  + "   @ATTRIBUTE hockey  
NUMERIC\n"
-                                                  + "   @ATTRIBUTE football   
NUMERIC\n" + "  \n" + '\n'
-                                                  + '\n' + "   @DATA\n" + "   
23.1,3.23,1.2,0.2\n"
-                                                  + "   2.9,3.0,1.2,0.2\n" + " 
  2.7,3.2,1.3,0.2\n"
-                                                  + "   2.6,3.1,1.23,0.2\n" + 
"   23.0,3.6,1.2,0.2\n"
-                                                  + "   23.2,3.9,1.7,0.2\n" + 
"   2.6,3.2,1.2,0.3\n"
-                                                  + "   23.0,3.2,1.23,0.2\n" + 
"   2.2,2.9,1.2,0.2\n"
-                                                  + "   2.9,3.1,1.23,0.1\n";
+                                                  + "   % \n" + "   @RELATION 
golf\n" + '\n'
+                                                  + "   @ATTRIBUTE outlook 
{sunny,overcast, rain}\n"
+                                                  + "   @ATTRIBUTE temperature 
  NUMERIC\n"
+                                                  + "   @ATTRIBUTE humidity  
NUMERIC\n"
+                                                  + "   @ATTRIBUTE windy 
{false, true}\n" 
+                                                  + "   @ATTRIBUTE class 
{dont_play, play}\n" + "  \n" + '\n'  
+                                                  + '\n' + "   @DATA\n" 
+                                                  + "   sunny,    65, ?, 
false, dont_play, {2} \n"
+                                                  + "   sunny,    80, 90,  
true, dont_play\n" 
+                                                  + "   overcast, 83, 78, 
false, play ,{3} \n"
+                                                  + "   rain,     70, 96, 
false, play\n" 
+                                                  + "   rain,     68, 80, 
false, play\n"
+                                                  + "   rain,     65, 70, 
true, play\n";
   
   private static final String SAMPLE_SPARSE_ARFF = "   % Comments\n" + "   % 
\n" + "   % Comments go here"
                                                    + "   % \n" + "   @RELATION 
Mahout\n" + '\n'
@@ -231,8 +245,8 @@ public final class ARFFVectorIterableTes
                                                    + "   @ATTRIBUTE hockey  
NUMERIC\n"
                                                    + "   @ATTRIBUTE football   
NUMERIC\n"
                                                    + "   @ATTRIBUTE tennis   
NUMERIC\n" + "  \n" + '\n'
-                                                   + '\n' + "   @DATA\n" + "   
{1 23.1,2 3.23,3 1.2,4 0.2}\n"
-                                                   + "   {0 2.9}\n" + "   {0 
2.7,2 3.2,3 1.3,4 0.2}\n"
+                                                   + '\n' + "   @DATA\n" + "   
{1 23.1,2 3.23,3 1.2,4 ?} {5}\n"
+                                                   + "   {0 2.9}\n" + "   {0 
2.7,2 3.2,3 1.3,4 0.2} {10} \n"
                                                    + "   {1 2.6,2 3.1,3 1.23,4 
0.2}\n"
                                                    + "   {1 23.0,2 3.6,3 1.2,4 
0.2}\n"
                                                    + "   {0 23.2,1 3.9,3 1.7,4 
0.2}\n"


Reply via email to