Author: robinanil
Date: Tue Mar 22 16:15:41 2011
New Revision: 1084235

URL: http://svn.apache.org/viewvc?rev=1084235&view=rev
Log:
MAHOUT-625 Fixing a support bug due to a dangling item in the header table. 
Adding tests based on retail data from http://fimi.ua.ac.be/data/. Contributed 
by Jaroslaw Odzga

Added:
    
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
    mahout/trunk/core/src/test/resources/retail.dat
    mahout/trunk/core/src/test/resources/retail_results_with_min_sup_100.dat
Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java?rev=1084235&r1=1084234&r2=1084235&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
 Tue Mar 22 16:15:41 2011
@@ -646,6 +646,7 @@ public class FPGrowth<A extends Comparab
           int prevNodeId = prevNode.get(parent);
           if (tree.childCount(prevNodeId) <= 1 && tree.childCount(nextNode) <= 
1) {
             tree.addCount(prevNodeId, tree.count(nextNode));
+            tree.addCount(nextNode, -1 * tree.count(nextNode));
             if (tree.childCount(nextNode) == 1) {
               tree.addChild(prevNodeId, tree.childAtIndex(nextNode, 0));
               tree.setParent(tree.childAtIndex(nextNode, 0), prevNodeId);

Added: 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java?rev=1084235&view=auto
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
 (added)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
 Tue Mar 22 16:15:41 2011
@@ -0,0 +1,154 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.StringRecordIterator;
+import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater;
+import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+
+/**
+ * Exercises {@link FPGrowth} on the <code>retail.dat</code> test resource.
+ */
+public class FPGrowthRetailDataTest {
+
+  /**
+   * Counts, by a direct scan of <code>retail.dat</code>, the transactions that contain
+   * the items 41, 36 and 39 together, and asserts that FPGrowth (run with 500 as its
+   * support argument) reports that same count for the item set {41, 36, 39}.
+   *
+   * @throws IOException if the test resource cannot be read
+   */
+  @Test
+  public void testSpecificCaseFromRetailDataMinSup500() throws IOException {
+    FPGrowth<String> fp = new FPGrowth<String>();
+
+    // Reference value: brute-force count of the transactions holding all three items.
+    StringRecordIterator it = new StringRecordIterator(new FileLineIterable(Resources.getResource(
+      "retail.dat").openStream()), "\\s+");
+    int expectedSupport = 0;
+    while (it.hasNext()) {
+      Pair<List<String>,Long> next = it.next();
+      List<String> items = next.getFirst();
+      if (items.contains("41") && items.contains("36") && items.contains("39")) {
+        expectedSupport++;
+      }
+    }
+
+    // Filled by the collector below: item set -> support reported by FPGrowth.
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+
+    Set<String> returnableFeatures = new HashSet<String>();
+    returnableFeatures.add("41");
+    returnableFeatures.add("36");
+    returnableFeatures.add("39");
+
+    // The resource is opened once more for each pass: one stream feeds generateFList,
+    // the other feeds the pattern mining itself.
+    fp.generateTopKFrequentPatterns(
+      new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"),
+
+      fp.generateFList(new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat")
+          .openStream()), "\\s+"), 500), 500, 1000, returnableFeatures,
+      new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+
+        @Override
+        public void collect(String key, List<Pair<List<String>,Long>> value) throws IOException {
+          // Key each pattern by its item set so the lookup below ignores item order.
+          for (Pair<List<String>,Long> v : value) {
+            List<String> l = v.getFirst();
+            results.put(new HashSet<String>(l), v.getSecond());
+          }
+        }
+
+      }, new StatusUpdater() {
+
+        @Override
+        public void update(String status) {}
+      });
+
+    // Long.valueOf instead of the boxing constructor; the map values are Long.
+    assertEquals(Long.valueOf(expectedSupport), results.get(returnableFeatures));
+
+  }
+
+  // NOTE(review): the test below was committed disabled (commented out); the reason
+  // is not recorded in this file.
+  /*
+  @Test
+  public void testRetailDataMinSup100() throws IOException {
+    StringRecordIterator it = new StringRecordIterator(new FileLineIterable(Resources.getResource(
+      "retail_results_with_min_sup_100.dat").openStream()), "\\s+");
+    final Map<Set<String>,Long> expectedResults = new HashMap<Set<String>,Long>();
+    while (it.hasNext()) {
+      Pair<List<String>,Long> next = it.next();
+      List<String> items = new ArrayList<String>(next.getFirst());
+      String supportString = items.remove(items.size() - 1);
+      Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
+      expectedResults.put(new HashSet<String>(items), support);
+    }
+
+    FPGrowth<String> fp = new FPGrowth<String>();
+
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+
+    fp.generateTopKFrequentPatterns(
+      new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"),
+
+      fp.generateFList(new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat")
+          .openStream()), "\\s+"), 100), 100, 1000, null,
+      new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+
+        @Override
+        public void collect(String key, List<Pair<List<String>,Long>> value) throws IOException {
+
+          for (Pair<List<String>,Long> v : value) {
+            List<String> l = v.getFirst();
+            results.put(new HashSet<String>(l), v.getSecond());
+          }
+        }
+      }, new StatusUpdater() {
+
+        @Override
+        public void update(String status) {}
+      });
+
+    assertEquals(expectedResults.size(), results.size());
+
+    for (Entry<Set<String>,Long> entry : results.entrySet()) {
+      Set<String> key = entry.getKey();
+      assertEquals(expectedResults.get(key), results.get(entry.getKey()));
+    }
+  }*/
+}

Added: 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java?rev=1084235&view=auto
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
 (added)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
 Tue Mar 22 16:15:41 2011
@@ -0,0 +1,159 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringRecordIterator;
+import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.Resources;
+
+/**
+ * Runs the {@link PFPGrowth} stages on the <code>retail.dat</code> test resource and compares
+ * the resulting patterns with <code>retail_results_with_min_sup_100.dat</code>.
+ */
+public class PFPGrowthRetailDataTest extends MahoutTestCase {
+
+  private final Parameters params = new Parameters();
+  private static final Logger log = LoggerFactory.getLogger(PFPGrowthRetailDataTest.class);
+
+  /**
+   * Fills {@link #params} and rewrites <code>retail.dat</code> as one comma-separated
+   * transaction per line into <code>test.txt</code> under the "transactions" test directory.
+   */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    params.set(PFPGrowth.MIN_SUPPORT, "100");
+    params.set(PFPGrowth.MAX_HEAPSIZE, "10000");
+    params.set(PFPGrowth.NUM_GROUPS, "50");
+    params.set(PFPGrowth.ENCODING, "UTF-8");
+    params.set(PFPGrowth.TREE_CACHE_SIZE, "5");
+    File inputDir = getTestTempDir("transactions");
+    File outputDir = getTestTempDir("frequentpatterns");
+    File input = new File(inputDir, "test.txt");
+    params.set(PFPGrowth.INPUT, input.getAbsolutePath());
+    params.set(PFPGrowth.OUTPUT, outputDir.getAbsolutePath());
+    Writer writer = new OutputStreamWriter(new FileOutputStream(input), Charset.forName("UTF-8"));
+    try {
+      StringRecordIterator it = new StringRecordIterator(new FileLineIterable(Resources.getResource(
+        "retail.dat").openStream()), "\\s+");
+
+      // Write each transaction as it is read; there is no need to hold the whole
+      // data set in memory first.
+      while (it.hasNext()) {
+        String sep = "";
+        for (String item : it.next().getFirst()) {
+          writer.write(sep + item);
+          sep = ",";
+        }
+        writer.write("\n");
+      }
+
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Runs the PFPGrowth stages in sequence and checks that the number of distinct patterns
+   * found equals the number of patterns listed in the reference results file. Differences
+   * between the two pattern sets are printed to standard output, but only the sizes are asserted.
+   */
+  @Test
+  public void testRetailDataMinSup100() throws IOException, InterruptedException, ClassNotFoundException {
+    StringRecordIterator it = new StringRecordIterator(new FileLineIterable(Resources.getResource(
+      "retail_results_with_min_sup_100.dat").openStream()), "\\s+");
+
+    final Map<Set<String>,Long> expectedResults = new HashMap<Set<String>,Long>();
+    while (it.hasNext()) {
+      Pair<List<String>,Long> next = it.next();
+      List<String> items = new ArrayList<String>(next.getFirst());
+      // The last token holds the support; its first and last characters are dropped before parsing.
+      String supportString = items.remove(items.size() - 1);
+      Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
+      expectedResults.put(new HashSet<String>(items), support);
+    }
+
+    log.info("Starting Parallel Counting Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startParallelCounting(params);
+    log.info("Starting Grouping Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startGroupingItems(params);
+    log.info("Starting Parallel FPGrowth Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    // NOTE(review): startGroupingItems already ran just above; this second call looks
+    // redundant -- confirm against PFPGrowth before removing it.
+    PFPGrowth.startGroupingItems(params);
+    PFPGrowth.startTransactionSorting(params);
+    PFPGrowth.startParallelFPGrowth(params);
+    log.info("Starting Pattern Aggregation Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startAggregating(params);
+    List<Pair<String,TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);
+
+    // Collect every returned pattern into item set -> support.
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+    for (Pair<String,TopKStringPatterns> topK : frequentPatterns) {
+      Iterator<Pair<List<String>,Long>> topKIt = topK.getSecond().iterator();
+      while (topKIt.hasNext()) {
+        Pair<List<String>,Long> entry = topKIt.next();
+        results.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
+      }
+    }
+
+    // Produced patterns: report those absent from the reference or disagreeing with it.
+    for (Entry<Set<String>,Long> entry : results.entrySet()) {
+      Set<String> key = entry.getKey();
+      Long expected = expectedResults.get(key);
+      if (expected == null) {
+        System.out.println("unexpected: " + key);
+      } else if (!expected.equals(entry.getValue())) {
+        System.out.println("invalid: " + key + ", expected: " + expected + ", got: " + entry.getValue());
+      }
+    }
+
+    // Reference patterns that were never produced.
+    for (Entry<Set<String>,Long> entry : expectedResults.entrySet()) {
+      Set<String> key = entry.getKey();
+      if (results.get(key) == null) {
+        System.out.println("missing: " + key);
+      }
+    }
+    assertEquals(expectedResults.size(), results.size());
+  }
+}


Reply via email to