Author: robinanil
Date: Tue Mar 22 16:15:41 2011
New Revision: 1084235
URL: http://svn.apache.org/viewvc?rev=1084235&view=rev
Log:
MAHOUT-625 Fixing support bug due to a dangling item in the header table.
Adding tests based on retail data from http://fimi.ua.ac.be/data/. Contributed
by Jaroslaw Odzga
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
mahout/trunk/core/src/test/resources/retail.dat
mahout/trunk/core/src/test/resources/retail_results_with_min_sup_100.dat
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java?rev=1084235&r1=1084234&r2=1084235&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FPGrowth.java
Tue Mar 22 16:15:41 2011
@@ -646,6 +646,7 @@ public class FPGrowth<A extends Comparab
int prevNodeId = prevNode.get(parent);
if (tree.childCount(prevNodeId) <= 1 && tree.childCount(nextNode) <=
1) {
tree.addCount(prevNodeId, tree.count(nextNode));
+ tree.addCount(nextNode, -1 * tree.count(nextNode));
if (tree.childCount(nextNode) == 1) {
tree.addChild(prevNodeId, tree.childAtIndex(nextNode, 0));
tree.setParent(tree.childAtIndex(nextNode, 0), prevNodeId);
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java?rev=1084235&view=auto
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
(added)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/FPGrowthRetailDataTest.java
Tue Mar 22 16:15:41 2011
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.StringRecordIterator;
+import org.apache.mahout.fpm.pfpgrowth.convertors.StatusUpdater;
+import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth;
+import org.junit.Test;
+
+import com.google.common.io.Resources;
+
+/**
+ * Runs {@link FPGrowth} over the {@code retail.dat} test resource and compares the
+ * support it reports with a support counted directly from the transactions.
+ */
+public class FPGrowthRetailDataTest {
+
+  /**
+   * Opens a fresh iterator over {@code retail.dat}, splitting each line on whitespace.
+   *
+   * @return a new record iterator positioned at the first transaction
+   * @throws IOException if the resource stream cannot be opened
+   */
+  private static StringRecordIterator retailTransactions() throws IOException {
+    return new StringRecordIterator(
+        new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+");
+  }
+
+  /**
+   * Counts, by a direct scan, the transactions that contain items 41, 36 and 39, then
+   * calls {@code generateTopKFrequentPatterns} with those three items as the returnable
+   * features and expects the value collected for the set {41, 36, 39} to equal that count.
+   *
+   * @throws IOException if {@code retail.dat} cannot be read
+   */
+  @Test
+  public void testSpecificCaseFromRetailDataMinSup500() throws IOException {
+    FPGrowth<String> fp = new FPGrowth<String>();
+
+    // Reference value: how many transactions hold all three items.
+    long expectedSupport = 0;
+    StringRecordIterator it = retailTransactions();
+    while (it.hasNext()) {
+      List<String> items = it.next().getFirst();
+      if (items.contains("41") && items.contains("36") && items.contains("39")) {
+        expectedSupport++;
+      }
+    }
+
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+
+    Set<String> returnableFeatures = new HashSet<String>();
+    returnableFeatures.add("41");
+    returnableFeatures.add("36");
+    returnableFeatures.add("39");
+
+    // Each pass over the data needs its own iterator, so the resource is reopened
+    // for the transaction stream and again for the frequency list.
+    fp.generateTopKFrequentPatterns(
+        retailTransactions(),
+        fp.generateFList(retailTransactions(), 500),
+        500,
+        1000,
+        returnableFeatures,
+        new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+
+          @Override
+          public void collect(String key, List<Pair<List<String>,Long>> value) throws IOException {
+            // Key patterns by item set so that item order does not matter on lookup.
+            for (Pair<List<String>,Long> v : value) {
+              results.put(new HashSet<String>(v.getFirst()), v.getSecond());
+            }
+          }
+
+        },
+        new StatusUpdater() {
+
+          @Override
+          public void update(String status) {}
+        });
+
+    assertEquals(Long.valueOf(expectedSupport), results.get(returnableFeatures));
+
+  }
+
+  // NOTE(review): the test below was committed commented out; the reason is not
+  // recorded here - confirm before re-enabling or deleting it.
+  /*
+  @Test
+  public void testRetailDataMinSup100() throws IOException {
+    StringRecordIterator it = new StringRecordIterator(new FileLineIterable(Resources.getResource(
+      "retail_results_with_min_sup_100.dat").openStream()), "\\s+");
+    final Map<Set<String>,Long> expectedResults = new HashMap<Set<String>,Long>();
+    while (it.hasNext()) {
+      Pair<List<String>,Long> next = it.next();
+      List<String> items = new ArrayList<String>(next.getFirst());
+      String supportString = items.remove(items.size() - 1);
+      Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
+      expectedResults.put(new HashSet<String>(items), support);
+    }
+
+    FPGrowth<String> fp = new FPGrowth<String>();
+
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+
+    fp.generateTopKFrequentPatterns(
+      new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+"),
+
+      fp.generateFList(new StringRecordIterator(new FileLineIterable(Resources.getResource("retail.dat")
+          .openStream()), "\\s+"), 100), 100, 1000, null,
+      new OutputCollector<String,List<Pair<List<String>,Long>>>() {
+
+        @Override
+        public void collect(String key, List<Pair<List<String>,Long>> value) throws IOException {
+
+          for (Pair<List<String>,Long> v : value) {
+            List<String> l = v.getFirst();
+            results.put(new HashSet<String>(l), v.getSecond());
+          }
+        }
+      }, new StatusUpdater() {
+
+        @Override
+        public void update(String status) {}
+      });
+
+    assertEquals(expectedResults.size(), results.size());
+
+    for (Entry<Set<String>,Long> entry : results.entrySet()) {
+      Set<String> key = entry.getKey();
+      assertEquals(expectedResults.get(key), results.get(entry.getKey()));
+    }
+  }*/
+}
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java?rev=1084235&view=auto
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
(added)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/fpm/pfpgrowth/PFPGrowthRetailDataTest.java
Tue Mar 22 16:15:41 2011
@@ -0,0 +1,146 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.fpm.pfpgrowth;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.mahout.common.FileLineIterable;
+import org.apache.mahout.common.MahoutTestCase;
+import org.apache.mahout.common.Pair;
+import org.apache.mahout.common.Parameters;
+import org.apache.mahout.common.StringRecordIterator;
+import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.io.Resources;
+
+/**
+ * Runs the {@link PFPGrowth} stages over the {@code retail.dat} test resource and
+ * compares the patterns read back with {@code retail_results_with_min_sup_100.dat}.
+ */
+public class PFPGrowthRetailDataTest extends MahoutTestCase {
+
+  private static final Logger log = LoggerFactory.getLogger(PFPGrowthRetailDataTest.class);
+
+  private final Parameters params = new Parameters();
+
+  /**
+   * Fills in the PFPGrowth parameters and rewrites {@code retail.dat} into the
+   * temporary input file, one transaction per line with items joined by commas.
+   *
+   * @throws Exception if the superclass set-up or the file copy fails
+   */
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    params.set(PFPGrowth.MIN_SUPPORT, "100");
+    params.set(PFPGrowth.MAX_HEAPSIZE, "10000");
+    params.set(PFPGrowth.NUM_GROUPS, "50");
+    params.set(PFPGrowth.ENCODING, "UTF-8");
+    params.set(PFPGrowth.TREE_CACHE_SIZE, "5");
+    File inputDir = getTestTempDir("transactions");
+    File outputDir = getTestTempDir("frequentpatterns");
+    File input = new File(inputDir, "test.txt");
+    params.set(PFPGrowth.INPUT, input.getAbsolutePath());
+    params.set(PFPGrowth.OUTPUT, outputDir.getAbsolutePath());
+    Writer writer = new OutputStreamWriter(new FileOutputStream(input), Charset.forName("UTF-8"));
+    try {
+      StringRecordIterator it = new StringRecordIterator(
+          new FileLineIterable(Resources.getResource("retail.dat").openStream()), "\\s+");
+
+      // Write each transaction as soon as it is read; nothing else needs the full list.
+      while (it.hasNext()) {
+        String sep = "";
+        for (String item : it.next().getFirst()) {
+          writer.write(sep + item);
+          sep = ",";
+        }
+        writer.write("\n");
+      }
+
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Loads the expected pattern/support pairs, runs the PFPGrowth stages on the input
+   * written by {@link #setUp()}, and compares the two pattern sets. Differences are
+   * logged; only the number of distinct patterns is asserted.
+   *
+   * @throws IOException if a resource or job output cannot be read
+   * @throws InterruptedException declared by the PFPGrowth stage methods called here
+   * @throws ClassNotFoundException declared by the PFPGrowth stage methods called here
+   */
+  @Test
+  public void testRetailDataMinSup100()
+    throws IOException, InterruptedException, ClassNotFoundException {
+    StringRecordIterator it = new StringRecordIterator(new FileLineIterable(
+        Resources.getResource("retail_results_with_min_sup_100.dat").openStream()), "\\s+");
+
+    final Map<Set<String>,Long> expectedResults = new HashMap<Set<String>,Long>();
+    while (it.hasNext()) {
+      List<String> items = new ArrayList<String>(it.next().getFirst());
+      // The last token on each line is the support with one wrapping character on each
+      // side (presumably parentheses - confirm against the data file); strip both.
+      String supportString = items.remove(items.size() - 1);
+      Long support = Long.parseLong(supportString.substring(1, supportString.length() - 1));
+      expectedResults.put(new HashSet<String>(items), support);
+    }
+
+    log.info("Starting Parallel Counting Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startParallelCounting(params);
+    log.info("Starting Grouping Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startGroupingItems(params);
+    log.info("Starting Parallel FPGrowth Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    // NOTE(review): startGroupingItems already ran just above; this second call looks
+    // redundant - confirm it is safe to drop before removing it.
+    PFPGrowth.startGroupingItems(params);
+    PFPGrowth.startTransactionSorting(params);
+    PFPGrowth.startParallelFPGrowth(params);
+    log.info("Starting Pattern Aggregation Test: {}", params.get(PFPGrowth.MAX_HEAPSIZE));
+    PFPGrowth.startAggregating(params);
+    List<Pair<String,TopKStringPatterns>> frequentPatterns = PFPGrowth.readFrequentPattern(params);
+
+    // Key every mined pattern by its item set so that item order does not matter.
+    final Map<Set<String>,Long> results = new HashMap<Set<String>,Long>();
+    for (Pair<String,TopKStringPatterns> topK : frequentPatterns) {
+      Iterator<Pair<List<String>,Long>> topKIt = topK.getSecond().iterator();
+      while (topKIt.hasNext()) {
+        Pair<List<String>,Long> entry = topKIt.next();
+        results.put(new HashSet<String>(entry.getFirst()), entry.getSecond());
+      }
+    }
+
+    // Mined patterns that are not expected, or whose support differs.
+    for (Entry<Set<String>,Long> entry : results.entrySet()) {
+      Set<String> pattern = entry.getKey();
+      Long expected = expectedResults.get(pattern);
+      if (expected == null) {
+        log.warn("unexpected: {}", pattern);
+      } else if (!expected.equals(entry.getValue())) {
+        // Explicit array: also compiles against SLF4J versions without varargs overloads.
+        log.warn("invalid: {}, expected: {}, got: {}",
+            new Object[] {pattern, expected, entry.getValue()});
+      }
+    }
+
+    // Expected patterns that were not mined.
+    for (Set<String> pattern : expectedResults.keySet()) {
+      if (results.get(pattern) == null) {
+        log.warn("missing: {}", pattern);
+      }
+    }
+    assertEquals(expectedResults.size(), results.size());
+  }
+}