n...

srowen Sat, 04 Jun 2011 04:14:58 -0700

Author: srowen
Date: Sat Jun  4 11:14:14 2011
New Revision: 1131377

URL: http://svn.apache.org/viewvc?rev=1131377&view=rev
Log:
MAHOUT-415 collocations filter for Lucene from Drew


Added:
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/
    
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java

Added: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java?rev=1131377&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
 (added)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
 Sat Jun  4 11:14:14 2011
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import com.google.common.base.Charsets;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Token filter that emits or suppresses each token of the wrapped stream
+ * according to whether the token's UTF-8 encoded term tests as a member of
+ * a bloom {@link Filter}.
+ */
+public class BloomTokenFilter extends TokenFilter {
+
+  /** Bloom filter every term is tested against. */
+  private final Filter filter;
+  /** Attribute from which the current token's text is read. */
+  private final TermAttribute termAtt;
+  /** UTF-8 encoder configured to report malformed or unmappable input. */
+  private final CharsetEncoder encoder;
+  /** Single key instance, overwritten with the bytes of each token. */
+  private final Key key;
+  /** Whether members (true) or non-members (false) are emitted. */
+  private final boolean keepMembers;
+
+  /**
+   * @param filter tokens will be checked for membership in this bloom filter
+   * @param keepMembers keep members of the bloom filter? If true, works like
+   *   a whitelist: members found in the filter are kept and all others are
+   *   dropped. If false, works like a stoplist: members found in the
+   *   filter are dropped and all others are kept.
+   * @param in the token stream to read.
+   */
+  public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
+    super(in);
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.encoder = Charsets.UTF_8.newEncoder()
+        .onMalformedInput(CodingErrorAction.REPORT)
+        .onUnmappableCharacter(CodingErrorAction.REPORT);
+    this.key = new Key();
+    this.filter = filter;
+    this.keepMembers = keepMembers;
+  }
+
+  /**
+   * Advances the wrapped stream until it yields a token whose bloom filter
+   * membership equals {@code keepMembers}, skipping every other token.
+   *
+   * @return true if such a token is now current, false once the wrapped
+   *   stream is exhausted
+   * @throws IOException if the wrapped stream fails or a term cannot be
+   *   encoded as UTF-8
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      CharBuffer term = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength());
+      ByteBuffer encoded = encoder.encode(term);
+      // NOTE(review): array() hands over the buffer's whole backing array,
+      // which may be longer than the encoded bytes; keys placed in the
+      // filter presumably have to be built the same way -- confirm.
+      key.set(encoded.array(), 1.0f);
+      if (filter.membershipTest(key) == keepMembers) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}

Added: 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java?rev=1131377&view=auto
==============================================================================
--- 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
 (added)
+++ 
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
 Sat Jun  4 11:14:14 2011
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+
+import com.google.common.base.Charsets;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.hadoop.util.hash.Hash;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.mahout.utils.MahoutTestCase;
+import org.junit.Test;
+
+/**
+ * Tests {@link BloomTokenFilter}: the underlying bloom filter on its own,
+ * the filter wrapped around a whitespace analyzer in both keep and drop
+ * modes, and the filter wrapped around a {@link ShingleFilter}.
+ */
+public final class BloomTokenFilterTest extends MahoutTestCase {
+
+  /** Encoder used to turn test strings into bloom filter keys. */
+  private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
+
+  /** Text fed to every analyzer in this test. */
+  private static final String input = "The best of times the worst of times";
+  /** Tokens expected from the unfiltered whitespace analyzer. */
+  private static final String[] allTokens = {
+      "The", "best", "of", "times", "the", "worst", "of", "times"
+  };
+  /** Tokens expected when members of {@link #filterTokens} are dropped. */
+  private static final String[] expectedNonKeepTokens = {
+      "best", "times", "the", "worst", "times"
+  };
+  /** Tokens expected when only members of {@link #filterTokens} are kept. */
+  private static final String[] expectedKeepTokens = { "The", "of", "of" };
+  /** Terms added to the bloom filter for the single-token tests. */
+  private static final String[] filterTokens    = { "The", "of" };
+  /** Terms that must not test as members of that bloom filter. */
+  private static final String[] notFilterTokens = { "best", "worst", "the", "times"};
+  /** Shingles added to the bloom filter for the shingle test. */
+  private static final String[] shingleKeepTokens = {
+      "The best", "best of times", "the worst", "worst of times", "of times"
+  };
+  /** Shingles expected to survive the whitelist, in stream order. */
+  private static final String[] expectedShingleTokens = {
+      "The best", "best of times", "of times", "the worst", "worst of times", "of times"
+  };
+
+  /** test standalone filter without tokenfilter wrapping */
+  @Test
+  public void testFilter() throws IOException {
+    Filter filter = getFilter(filterTokens);
+    Key k = new Key();
+    for (String s: filterTokens) {
+      setKey(k,s);
+      assertTrue("Key for string " + s + " should be filter member", filter.membershipTest(k));
+    }
+
+    for (String s: notFilterTokens)  {
+      setKey(k,s);
+      assertFalse("Key for string " + s + " should not be filter member", filter.membershipTest(k));
+    }
+  }
+
+  /** normal case, unfiltered analyzer */
+  @Test
+  public void testAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    validateTokens(allTokens, ts);
+  }
+
+  /** filtered analyzer: tokens found in the bloom filter are dropped */
+  @Test
+  public void testNonKeepdAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    BloomTokenFilter f =
+        new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
+    validateTokens(expectedNonKeepTokens, f);
+  }
+
+  /** keep analyzer: only tokens found in the bloom filter are emitted */
+  @Test
+  public void testKeepAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    BloomTokenFilter f =
+        new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
+    validateTokens(expectedKeepTokens, f);
+  }
+
+  /** shingles, keep those matching whitelist */
+  @Test
+  public void testShingleFilteredAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    ShingleFilter sf = new ShingleFilter(ts, 3);
+    BloomTokenFilter f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
+    validateTokens(expectedShingleTokens, f);
+  }
+
+  /**
+   * Loads the UTF-8 encoding of {@code s} into {@code k} with weight 1.0.
+   * BloomTokenFilter builds its keys from the encoder's backing array in the
+   * same way, so the two encodings must stay in step.
+   */
+  private static void setKey(Key k, String s) throws IOException {
+    ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
+    k.set(buffer.array(), 1.0);
+  }
+
+  /**
+   * Asserts that {@code ts} produces exactly the terms in {@code expected},
+   * in order.
+   */
+  private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
+    int pos = 0;
+    while (ts.incrementToken()) {
+      // Strictly less-than: pos == expected.length already means one token
+      // too many, and must fail here rather than as an
+      // ArrayIndexOutOfBoundsException on the lookup below.
+      assertTrue("Analyzer produced too many tokens", pos < expected.length);
+      TermAttribute termAttr = ts.getAttribute(TermAttribute.class);
+      assertEquals("Unexpected term", expected[pos++], termAttr.term());
+    }
+    assertEquals("Analyzer produced too few terms", expected.length, pos);
+  }
+
+  /** Builds a bloom filter containing a key for each of {@code tokens}. */
+  private static Filter getFilter(String[] tokens) throws IOException {
+    Filter filter = new BloomFilter(100,50, Hash.JENKINS_HASH);
+    Key k = new Key();
+    for (String s: tokens) {
+      setKey(k,s);
+      filter.add(k);
+    }
+    return filter;
+  }
+
+}

svn commit: r1131377 - in /mahout/trunk/integration/src: main/java/org/apache/mahout/utils/nlp/collocations/ main/java/org/apache/mahout/utils/nlp/collocations/llr/ test/java/org/apache/mahout/utils/nlp/collocations/ test/java/org/apache/mahout/utils/n...

Reply via email to