Author: srowen
Date: Sat Jun 4 11:14:14 2011
New Revision: 1131377
URL: http://svn.apache.org/viewvc?rev=1131377&view=rev
Log:
MAHOUT-415 collocations filter for Lucene from Drew
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
Added:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java?rev=1131377&view=auto
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
(added)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilter.java
Sat Jun 4 11:14:14 2011
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+
+import com.google.common.base.Charsets;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Emits tokens based on bloom filter membership.
+ */
+public class BloomTokenFilter extends TokenFilter {
+
+  private final Filter filter;
+  private final boolean keepMembers;
+  private final TermAttribute termAtt;
+  private final CharsetEncoder encoder;
+  private final Key key;
+
+  /**
+   * Wraps a token stream so that only tokens passing a bloom filter check are emitted.
+   *
+   * @param filter bloom filter each token is tested against
+   * @param keepMembers if true, works like a whitelist: tokens found in the filter are
+   *   kept and all others are dropped. If false, works like a stoplist: tokens found
+   *   in the filter are dropped and all others are kept.
+   * @param in the token stream to read
+   */
+  public BloomTokenFilter(Filter filter, boolean keepMembers, TokenStream in) {
+    super(in);
+    this.filter = filter;
+    this.keepMembers = keepMembers;
+    this.termAtt = addAttribute(TermAttribute.class);
+    this.key = new Key();
+    // Strict UTF-8 encoder: malformed or unmappable input is reported, not replaced.
+    CharsetEncoder utf8 = Charsets.UTF_8.newEncoder();
+    utf8.onMalformedInput(CodingErrorAction.REPORT);
+    utf8.onUnmappableCharacter(CodingErrorAction.REPORT);
+    this.encoder = utf8;
+  }
+
+  /**
+   * Advances the wrapped stream to the next token whose bloom filter membership
+   * matches {@code keepMembers}.
+   *
+   * @return true if such a token was found, false once the input is exhausted
+   * @throws IOException if the wrapped stream or the UTF-8 encoding of a term fails
+   */
+  @Override
+  public boolean incrementToken() throws IOException {
+    while (input.incrementToken()) {
+      CharBuffer term = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength());
+      ByteBuffer encoded = encoder.encode(term);
+      // NOTE(review): array() hands over the buffer's whole backing array, which may
+      // extend past the encoded bytes; confirm filters are populated the same way.
+      key.set(encoded.array(), 1.0f);
+      boolean member = filter.membershipTest(key);
+      if (member == keepMembers) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+}
Added:
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java?rev=1131377&view=auto
==============================================================================
---
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
(added)
+++
mahout/trunk/integration/src/test/java/org/apache/mahout/utils/nlp/collocations/llr/BloomTokenFilterTest.java
Sat Jun 4 11:14:14 2011
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.mahout.utils.nlp.collocations.llr;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetEncoder;
+
+import com.google.common.base.Charsets;
+import org.apache.hadoop.util.bloom.BloomFilter;
+import org.apache.hadoop.util.bloom.Filter;
+import org.apache.hadoop.util.bloom.Key;
+import org.apache.hadoop.util.hash.Hash;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.mahout.utils.MahoutTestCase;
+import org.junit.Test;
+
+public final class BloomTokenFilterTest extends MahoutTestCase {
+
+  // Shared UTF-8 encoder used by setKey to turn strings into bloom filter keys.
+  private static final CharsetEncoder encoder = Charsets.UTF_8.newEncoder();
+
+  // Sample text and the tokens the whitespace analyzer is expected to produce for it.
+  private static final String input = "The best of times the worst of times";
+  private static final String[] allTokens = {
+    "The", "best", "of", "times", "the", "worst", "of", "times"
+  };
+  // Expected output when tokens matching the filterTokens filter are dropped.
+  private static final String[] expectedNonKeepTokens = {
+    "best", "times", "the", "worst", "times"
+  };
+  // Expected output when only tokens matching the filterTokens filter are kept.
+  private static final String[] expectedKeepTokens = { "The", "of", "of" };
+  // Tokens added to the bloom filter, and tokens that must not test as members.
+  private static final String[] filterTokens = { "The", "of" };
+  private static final String[] notFilterTokens = { "best", "worst", "the", "times" };
+  // Shingles added to the bloom filter for the shingle test, and the expected output.
+  private static final String[] shingleKeepTokens = {
+    "The best", "best of times", "the worst", "worst of times", "of times"
+  };
+  private static final String[] expectedShingleTokens = {
+    "The best", "best of times", "of times", "the worst", "worst of times", "of times"
+  };
+
+  /** test standalone filter without tokenfilter wrapping */
+  @Test
+  public void testFilter() throws IOException {
+    Filter filter = getFilter(filterTokens);
+    Key k = new Key();
+    for (String s : filterTokens) {
+      setKey(k, s);
+      assertTrue("Key for string " + s + " should be filter member",
+          filter.membershipTest(k));
+    }
+
+    for (String s : notFilterTokens) {
+      setKey(k, s);
+      assertFalse("Key for string " + s + " should not be filter member",
+          filter.membershipTest(k));
+    }
+  }
+
+  /** normal case, unfiltered analyzer */
+  @Test
+  public void testAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    validateTokens(allTokens, ts);
+  }
+
+  /** filtered analyzer: tokens found in the bloom filter are dropped */
+  @Test
+  public void testNonKeepdAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    BloomTokenFilter f =
+        new BloomTokenFilter(getFilter(filterTokens), false /* toss matching tokens */, ts);
+    validateTokens(expectedNonKeepTokens, f);
+  }
+
+  /** keep analyzer: only tokens found in the bloom filter are emitted */
+  @Test
+  public void testKeepAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    BloomTokenFilter f =
+        new BloomTokenFilter(getFilter(filterTokens), true /* keep matching tokens */, ts);
+    validateTokens(expectedKeepTokens, f);
+  }
+
+  /** shingles, keep those matching whitelist */
+  @Test
+  public void testShingleFilteredAnalyzer() throws IOException {
+    StringReader reader = new StringReader(input);
+    WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
+    TokenStream ts = analyzer.tokenStream(null, reader);
+    ShingleFilter sf = new ShingleFilter(ts, 3);
+    BloomTokenFilter f = new BloomTokenFilter(getFilter(shingleKeepTokens), true, sf);
+    validateTokens(expectedShingleTokens, f);
+  }
+
+  /**
+   * Encodes {@code s} with the shared UTF-8 encoder and stores the resulting buffer's
+   * backing array in {@code k}, the same way BloomTokenFilter builds its keys.
+   */
+  private static void setKey(Key k, String s) throws IOException {
+    ByteBuffer buffer = encoder.encode(CharBuffer.wrap(s.toCharArray()));
+    k.set(buffer.array(), 1.0);
+  }
+
+  /**
+   * Asserts that {@code ts} yields exactly the terms in {@code expected}, in order.
+   *
+   * @param expected the terms the stream must produce
+   * @param ts the token stream to drain
+   */
+  private static void validateTokens(String[] expected, TokenStream ts) throws IOException {
+    int pos = 0;
+    while (ts.incrementToken()) {
+      // Strict bound, checked before indexing: a surplus token must fail with this
+      // message rather than an ArrayIndexOutOfBoundsException on expected[pos].
+      assertTrue("Analyzer produced too many tokens", pos < expected.length);
+      TermAttribute termAttr = ts.getAttribute(TermAttribute.class);
+      assertEquals("Unexpected term", expected[pos++], termAttr.term());
+    }
+    assertEquals("Analyzer produced too few terms", expected.length, pos);
+  }
+
+  /** Builds a bloom filter and adds a key for each of the given tokens. */
+  private static Filter getFilter(String[] tokens) throws IOException {
+    Filter filter = new BloomFilter(100, 50, Hash.JENKINS_HASH);
+    Key k = new Key();
+    for (String s : tokens) {
+      setKey(k, s);
+      filter.add(k);
+    }
+    return filter;
+  }
+
+}