Updated Branches: refs/heads/master 3eb2d3f3b -> ad90b151d
CRUNCH-75: Added BloomFilters in crunch-contrib Project: http://git-wip-us.apache.org/repos/asf/incubator-crunch/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-crunch/commit/ad90b151 Tree: http://git-wip-us.apache.org/repos/asf/incubator-crunch/tree/ad90b151 Diff: http://git-wip-us.apache.org/repos/asf/incubator-crunch/diff/ad90b151 Branch: refs/heads/master Commit: ad90b151d9114cebe5382f678170e4a5bee1c119 Parents: 3eb2d3f Author: Rahul Sharma <[email protected]> Authored: Wed Oct 10 22:00:07 2012 +0530 Committer: Rahul Sharma <[email protected]> Committed: Wed Oct 10 22:00:07 2012 +0530 ---------------------------------------------------------------------- crunch-contrib/pom.xml | 63 + .../crunch/contrib/bloomfilter/BloomFiltersIT.java | 61 + crunch-contrib/src/it/resources/shakes.txt | 3667 +++++++++++++++ .../contrib/bloomfilter/BloomFilterFactory.java | 109 + .../crunch/contrib/bloomfilter/BloomFilterFn.java | 68 + .../crunch/contrib/bloomfilter/package-info.java | 24 + .../org/apache/crunch/contrib/package-info.java | 25 + crunch-dist/pom.xml | 4 + pom.xml | 7 + 9 files changed, 4028 insertions(+), 0 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/pom.xml ---------------------------------------------------------------------- diff --git a/crunch-contrib/pom.xml b/crunch-contrib/pom.xml new file mode 100644 index 0000000..ef0c4ff --- /dev/null +++ b/crunch-contrib/pom.xml @@ -0,0 +1,63 @@ +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> +<project + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd" xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-parent</artifactId> + <version>0.4.0-incubating-SNAPSHOT</version> + </parent> + + <artifactId>crunch-contrib</artifactId> + <name>Apache Crunch Contrib</name> + + <dependencies> + + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.crunch</groupId> + <artifactId>crunch-test</artifactId> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <scope>test</scope> <!-- only needed for LocalJobRunner --> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <scope>provided</scope> + </dependency> + + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.codehaus.mojo</groupId> + <artifactId>build-helper-maven-plugin</artifactId> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/incubator-crunch/blob/ad90b151/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java ---------------------------------------------------------------------- diff --git a/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java new file mode 100644 index 0000000..d91e07f --- /dev/null +++ b/crunch-contrib/src/it/java/org/apache/crunch/contrib/bloomfilter/BloomFiltersIT.java @@ -0,0 +1,61 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.crunch.contrib.bloomfilter; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.StringUtils; +import org.apache.crunch.test.CrunchTestSupport; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.util.bloom.BloomFilter; +import org.apache.hadoop.util.bloom.Key; +import org.junit.Test; + +public class BloomFiltersIT extends CrunchTestSupport implements Serializable { + + @Test + public void testFilterCreation() throws IOException { + String inputPath = tempDir.copyResourceFileName("shakes.txt"); + BloomFilterFn<String> filterFn = new BloomFilterFn<String>() { + @Override + public Collection<Key> generateKeys(String input) { + List<String> parts = Arrays.asList(StringUtils.split(input, " ")); + Collection<Key> keys = new HashSet<Key>(); + for (String stringpart : parts) { + keys.add(new Key(stringpart.getBytes())); + } + return keys; + } + }; + Map<String, BloomFilter> filterValues = BloomFilterFactory.createFilter(new Path(inputPath), filterFn).getValue(); + assertEquals(1, filterValues.size()); + BloomFilter filter = filterValues.get("shakes.txt"); + assertTrue(filter.membershipTest(new Key("Mcbeth".getBytes()))); + assertTrue(filter.membershipTest(new Key("apples".getBytes()))); + } + +}
