BIGTOP-3086: Drop datafu packaging. Since pig has been removed with BIGTOP-3075, we should remove datafu as well, because datafu-pig depends on pig.
Change-Id: Ide8d8cb5e8223cf6307a48f7e31a606a02dcefdb Signed-off-by: Yuqi Gu <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/4cee56bd Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/4cee56bd Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/4cee56bd Branch: refs/heads/master Commit: 4cee56bdc831b254805a121d43c33fc6a3a9c53e Parents: aaffc1e Author: Yuqi Gu <[email protected]> Authored: Mon Oct 8 06:11:23 2018 +0000 Committer: Jun He <[email protected]> Committed: Mon Dec 10 05:17:39 2018 +0000 ---------------------------------------------------------------------- bigtop-ci/jenkins/jobsCreator.groovy | 4 +- .../src/common/datafu/do-component-build | 22 - .../src/common/datafu/install_datafu.sh | 80 ---- bigtop-packages/src/deb/datafu/changelog | 1 - bigtop-packages/src/deb/datafu/compat | 1 - bigtop-packages/src/deb/datafu/control | 38 -- bigtop-packages/src/deb/datafu/copyright | 15 - .../src/deb/datafu/pig-udf-datafu.install | 1 - bigtop-packages/src/deb/datafu/rules | 36 -- bigtop-packages/src/deb/datafu/source/format | 1 - bigtop-packages/src/rpm/datafu/BUILD/.gitignore | 0 bigtop-packages/src/rpm/datafu/RPMS/.gitignore | 0 .../src/rpm/datafu/SOURCES/.gitignore | 0 .../src/rpm/datafu/SPECS/datafu.spec | 72 --- bigtop-packages/src/rpm/datafu/SRPMS/.gitignore | 0 bigtop-tests/test-artifacts/datafu/pom.xml | 68 --- .../java/datafu/linkanalysis/PageRank.java | 441 ------------------- .../java/datafu/pig/linkanalysis/PageRank.java | 372 ---------------- .../apache/bigtop/itest/datafu/PigTests.java | 211 --------- .../bigtop/itest/datafu/bags/BagTests.java | 308 ------------- .../bigtop/itest/datafu/bags/sets/SetTests.java | 74 ---- .../bigtop/itest/datafu/date/TimeTests.java | 65 --- .../bigtop/itest/datafu/geo/GeoTests.java | 75 ---- .../bigtop/itest/datafu/hash/HashTests.java | 63 --- .../itest/datafu/linkanalysis/PageRankTest.java | 299 ------------- 
.../datafu/linkanalysis/PageRankTests.java | 120 ----- .../itest/datafu/numbers/NumberTests.java | 65 --- .../itest/datafu/sessions/SessionTests.java | 92 ---- .../itest/datafu/stats/MarkovPairTests.java | 105 ----- .../itest/datafu/stats/QuantileTests.java | 196 --------- .../itest/datafu/stats/WilsonBinConfTests.java | 81 ---- .../bigtop/itest/datafu/urls/UserAgentTest.java | 57 --- .../bigtop/itest/datafu/util/AssertTests.java | 93 ---- .../datafu/util/IntBoolConversionPigTests.java | 77 ---- .../datafu/bags/aliasBagFieldsTest.pig | 20 - .../resources/datafu/bags/appendToBagTest.pig | 9 - .../resources/datafu/bags/bagConcatTest.pig | 11 - .../main/resources/datafu/bags/bagSplitTest.pig | 14 - .../datafu/bags/bagSplitWithBagNumTest.pig | 11 - .../bags/comprehensiveBagSplitAndEnumerate.pig | 26 -- .../resources/datafu/bags/distinctByTest.pig | 12 - .../resources/datafu/bags/enumerateTest.pig | 16 - .../datafu/bags/enumerateWithReverseTest.pig | 16 - .../datafu/bags/enumerateWithStartTest.pig | 16 - .../datafu/bags/firstTupleFromBagTest.pig | 9 - .../datafu/bags/nullToEmptyBagTest.pig | 14 - .../resources/datafu/bags/prependToBagTest.pig | 9 - .../datafu/bags/sets/setIntersectTest.pig | 9 - .../resources/datafu/bags/sets/setUnionTest.pig | 13 - .../datafu/bags/unorderedPairsTests.pig | 16 - .../datafu/bags/unorderedPairsTests2.pig | 12 - .../datafu/date/timeCountPageViewsTest.pig | 13 - .../main/resources/datafu/geo/haversineTest.pig | 9 - .../resources/datafu/hash/md5Base64Test.pig | 9 - .../src/main/resources/datafu/hash/md5Test.pig | 9 - .../datafu/linkanalysis/pageRankTest.pig | 25 -- .../datafu/numbers/randomIntRangeTest.pig | 8 - .../datafu/sessions/sessionizeTest.pig | 17 - .../datafu/stats/markovPairDefault.pig | 14 - .../datafu/stats/markovPairLookahead.pig | 14 - .../main/resources/datafu/stats/medianTest.pig | 21 - .../resources/datafu/stats/quantileTest.pig | 21 - .../datafu/stats/streamingMedianTest.pig | 21 - .../datafu/stats/streamingQuantileTest.pig 
| 18 - .../datafu/stats/wilsonBinConfTests.pig | 11 - .../resources/datafu/urls/userAgentTest.pig | 8 - .../datafu/util/assertWithMessageTest.pig | 10 - .../datafu/util/assertWithoutMessageTest.pig | 10 - .../resources/datafu/util/intToBoolTest.pig | 10 - .../datafu/util/intToBoolToIntTest.pig | 12 - .../package/src/main/resources/package_data.xml | 19 - bigtop-tests/test-artifacts/pom.xml | 1 - .../test-execution/smokes/datafu/pom.xml | 140 ------ bigtop.bom | 11 - 74 files changed, 2 insertions(+), 3795 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-ci/jenkins/jobsCreator.groovy ---------------------------------------------------------------------- diff --git a/bigtop-ci/jenkins/jobsCreator.groovy b/bigtop-ci/jenkins/jobsCreator.groovy index 5ced028..3d5240d 100644 --- a/bigtop-ci/jenkins/jobsCreator.groovy +++ b/bigtop-ci/jenkins/jobsCreator.groovy @@ -16,8 +16,8 @@ */ // FIXME: it would be nice to extract the following from bigtop.mk on the fly -def bigtopComponents = ["bigtop-groovy", "bigtop-jsvc", "bigtop-tomcat", "bigtop-utils", - "zookeeper", "hadoop", "hbase", "hive", "pig", "crunch", "datafu", +def bigtopComponents = ["bigtop-groovy", "bigtop-jsvc", "bigtop-tomcat", "bigtop-utils", + "zookeeper", "hadoop", "hbase", "hive", "pig", "crunch", "flume", "giraph", "ignite-hadoop", "mahout", "oozie", "phoenix", "solr", "spark", "sqoop", "alluxio", "whirr"] // FIXME: it would be nice to extract the following from some static configuration file http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/common/datafu/do-component-build ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/common/datafu/do-component-build b/bigtop-packages/src/common/datafu/do-component-build deleted file mode 100644 index e8ea9a7..0000000 --- a/bigtop-packages/src/common/datafu/do-component-build +++ 
/dev/null @@ -1,22 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -ex - -. `dirname $0`/bigtop.bom - -gradle -b bootstrap.gradle -./gradlew clean assemble http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/common/datafu/install_datafu.sh ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/common/datafu/install_datafu.sh b/bigtop-packages/src/common/datafu/install_datafu.sh deleted file mode 100755 index df65c9e..0000000 --- a/bigtop-packages/src/common/datafu/install_datafu.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -usage() { - echo " -usage: $0 <options> - Required not-so-options: - --build-dir=DIR path to pig dist.dir - --prefix=PREFIX path to install into - - Optional options: - --lib-dir=DIR path to install pig home [/usr/lib/pig] - --build-dir=DIR path to pig dist dir - ... [ see source for more similar options ] - " - exit 1 -} - -OPTS=$(getopt \ - -n $0 \ - -o '' \ - -l 'prefix:' \ - -l 'lib-dir:' \ - -l 'build-dir:' -- "$@") - -if [ $? != 0 ] ; then - usage -fi - -eval set -- "$OPTS" -while true ; do - case "$1" in - --prefix) - PREFIX=$2 ; shift 2 - ;; - --build-dir) - BUILD_DIR=$2 ; shift 2 - ;; - --lib-dir) - LIB_DIR=$2 ; shift 2 - ;; - --) - shift ; break - ;; - *) - echo "Unknown option: $1" - usage - exit 1 - ;; - esac -done - -for var in PREFIX BUILD_DIR ; do - if [ -z "$(eval "echo \$$var")" ]; then - echo Missing param: $var - usage - fi -done - -LIB_DIR=${LIB_DIR:-/usr/lib/pig} - -# First we'll move everything into lib -install -d -m 0755 $PREFIX/$LIB_DIR -cp $BUILD_DIR/datafu-*.jar $PREFIX/$LIB_DIR http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/changelog ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/changelog b/bigtop-packages/src/deb/datafu/changelog deleted file mode 100644 index 547ed02..0000000 --- a/bigtop-packages/src/deb/datafu/changelog +++ /dev/null @@ -1 +0,0 @@ ---- This is auto-generated http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/compat 
---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/compat b/bigtop-packages/src/deb/datafu/compat deleted file mode 100644 index 7f8f011..0000000 --- a/bigtop-packages/src/deb/datafu/compat +++ /dev/null @@ -1 +0,0 @@ -7 http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/control ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/control b/bigtop-packages/src/deb/datafu/control deleted file mode 100644 index 04a5a65..0000000 --- a/bigtop-packages/src/deb/datafu/control +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -Source: pig-udf-datafu -Section: misc -Priority: extra -Maintainer: Bigtop <[email protected]> -Build-Depends: debhelper (>= 7.0.50~) -Standards-Version: 3.8.0 -Homepage: https://github.com/linkedin/datafu - -Package: pig-udf-datafu -Architecture: all -Depends: pig -Description: A collection of user-defined functions for Hadoop and Pig. - DataFu is a collection of user-defined functions for working with large-scale - data in Hadoop and Pig. 
This library was born out of the need for a stable, - well-tested library of UDFs for data mining and statistics. It is used - at LinkedIn in many of our off-line workflows for data derived products like - "People You May Know" and "Skills". - . - It contains functions for: PageRank, Quantiles (median), variance, Sessionization, - Convenience bag functions (e.g., set operations, enumerating bags, etc), - Convenience utility functions (e.g., assertions, easier writing of EvalFuncs) - and more... - http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/copyright ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/copyright b/bigtop-packages/src/deb/datafu/copyright deleted file mode 100644 index 422ec82..0000000 --- a/bigtop-packages/src/deb/datafu/copyright +++ /dev/null @@ -1,15 +0,0 @@ -Format: http://dep.debian.net/deps/dep5 -Source: https://github.com/linkedin/datafu -Upstream-Name: DataFu - -Files: * -Copyright: 2010, LinkedIn, Inc -License: Apache-2.0 - -Files debian/* -Copyright: 2011, The Apache Software Foundation -License: Apache-2.0 - -License: Apache-2.0 - On Debian systems, the complete text of the Apache 2.0 license - can be found in "/usr/share/common-licenses/Apache-2.0". 
http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/pig-udf-datafu.install ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/pig-udf-datafu.install b/bigtop-packages/src/deb/datafu/pig-udf-datafu.install deleted file mode 100644 index 6a9697b..0000000 --- a/bigtop-packages/src/deb/datafu/pig-udf-datafu.install +++ /dev/null @@ -1 +0,0 @@ -/usr/lib/pig http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/rules ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/rules b/bigtop-packages/src/deb/datafu/rules deleted file mode 100755 index 24a5f38..0000000 --- a/bigtop-packages/src/deb/datafu/rules +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/make -f -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# -*- makefile -*- - -# Uncomment this to turn on verbose mode. -export DH_VERBOSE=1 - -# This has to be exported to make some magic below work. -export DH_OPTIONS - -%: - dh $@ - -override_dh_auto_build: - # we'll just use the build from the tarball. 
- bash debian/do-component-build -Divy.home=${HOME}/.ivy2 - -override_dh_auto_install: - sh -x debian/install_datafu.sh \ - --build-dir=datafu-pig/build/libs \ - --prefix=debian/tmp http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/deb/datafu/source/format ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/deb/datafu/source/format b/bigtop-packages/src/deb/datafu/source/format deleted file mode 100644 index 163aaf8..0000000 --- a/bigtop-packages/src/deb/datafu/source/format +++ /dev/null @@ -1 +0,0 @@ -3.0 (quilt) http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/rpm/datafu/BUILD/.gitignore ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/rpm/datafu/BUILD/.gitignore b/bigtop-packages/src/rpm/datafu/BUILD/.gitignore deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/rpm/datafu/RPMS/.gitignore ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/rpm/datafu/RPMS/.gitignore b/bigtop-packages/src/rpm/datafu/RPMS/.gitignore deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/rpm/datafu/SOURCES/.gitignore ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/rpm/datafu/SOURCES/.gitignore b/bigtop-packages/src/rpm/datafu/SOURCES/.gitignore deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/rpm/datafu/SPECS/datafu.spec ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/rpm/datafu/SPECS/datafu.spec b/bigtop-packages/src/rpm/datafu/SPECS/datafu.spec deleted file mode 100644 index 0185736..0000000 --- 
a/bigtop-packages/src/rpm/datafu/SPECS/datafu.spec +++ /dev/null @@ -1,72 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -%define datafu_name datafu -%define lib_datafu /usr/lib/pig - -%if %{?suse_version:1}0 -%define doc_datafu %{_docdir}/datafu-doc -%else -%define doc_datafu %{_docdir}/datafu-doc-%{datafu_version} -%endif - -# disable repacking jars -%define __os_install_post %{nil} - -Name: pig-udf-datafu -Version: %{datafu_version} -Release: %{datafu_release} -Summary: A collection of user-defined functions for Hadoop and Pig. -URL: https://github.com/linkedin/datafu -Group: Development/Libraries -BuildArch: noarch -Buildroot: %(mktemp -ud %{_tmppath}/%{datafu_name}-%{version}-%{release}-XXXXXX) -License: ASL 2.0 -Source0: %{datafu_name}-%{datafu_base_version}.tar.gz -Source1: do-component-build -Source2: install_%{datafu_name}.sh -Requires: hadoop-client, bigtop-utils >= 0.7 - - -%description -DataFu is a collection of user-defined functions for working with large-scale -data in Hadoop and Pig. This library was born out of the need for a stable, -well-tested library of UDFs for data mining and statistics. 
It is used -at LinkedIn in many of our off-line workflows for data derived products like -"People You May Know" and "Skills". - -It contains functions for: PageRank, Quantiles (median), variance, Sessionization, -Convenience bag functions (e.g., set operations, enumerating bags, etc), -Convenience utility functions (e.g., assertions, easier writing of EvalFuncs) -and more... - -%prep -%setup -n apache-%{datafu_name}-incubating-sources-%{datafu_base_version} - -%build -bash $RPM_SOURCE_DIR/do-component-build - -%install -%__rm -rf $RPM_BUILD_ROOT -sh $RPM_SOURCE_DIR/install_datafu.sh \ - --build-dir=datafu-pig/build/libs \ - --prefix=$RPM_BUILD_ROOT - -####################### -#### FILES SECTION #### -####################### -%files -%defattr(-,root,root,755) -%{lib_datafu} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-packages/src/rpm/datafu/SRPMS/.gitignore ---------------------------------------------------------------------- diff --git a/bigtop-packages/src/rpm/datafu/SRPMS/.gitignore b/bigtop-packages/src/rpm/datafu/SRPMS/.gitignore deleted file mode 100644 index e69de29..0000000 http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/pom.xml ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/pom.xml b/bigtop-tests/test-artifacts/datafu/pom.xml deleted file mode 100644 index f0ed55b..0000000 --- a/bigtop-tests/test-artifacts/datafu/pom.xml +++ /dev/null @@ -1,68 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. 
You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - - <parent> - <groupId>org.apache.bigtop.itest</groupId> - <artifactId>bigtop-smokes</artifactId> - <version>1.3.1-SNAPSHOT</version> - <relativePath>../pom.xml</relativePath> - </parent> - - <modelVersion>4.0.0</modelVersion> - <groupId>org.apache.bigtop.itest</groupId> - <artifactId>datafu-smoke</artifactId> - <version>1.3.1-SNAPSHOT</version> - <name>datafusmoke</name> - - <dependencies> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-client</artifactId> - </dependency> - <dependency> - <groupId>org.apache.pig</groupId> - <artifactId>pig</artifactId> - <version>0.11.1</version> - </dependency> - <dependency> - <groupId>org.apache.pig</groupId> - <artifactId>pigunit</artifactId> - <version>0.11.1</version> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - <version>r06</version> - </dependency> - <dependency> - <groupId>joda-time</groupId> - <artifactId>joda-time</artifactId> - <version>1.6</version> - </dependency> - <dependency> - <groupId>it.unimi.dsi</groupId> - <artifactId>fastutil</artifactId> - <version>6.3</version> - </dependency> - </dependencies> - - <build> - </build> - -</project> http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/linkanalysis/PageRank.java 
---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/linkanalysis/PageRank.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/linkanalysis/PageRank.java deleted file mode 100644 index 2cadcf9..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/linkanalysis/PageRank.java +++ /dev/null @@ -1,441 +0,0 @@ -/* - * Copyright 2010 LinkedIn, Inc - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package datafu.linkanalysis; - -import it.unimi.dsi.fastutil.floats.FloatArrayList; -import it.unimi.dsi.fastutil.ints.Int2IntMap; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; -import it.unimi.dsi.fastutil.ints.IntArrayList; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.DataInputStream; -import java.io.DataOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Map; - -import com.google.common.collect.AbstractIterator; - -/** - * An implementation of {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}. - * This implementation is not distributed. It is intended for graphs of a reasonable size which can be processed - * on a single machine. Nodes are stored in memory. 
Edges are stored in memory and can optionally be spilled to - * disk once a certain limit is reached. - */ -public class PageRank -{ - private float totalRankChange; - private long edgeCount; - private long nodeCount; - - // the damping factor - private static float ALPHA = 0.85f; - - // edge weights (which are doubles) are multiplied by this value so they can be stored as integers internally - private static float EDGE_WEIGHT_MULTIPLIER = 100000; - - private final Int2IntOpenHashMap nodeIndices = new Int2IntOpenHashMap(); - private final FloatArrayList nodeData = new FloatArrayList(); // rank, total weight, contribution, (repeat) - - private final IntArrayList danglingNodes = new IntArrayList(); - - private final IntArrayList edges = new IntArrayList(); // source, dest node count... dest id, weight pos, (repeat) - - private boolean shouldHandleDanglingNodes = false; - private boolean shouldCacheEdgesOnDisk = false; - private long edgeCachingThreshold; - - private File edgesFile; - private DataOutputStream edgeDataOutputStream; - private boolean usingEdgeDiskCache; - - public interface ProgressIndicator - { - void progress(); - } - - public void clear() throws IOException - { - this.edgeCount = 0; - this.nodeCount = 0; - this.totalRankChange = 0.0f; - - this.nodeIndices.clear(); - this.nodeData.clear(); - this.edges.clear(); - this.danglingNodes.clear(); - - if (edgeDataOutputStream != null) - { - this.edgeDataOutputStream.close(); - this.edgeDataOutputStream = null; - } - - this.usingEdgeDiskCache = false; - this.edgesFile = null; - } - - /** - * Gets whether disk is being used to cache edges. - * @return True if the edges are cached on disk. - */ - public boolean isUsingEdgeDiskCache() - { - return usingEdgeDiskCache; - } - - /** - * Enable disk caching of edges once there are too many (disabled by default). 
- */ - public void enableEdgeDiskCaching() - { - shouldCacheEdgesOnDisk = true; - } - - /** - * Disable disk caching of edges once there are too many (disabled by default). - */ - public void disableEdgeDiskCaching() - { - shouldCacheEdgesOnDisk = false; - } - - /** - * Gets whether edge disk caching is enabled. - * @return True if edge disk caching is enabled. - */ - public boolean isEdgeDiskCachingEnabled() - { - return shouldCacheEdgesOnDisk; - } - - /** - * Gets the number of edges past which they will be cached on disk instead of in memory. - * Edge disk caching must be enabled for this to have any effect. - * @return Edge count past which caching occurs - */ - public long getEdgeCachingThreshold() - { - return edgeCachingThreshold; - } - - /** - * Set the number of edges past which they will be cached on disk instead of in memory. - * Edge disk caching must be enabled for this to have any effect. - * @param count Edge count past which caching occurs - */ - public void setEdgeCachingThreshold(long count) - { - edgeCachingThreshold = count; - } - - /** - * Enables dangling node handling (disabled by default). - */ - public void enableDanglingNodeHandling() - { - shouldHandleDanglingNodes = true; - } - - /** - * Disables dangling node handling (disabled by default). 
- */ - public void disableDanglingNodeHandling() - { - shouldHandleDanglingNodes = false; - } - - public long nodeCount() - { - return this.nodeCount; - } - - public long edgeCount() - { - return this.edgeCount; - } - - public Int2IntMap.FastEntrySet getNodeIds() - { - return this.nodeIndices.int2IntEntrySet(); - } - - public float getNodeRank(int nodeId) - { - int nodeIndex = this.nodeIndices.get(nodeId); - return nodeData.get(nodeIndex); - } - - public float getTotalRankChange() - { - return this.totalRankChange; - } - - private void maybeCreateNode(int nodeId) - { - // create from node if it doesn't already exist - if (!nodeIndices.containsKey(nodeId)) - { - int index = this.nodeData.size(); - - this.nodeData.add(0.0f); // rank - this.nodeData.add(0.0f); // total weight - this.nodeData.add(0.0f); // contribution - - this.nodeIndices.put(nodeId, index); - - this.nodeCount++; - } - } - - public void addEdges(Integer sourceId, ArrayList<Map<String,Object>> sourceEdges) throws IOException - { - int source = sourceId.intValue(); - - maybeCreateNode(source); - - if (this.shouldCacheEdgesOnDisk && !usingEdgeDiskCache && (sourceEdges.size() + this.edgeCount) >= this.edgeCachingThreshold) - { - writeEdgesToDisk(); - } - - // store the source node id itself - appendEdgeData(source); - - // store how many outgoing edges this node has - appendEdgeData(sourceEdges.size()); - - // store the outgoing edges - for (Map<String,Object> edge : sourceEdges) - { - int dest = ((Integer)edge.get("dest")).intValue(); - float weight = ((Double)edge.get("weight")).floatValue(); - - maybeCreateNode(dest); - - appendEdgeData(dest); - - // location of weight in weights array - appendEdgeData(Math.max(1, (int)(weight * EDGE_WEIGHT_MULTIPLIER))); - - this.edgeCount++; - } - } - - private void appendEdgeData(int data) throws IOException - { - if (this.edgeDataOutputStream != null) - { - this.edgeDataOutputStream.writeInt(data); - } - else - { - this.edges.add(data); - } - } - - public void 
init(ProgressIndicator progressIndicator) throws IOException - { - if (this.edgeDataOutputStream != null) - { - this.edgeDataOutputStream.close(); - this.edgeDataOutputStream = null; - } - - // initialize all nodes to an equal share of the total rank (1.0) - float nodeRank = 1.0f / this.nodeCount; - for (int j=0; j<this.nodeData.size(); j+=3) - { - nodeData.set(j, nodeRank); - progressIndicator.progress(); - } - - Iterator<Integer> edgeData = getEdgeData(); - - while(edgeData.hasNext()) - { - int sourceId = edgeData.next(); - int nodeEdgeCount = edgeData.next(); - - while (nodeEdgeCount-- > 0) - { - // skip the destination node id - edgeData.next(); - - float weight = edgeData.next(); - - int nodeIndex = this.nodeIndices.get(sourceId); - - float totalWeight = this.nodeData.getFloat(nodeIndex+1); - totalWeight += weight; - this.nodeData.set(nodeIndex+1, totalWeight); - - progressIndicator.progress(); - } - } - - // if handling dangling nodes, get a list of them by finding those nodes with no outgoing - // edges (i.e. 
total outgoing edge weight is 0.0) - if (shouldHandleDanglingNodes) - { - for (Map.Entry<Integer,Integer> e : nodeIndices.entrySet()) - { - int nodeId = e.getKey(); - int nodeIndex = e.getValue(); - float totalWeight = nodeData.getFloat(nodeIndex+1); - if (totalWeight == 0.0f) - { - danglingNodes.add(nodeId); - } - } - } - } - - public float nextIteration(ProgressIndicator progressIndicator) throws IOException - { - distribute(progressIndicator); - commit(progressIndicator); - - return getTotalRankChange(); - } - - public void distribute(ProgressIndicator progressIndicator) throws IOException - { - Iterator<Integer> edgeData = getEdgeData(); - - while(edgeData.hasNext()) - { - int sourceId = edgeData.next(); - int nodeEdgeCount = edgeData.next(); - - while (nodeEdgeCount-- > 0) - { - int toId = edgeData.next(); - float weight = edgeData.next(); - - int fromNodeIndex = this.nodeIndices.get(sourceId); - int toNodeIndex = this.nodeIndices.get(toId); - - float contributionChange = weight * this.nodeData.getFloat(fromNodeIndex) / this.nodeData.getFloat(fromNodeIndex+1); - - float currentContribution = this.nodeData.getFloat(toNodeIndex+2); - this.nodeData.set(toNodeIndex+2, currentContribution + contributionChange); - - progressIndicator.progress(); - } - } - - if (shouldHandleDanglingNodes) - { - // get the rank from each of the dangling nodes - float totalRank = 0.0f; - for (int nodeId : danglingNodes) - { - int nodeIndex = nodeIndices.get(nodeId); - float rank = nodeData.get(nodeIndex); - totalRank += rank; - } - - // distribute the dangling node ranks to all the nodes in the graph - // note: the alpha factor is applied in the commit stage - float contributionIncrease = totalRank / this.nodeCount; - for (int i=2; i<nodeData.size(); i += 3) - { - float contribution = nodeData.getFloat(i); - contribution += contributionIncrease; - nodeData.set(i, contribution); - } - } - } - - public void commit(ProgressIndicator progressIndicator) - { - this.totalRankChange = 0.0f; - 
- for (int id : nodeIndices.keySet()) - { - int nodeIndex = this.nodeIndices.get(id); - - float alpha = datafu.linkanalysis.PageRank.ALPHA; - float newRank = (1.0f - alpha)/nodeCount + alpha * this.nodeData.get(nodeIndex+2); - - this.nodeData.set(nodeIndex+2, 0.0f); - - float lastRankDiff = newRank - this.nodeData.get(nodeIndex); - - this.nodeData.set(nodeIndex, newRank); - - this.totalRankChange += Math.abs(lastRankDiff); - - progressIndicator.progress(); - } - } - - private void writeEdgesToDisk() throws IOException - { - this.edgesFile = File.createTempFile("fastgraph", null); - - FileOutputStream outStream = new FileOutputStream(this.edgesFile); - BufferedOutputStream bufferedStream = new BufferedOutputStream(outStream); - this.edgeDataOutputStream = new DataOutputStream(bufferedStream); - - for (int edgeData : edges) - { - this.edgeDataOutputStream.writeInt(edgeData); - } - - this.edges.clear(); - usingEdgeDiskCache = true; - } - - private Iterator<Integer> getEdgeData() throws IOException - { - if (!usingEdgeDiskCache) - { - return this.edges.iterator(); - } - else - { - FileInputStream fileInputStream = new FileInputStream(this.edgesFile); - BufferedInputStream inputStream = new BufferedInputStream(fileInputStream); - final DataInputStream dataInputStream = new DataInputStream(inputStream); - - return new AbstractIterator<Integer>() { - - @Override - protected Integer computeNext() - { - try - { - return dataInputStream.readInt(); - } - catch (IOException e) - { - return endOfData(); - } - } - - }; - } - } -} - http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/pig/linkanalysis/PageRank.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/pig/linkanalysis/PageRank.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/pig/linkanalysis/PageRank.java deleted file mode 100644 
index 2460fc2..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/java/datafu/pig/linkanalysis/PageRank.java +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright 2010 LinkedIn, Inc - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package datafu.pig.linkanalysis; - -import it.unimi.dsi.fastutil.ints.Int2IntMap; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.pig.Accumulator; -import org.apache.pig.EvalFunc; -import org.apache.pig.data.BagFactory; -import org.apache.pig.data.DataBag; -import org.apache.pig.data.DataType; -import org.apache.pig.data.Tuple; -import org.apache.pig.data.TupleFactory; -import org.apache.pig.impl.logicalLayer.FrontendException; -import org.apache.pig.impl.logicalLayer.schema.Schema; - -import datafu.linkanalysis.PageRank.ProgressIndicator; - - -/** - * A UDF which implements {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}. - * Each graph is stored in memory while running the algorithm, with edges optionally - * spilled to disk to conserve memory. This can be used to distribute the execution of PageRank on a large number of - * reasonable sized graphs. It does not distribute execuion of PageRank on a single graph. Each graph is identified - * by an integer valued topic ID. 
- * <p> - * Example: - * <pre> - * {@code - * - * topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE); - * - * topic_edges_grouped = GROUP topic_edges by (topic, source) ; - * topic_edges_grouped = FOREACH topic_edges_grouped GENERATE - * group.topic as topic, - * group.source as source, - * topic_edges.(dest,weight) as edges; - * - * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic; - * - * topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE - * group as topic, - * FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank); - * - * skill_ranks = FOREACH skill_ranks GENERATE - * topic, source, rank; - * - * } - * </pre> - */ -public class PageRank extends EvalFunc<DataBag> implements Accumulator<DataBag> -{ - private final datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank(); - - private int maxNodesAndEdges = 100000000; - private int maxEdgesInMemory = 30000000; - private double tolerance = 1e-16; - private int maxIters = 150; - private boolean useEdgeDiskStorage = false; - private boolean enableDanglingNodeHandling = false; - private boolean aborted = false; - - TupleFactory tupleFactory = TupleFactory.getInstance(); - BagFactory bagFactory = BagFactory.getInstance(); - - public PageRank() - { - initialize(); - } - - public PageRank(String... 
parameters) - { - if (parameters.length % 2 != 0) - { - throw new RuntimeException("Invalid parameters list"); - } - - for (int i=0; i<parameters.length; i+=2) - { - String parameterName = parameters[i]; - String value = parameters[i+1]; - if (parameterName.equals("max_nodes_and_edges")) - { - maxNodesAndEdges = Integer.parseInt(value); - } - else if (parameterName.equals("max_edges_in_memory")) - { - maxEdgesInMemory = Integer.parseInt(value); - } - else if (parameterName.equals("tolerance")) - { - tolerance = Double.parseDouble(value); - } - else if (parameterName.equals("max_iters")) - { - maxIters = Integer.parseInt(value); - } - else if (parameterName.equals("spill_to_edge_disk_storage")) - { - useEdgeDiskStorage = Boolean.parseBoolean(value); - } - else if (parameterName.equals("dangling_nodes")) - { - enableDanglingNodeHandling = Boolean.parseBoolean(value); - } - } - - initialize(); - } - - private void initialize() - { - long heapSize = Runtime.getRuntime().totalMemory(); - long heapMaxSize = Runtime.getRuntime().maxMemory(); - long heapFreeSize = Runtime.getRuntime().freeMemory(); -// System.out.println(String.format("Heap size: %d, Max heap size: %d, Heap free size: %d", heapSize, heapMaxSize, heapFreeSize)); - - if (useEdgeDiskStorage) - { - this.graph.enableEdgeDiskCaching(); - } - else - { - this.graph.disableEdgeDiskCaching(); - } - - if (enableDanglingNodeHandling) - { - this.graph.enableDanglingNodeHandling(); - } - else - { - this.graph.disableDanglingNodeHandling(); - } - - this.graph.setEdgeCachingThreshold(maxEdgesInMemory); - } - - @Override - public void accumulate(Tuple t) throws IOException - { - if (aborted) - { - return; - } - - DataBag bag = (DataBag) t.get(0); - if (bag == null || bag.size() == 0) - return; - - for (Tuple sourceTuple : bag) - { - Integer sourceId = (Integer)sourceTuple.get(0); - DataBag edges = (DataBag)sourceTuple.get(1); - - ArrayList<Map<String,Object>> edgesMapList = new ArrayList<Map<String, Object>>(); - - for 
(Tuple edgeTuple : edges) - { - Integer destId = (Integer)edgeTuple.get(0); - Double weight = (Double)edgeTuple.get(1); - HashMap<String,Object> edgeMap = new HashMap<String, Object>(); - edgeMap.put("dest",destId); - edgeMap.put("weight",weight); - edgesMapList.add(edgeMap); - } - - graph.addEdges(sourceId, edgesMapList); - - if (graph.nodeCount() + graph.edgeCount() > maxNodesAndEdges) - { - System.out.println(String.format("There are too many nodes and edges (%d + %d > %d). Aborting.", graph.nodeCount(), graph.edgeCount(), maxNodesAndEdges)); - aborted = true; - } - - reporter.progress(); - } - } - - @Override - public DataBag getValue() - { - if (aborted) - { - return null; - } - - System.out.println(String.format("Nodes: %d, Edges: %d", graph.nodeCount(), graph.edgeCount())); - - ProgressIndicator progressIndicator = getProgressIndicator(); - System.out.println("Finished loading graph."); - long startTime = System.nanoTime(); - System.out.println("Initializing."); - try - { - graph.init(progressIndicator); - } - catch (IOException e) - { - e.printStackTrace(); - return null; - } - System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6)); - - float totalDiff; - int iter = 0; - - System.out.println("Beginning iterations"); - startTime = System.nanoTime(); - do - { - // TODO log percentage complete every 5 minutes - try - { - totalDiff = graph.nextIteration(progressIndicator); - } - catch (IOException e) - { - e.printStackTrace(); - return null; - } - iter++; - } while(iter < maxIters && totalDiff > tolerance); - System.out.println(String.format("Done, %d iterations took %f ms", iter, (System.nanoTime() - startTime)/10.0e6)); - - DataBag output = bagFactory.newDefaultBag(); - - for (Int2IntMap.Entry node : graph.getNodeIds()) - { - int nodeId = node.getIntKey(); - float rank = graph.getNodeRank(nodeId); - List nodeData = new ArrayList(2); - nodeData.add(nodeId); - nodeData.add(rank); - 
output.add(tupleFactory.newTuple(nodeData)); - } - - return output; - } - - @Override - public void cleanup() - { - try - { - aborted = false; - this.graph.clear(); - } - catch (IOException e) - { - e.printStackTrace(); - } - } - - @Override - public DataBag exec(Tuple input) throws IOException - { - try - { - accumulate(input); - - return getValue(); - } - finally - { - cleanup(); - } - } - - private ProgressIndicator getProgressIndicator() - { - return new ProgressIndicator() - { - @Override - public void progress() - { - reporter.progress(); - } - }; - } - - @Override - public Schema outputSchema(Schema input) - { - try - { - Schema.FieldSchema inputFieldSchema = input.getField(0); - - if (inputFieldSchema.type != DataType.BAG) - { - throw new RuntimeException("Expected a BAG as input"); - } - - Schema inputBagSchema = inputFieldSchema.schema; - - if (inputBagSchema.getField(0).type != DataType.TUPLE) - { - throw new RuntimeException(String.format("Expected input bag to contain a TUPLE, but instead found %s", - DataType.findTypeName(inputBagSchema.getField(0).type))); - } - - Schema inputTupleSchema = inputBagSchema.getField(0).schema; - - if (inputTupleSchema.getField(0).type != DataType.INTEGER) - { - throw new RuntimeException(String.format("Expected source to be an INTEGER, but instead found %s", - DataType.findTypeName(inputTupleSchema.getField(0).type))); - } - - if (inputTupleSchema.getField(1).type != DataType.BAG) - { - throw new RuntimeException(String.format("Expected edges to be represented with a BAG")); - } - - Schema.FieldSchema edgesFieldSchema = inputTupleSchema.getField(1); - - if (edgesFieldSchema.schema.getField(0).type != DataType.TUPLE) - { - throw new RuntimeException(String.format("Expected edges field to contain a TUPLE, but instead found %s", - DataType.findTypeName(edgesFieldSchema.schema.getField(0).type))); - } - - Schema edgesTupleSchema = edgesFieldSchema.schema.getField(0).schema; - - if (edgesTupleSchema.getField(0).type != 
DataType.INTEGER) - { - throw new RuntimeException(String.format("Expected destination edge ID to an INTEGER, but instead found %s", - DataType.findTypeName(edgesFieldSchema.schema.getField(0).type))); - } - - if (edgesTupleSchema.getField(1).type != DataType.DOUBLE) - { - throw new RuntimeException(String.format("Expected destination edge weight to a DOUBLE, but instead found %s", - DataType.findTypeName(edgesFieldSchema.schema.getField(1).type))); - } - - Schema tupleSchema = new Schema(); - tupleSchema.add(new Schema.FieldSchema("node",DataType.INTEGER)); - tupleSchema.add(new Schema.FieldSchema("rank",DataType.FLOAT)); - - return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass() - .getName() - .toLowerCase(), input), - tupleSchema, - DataType.BAG)); - } - catch (FrontendException e) - { - throw new RuntimeException(e); - } - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/PigTests.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/PigTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/PigTests.java deleted file mode 100644 index 8b11111..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/PigTests.java +++ /dev/null @@ -1,211 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.bigtop.itest.datafu; - -import static org.junit.Assert.*; - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileWriter; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.regex.Pattern; - -import org.apache.commons.io.IOUtils; -import org.apache.pig.data.Tuple; -import org.apache.pig.pigunit.PigTest; -import org.apache.pig.tools.parameters.ParseException; - -public abstract class PigTests -{ - protected String[] getDefaultArgs() - { - String[] args = { - "JAR_PATH=" + getJarPath() - }; - return args; - } - - protected List<String> getDefaultArgsAsList() - { - String[] args = getDefaultArgs(); - List<String> argsList = new ArrayList<String>(args.length); - for (String arg : args) - { - argsList.add(arg); - } - return argsList; - } - - protected PigTest createPigTest(String scriptPath, String... 
args) throws IOException - { - // append args to list of default args - List<String> theArgs = getDefaultArgsAsList(); - for (String arg : args) - { - theArgs.add(arg); - } - - String[] lines = getLinesFromFile(scriptPath); - - for (String arg : theArgs) - { - String[] parts = arg.split("=",2); - if (parts.length == 2) - { - for (int i=0; i<lines.length; i++) - { - lines[i] = lines[i].replaceAll(Pattern.quote("$" + parts[0]), parts[1]); - } - } - } - - return new PigTest(lines); - } - - protected PigTest createPigTest(String scriptPath) throws IOException - { - return createPigTest(scriptPath, getDefaultArgs()); - } - - protected String getJarPath() - { - String jarDir = "dist"; - if (System.getProperty("datafu.jar.dir") != null) - { - jarDir = System.getProperty("datafu.jar.dir"); - } - - String jarDirPath = new File(/* System.getProperty("user.dir"), */ jarDir).getAbsolutePath(); - - File userDir = new File(jarDirPath); - - String[] files = userDir.list(new FilenameFilter() { - - @Override - public boolean accept(File dir, String name) - { - return name.startsWith("datafu") && name.endsWith(".jar") && !name.contains("sources") && !name.contains("javadoc"); - } - - }); - - if (files.length == 0) - { - throw new RuntimeException("Could not find JAR file"); - } - else if (files.length > 1) - { - throw new RuntimeException("Found more JAR files than expected"); - } - - return userDir.getAbsolutePath() + "/" + files[0]; - } - - protected List<Tuple> getLinesForAlias(PigTest test, String alias) throws IOException, ParseException - { - return getLinesForAlias(test,alias,true); - } - - protected List<Tuple> getLinesForAlias(PigTest test, String alias, boolean logValues) throws IOException, ParseException - { - Iterator<Tuple> tuplesIterator = test.getAlias(alias); - List<Tuple> tuples = new ArrayList<Tuple>(); - if (logValues) - { - System.out.println(String.format("Values for %s: ", alias)); - } - while (tuplesIterator.hasNext()) - { - Tuple tuple = 
tuplesIterator.next(); - if (logValues) - { - System.out.println(tuple.toString()); - } - tuples.add(tuple); - } - return tuples; - } - - protected void writeLinesToFile(String fileName, String... lines) throws IOException - { - File inputFile = deleteIfExists(getFile(fileName)); - writeLinesToFile(inputFile, lines); - } - - protected void writeLinesToFile(File file, String[] lines) throws IOException - { - FileWriter writer = new FileWriter(file); - for (String line : lines) - { - writer.write(line + "\n"); - } - writer.close(); - } - - protected void assertOutput(PigTest test, String alias, String... expected) throws IOException, ParseException - { - List<Tuple> tuples = getLinesForAlias(test, alias); - assertEquals(expected.length, tuples.size()); - int i=0; - for (String e : expected) - { - assertEquals(e, tuples.get(i++).toString()); - } - } - - protected File deleteIfExists(File file) - { - if (file.exists()) - { - file.delete(); - } - return file; - } - - protected File getFile(String fileName) - { - return new File(System.getProperty("user.dir"), fileName).getAbsoluteFile(); - } - - /** - * Gets the lines from a given file. - * - * @param relativeFilePath The path relative to the datafu-tests project. 
- * @return The lines from the file - * @throws IOException - */ - protected String[] getLinesFromFile(String relativeFilePath) throws IOException - { - // assume that the working directory is the datafu-tests project - File file = new File(System.getProperty("user.dir"), relativeFilePath).getAbsoluteFile(); - BufferedInputStream content = new BufferedInputStream(new FileInputStream(file)); - Object[] lines = IOUtils.readLines(content).toArray(); - String[] result = new String[lines.length]; - for (int i=0; i<lines.length; i++) - { - result[i] = (String)lines[i]; - } - return result; - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/BagTests.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/BagTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/BagTests.java deleted file mode 100644 index 8e72846..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/BagTests.java +++ /dev/null @@ -1,308 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.bigtop.itest.datafu.bags; - -import org.apache.pig.pigunit.PigTest; -import org.junit.Test; - -import org.apache.bigtop.itest.datafu.PigTests; - - -public class BagTests extends PigTests -{ - @Test - public void nullToEmptyBagTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/nullToEmptyBagTest.pig"); - - writeLinesToFile("input", - "({(1),(2),(3),(4),(5)})", - "()", - "{(4),(5)})"); - - test.runScript(); - - assertOutput(test, "data2", - "({(1),(2),(3),(4),(5)})", - "({})", - "({(4),(5)})"); - } - - @Test - public void appendToBagTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/appendToBagTest.pig"); - - writeLinesToFile("input", - "1\t{(1),(2),(3)}\t(4)", - "2\t{(10),(20),(30),(40),(50)}\t(60)"); - - test.runScript(); - - assertOutput(test, "data2", - "(1,{(1),(2),(3),(4)})", - "(2,{(10),(20),(30),(40),(50),(60)})"); - } - - @Test - public void firstTupleFromBagTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/firstTupleFromBagTest.pig"); - - writeLinesToFile("input", "1\t{(4),(9),(16)}"); - - test.runScript(); - - assertOutput(test, "data2", "(1,(4))"); - } - - - @Test - public void prependToBagTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/prependToBagTest.pig"); - - writeLinesToFile("input", - "1\t{(1),(2),(3)}\t(4)", - "2\t{(10),(20),(30),(40),(50)}\t(60)"); - - test.runScript(); - - assertOutput(test, "data2", - "(1,{(4),(1),(2),(3)})", - "(2,{(60),(10),(20),(30),(40),(50)})"); - } - - @Test - public void bagConcatTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/bagConcatTest.pig"); - - writeLinesToFile("input", - "({(1),(2),(3)}\t{(3),(5),(6)}\t{(10),(13)})", - "({(2),(3),(4)}\t{(5),(5)}\t{(20)})"); - - test.runScript(); - - assertOutput(test, "data2", - "({(1),(2),(3),(3),(5),(6),(10),(13)})", - 
"({(2),(3),(4),(5),(5),(20)})"); - } - - @Test - public void unorderedPairsTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/unorderedPairsTests.pig"); - - String[] input = { - "{(1),(2),(3),(4),(5)}" - }; - - String[] output = { - "(1,2)", - "(1,3)", - "(1,4)", - "(1,5)", - "(2,3)", - "(2,4)", - "(2,5)", - "(3,4)", - "(3,5)", - "(4,5)" - }; - - test.assertOutput("data",input,"data4",output); - } - - @Test - public void unorderedPairsTest2() throws Exception - { - PigTest test = createPigTest("datafu/bags/unorderedPairsTests2.pig"); - - this.writeLinesToFile("input", "1\t{(1),(2),(3),(4),(5)}"); - - String[] output = { - "(1,2)", - "(1,3)", - "(1,4)", - "(1,5)", - "(2,3)", - "(2,4)", - "(2,5)", - "(3,4)", - "(3,5)", - "(4,5)" - }; - - test.runScript(); - this.getLinesForAlias(test, "data3"); - - this.assertOutput(test, "data3", - "(1,(1),(2))", - "(1,(1),(3))", - "(1,(1),(4))", - "(1,(1),(5))", - "(1,(2),(3))", - "(1,(2),(4))", - "(1,(2),(5))", - "(1,(3),(4))", - "(1,(3),(5))", - "(1,(4),(5))"); - } - - @Test - public void bagSplitTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/bagSplitTest.pig", - "MAX=5"); - - writeLinesToFile("input", - "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}"); - - test.runScript(); - - assertOutput(test, "data3", - "({(1,11),(2,22),(3,33),(4,44),(5,55)})", - "({(6,66),(7,77),(8,88),(9,99),(10,1010)})", - "({(11,1111),(12,1212)})"); - } - - @Test - public void bagSplitWithBagNumTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/bagSplitWithBagNumTest.pig", - "MAX=10"); - - writeLinesToFile("input", - "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}"); - - test.runScript(); - - assertOutput(test, "data3", - "({(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010)},0)", - "({(11,1111),(12,1212)},1)"); - } - - @Test - public void enumerateWithReverseTest() 
throws Exception - { - PigTest test = createPigTest("datafu/bags/enumerateWithReverseTest.pig"); - - writeLinesToFile("input", - "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})"); - - test.runScript(); - - assertOutput(test, "data4", - "(10,{(1),(2),(3)},5)", - "(20,{(4),(5),(6)},4)", - "(30,{(7),(8)},3)", - "(40,{(9),(10),(11)},2)", - "(50,{(12),(13),(14),(15)},1)"); - } - - @Test - public void enumerateWithStartTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/enumerateWithStartTest.pig"); - - writeLinesToFile("input", - "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})"); - - test.runScript(); - - assertOutput(test, "data4", - "(10,{(1),(2),(3)},1)", - "(20,{(4),(5),(6)},2)", - "(30,{(7),(8)},3)", - "(40,{(9),(10),(11)},4)", - "(50,{(12),(13),(14),(15)},5)"); - } - - @Test - public void enumerateTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/enumerateTest.pig"); - - writeLinesToFile("input", - "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})"); - - test.runScript(); - - assertOutput(test, "data4", - "(10,{(1),(2),(3)},0)", - "(20,{(4),(5),(6)},1)", - "(30,{(7),(8)},2)", - "(40,{(9),(10),(11)},3)", - "(50,{(12),(13),(14),(15)},4)"); - } - - @Test - public void comprehensiveBagSplitAndEnumerate() throws Exception - { - PigTest test = createPigTest("datafu/bags/comprehensiveBagSplitAndEnumerate.pig"); - - writeLinesToFile("input", - "({(A,1.0),(B,2.0),(C,3.0),(D,4.0),(E,5.0)})"); - - test.runScript(); - - assertOutput(test, "data_out", - // bag #1 - "(A,1.0,1)", - "(B,2.0,1)", - "(C,3.0,1)", - // bag #2 - "(D,4.0,2)", - "(E,5.0,2)"); - } - - @Test - public void aliasBagFieldsTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/aliasBagFieldsTest.pig"); - - writeLinesToFile("input", - "({(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})"); - 
- test.runScript(); - - assertOutput(test, "data4", - "(A,1)", - "(B,2)", - "(C,3)", - "(D,4)", - "(E,5)"); - } - - @Test - public void distinctByTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/distinctByTest.pig"); - - writeLinesToFile("input", - "({(Z,1,0),(A,1,0),(A,1,0),(B,2,0),(B,22,1),(C,3,0),(D,4,0),(E,5,0)})"); - - test.runScript(); - - assertOutput(test, "data2", - "({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})"); - } - -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/sets/SetTests.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/sets/SetTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/sets/SetTests.java deleted file mode 100644 index 938ef3a..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/bags/sets/SetTests.java +++ /dev/null @@ -1,74 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.bigtop.itest.datafu.bags.sets; - -import org.apache.pig.pigunit.PigTest; -import org.junit.Test; - -import org.apache.bigtop.itest.datafu.PigTests; - -public class SetTests extends PigTests -{ - @Test - public void setIntersectTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/sets/setIntersectTest.pig"); - - String[] input = { - "{(1,10),(2,20),(3,30),(4,40),(5,50),(6,60)}\t{(0,0),(2,20),(4,40),(8,80)}", - "{(1,10),(1,10),(2,20),(3,30),(3,30),(4,40),(4,40)}\t{(1,10),(3,30)}" - }; - - String[] output = { - "({(2,20),(4,40)})", - "({(1,10),(3,30)})" - }; - - test.assertOutput("data",input,"data2",output); - } - - @Test - public void setIntersectOutOfOrderTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/sets/setIntersectTest.pig"); - - this.writeLinesToFile("input", - "{(1,10),(3,30),(2,20),(4,40),(5,50),(6,60)}\t{(0,0),(2,20),(4,40),(8,80)}"); - - test.runScript(); - - this.getLinesForAlias(test, "data2"); - } - - @Test - public void setUnionTest() throws Exception - { - PigTest test = createPigTest("datafu/bags/sets/setUnionTest.pig"); - - String[] input = { - "{(1,10),(1,20),(1,30),(1,40),(1,50),(1,60),(1,80)}\t{(1,1),(1,20),(1,25),(1,25),(1,25),(1,40),(1,70),(1,80)}" - }; - - String[] output = { - "({(1,10),(1,20),(1,30),(1,40),(1,50),(1,60),(1,80),(1,1),(1,25),(1,70)})" - }; - - test.assertOutput("data",input,"data2",output); - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/date/TimeTests.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/date/TimeTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/date/TimeTests.java deleted file mode 100644 index 87fab7b..0000000 --- 
a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/date/TimeTests.java +++ /dev/null @@ -1,65 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.bigtop.itest.datafu.date; - -import org.apache.pig.pigunit.PigTest; -import org.junit.Test; - -import org.apache.bigtop.itest.datafu.PigTests; - -public class TimeTests extends PigTests -{ - @Test - public void timeCountPageViewsTest() throws Exception - { - PigTest test = createPigTest("datafu/date/timeCountPageViewsTest.pig", - "TIME_WINDOW=30m", - "JAR_PATH=" + getJarPath()); - - String[] input = { - "1\t100\t2010-01-01T01:00:00Z", - "1\t100\t2010-01-01T01:15:00Z", - "1\t100\t2010-01-01T01:31:00Z", - "1\t100\t2010-01-01T01:35:00Z", - "1\t100\t2010-01-01T02:30:00Z", - - "1\t101\t2010-01-01T01:00:00Z", - "1\t101\t2010-01-01T01:31:00Z", - "1\t101\t2010-01-01T02:10:00Z", - "1\t101\t2010-01-01T02:40:30Z", - "1\t101\t2010-01-01T03:30:00Z", - - "1\t102\t2010-01-01T01:00:00Z", - "1\t102\t2010-01-01T01:01:00Z", - "1\t102\t2010-01-01T01:02:00Z", - "1\t102\t2010-01-01T01:10:00Z", - "1\t102\t2010-01-01T01:15:00Z", - "1\t102\t2010-01-01T01:25:00Z", - "1\t102\t2010-01-01T01:30:00Z" - }; - - String[] output = { 
- "(1,100,2)", - "(1,101,5)", - "(1,102,1)" - }; - - test.assertOutput("views",input,"view_counts",output); - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/geo/GeoTests.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/geo/GeoTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/geo/GeoTests.java deleted file mode 100644 index 12d9f97..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/geo/GeoTests.java +++ /dev/null @@ -1,75 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.bigtop.itest.datafu.geo; - -import static org.junit.Assert.*; - -import java.util.List; - -import org.apache.pig.data.Tuple; -import org.apache.pig.pigunit.PigTest; -import org.junit.Test; - -import org.apache.bigtop.itest.datafu.PigTests; - -public class GeoTests extends PigTests -{ - @Test - public void haversineTest() throws Exception - { - PigTest test = createPigTest("datafu/geo/haversineTest.pig"); - - // Approximate latitude and longitude for major cities from maps.google.com - double[] la = {34.040143,-118.243103}; - double[] tokyo = {35.637209,139.65271}; - double[] ny = {40.716038,-73.99498}; - double[] paris = {48.857713,2.342491}; - double[] sydney = {-33.872696,151.195221}; - - this.writeLinesToFile("input", - coords(la,tokyo), - coords(ny,tokyo), - coords(ny,sydney), - coords(ny,paris)); - - test.runScript(); - - List<Tuple> distances = this.getLinesForAlias(test, "data2"); - - // ensure distance is within 20 miles of expected (distances found online) - assertWithin(5478.0, distances.get(0), 20.0); // la <-> tokyo - assertWithin(6760.0, distances.get(1), 20.0); // ny <-> tokyo - assertWithin(9935.0, distances.get(2), 20.0); // ny <-> sydney - assertWithin(3635.0, distances.get(3), 20.0); // ny <-> paris - - } - - private void assertWithin(double expected, Tuple actual, double maxDiff) throws Exception - { - Double actualVal = (Double)actual.get(0); - assertTrue(Math.abs(expected-actualVal) < maxDiff); - } - - private String coords(double[] coords1, double[] coords2) - { - assertTrue(coords1.length == 2); - assertTrue(coords2.length == 2); - return String.format("%f\t%f\t%f\t%f", coords1[0], coords1[1], coords2[0], coords2[1]); - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/hash/HashTests.java ---------------------------------------------------------------------- diff --git 
a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/hash/HashTests.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/hash/HashTests.java deleted file mode 100644 index e900c15..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/hash/HashTests.java +++ /dev/null @@ -1,63 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.bigtop.itest.datafu.hash; - -import org.apache.pig.pigunit.PigTest; -import org.junit.Test; - -import org.apache.bigtop.itest.datafu.PigTests; - -public class HashTests extends PigTests -{ - @Test - public void md5Test() throws Exception - { - PigTest test = createPigTest("datafu/hash/md5Test.pig"); - - writeLinesToFile("input", - "ladsljkasdglk", - "lkadsljasgjskdjks", - "aladlasdgjks"); - - test.runScript(); - - assertOutput(test, "data_out", - "(d9a82575758bb4978949dc0659205cc6)", - "(9ec37f02fae0d8d6a7f4453a62272f1f)", - "(cb94139a8b9f3243e68a898ec6bd9b3d)"); - } - - @Test - public void md5Base64Test() throws Exception - { - PigTest test = createPigTest("datafu/hash/md5Base64Test.pig"); - - writeLinesToFile("input", - "ladsljkasdglk", - "lkadsljasgjskdjks", - "aladlasdgjks"); - - test.runScript(); - - assertOutput(test, "data_out", - "(2agldXWLtJeJSdwGWSBcxg==)", - "(nsN/Avrg2Nan9EU6YicvHw==)", - "(y5QTmoufMkPmiomOxr2bPQ==)"); - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/4cee56bd/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/linkanalysis/PageRankTest.java ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/linkanalysis/PageRankTest.java b/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/linkanalysis/PageRankTest.java deleted file mode 100644 index 407815c..0000000 --- a/bigtop-tests/test-artifacts/datafu/src/main/groovy/org/apache/bigtop/itest/datafu/linkanalysis/PageRankTest.java +++ /dev/null @@ -1,299 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * <p/> - * http://www.apache.org/licenses/LICENSE-2.0 - * <p/> - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.bigtop.itest.datafu.linkanalysis; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -import org.junit.Test; - -public class PageRankTest -{ - @Test - public void wikipediaGraphInMemoryTest() throws Exception { - System.out.println(); - System.out.println("Starting wikipediaGraphInMemoryTest"); - - datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank(); - - String[] edges = getWikiExampleEdges(); - - Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges); - - // Without dangling node handling we will not get the true page rank since the total rank will - // not add to 1.0. Without dangling node handling some of the page rank drains out of the graph. 
- graph.enableDanglingNodeHandling(); - - performIterations(graph, 150, 1e-18f); - - String[] expectedRanks = getWikiExampleExpectedRanks(); - - Map<String,Float> expectedRanksMap = parseExpectedRanks(expectedRanks); - - validateExpectedRanks(graph, nodeIdsMap, expectedRanksMap); - } - - @Test - public void wikipediaGraphDiskCacheTest() throws Exception { - System.out.println(); - System.out.println("Starting wikipediaGraphDiskCacheTest"); - - datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank(); - - String[] edges = getWikiExampleEdges(); - - graph.enableEdgeDiskCaching(); - graph.setEdgeCachingThreshold(5); - - Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges); - - assert graph.isUsingEdgeDiskCache() : "Expected disk cache to be used"; - - // Without dangling node handling we will not get the true page rank since the total rank will - // not add to 1.0. Without dangling node handling some of the page rank drains out of the graph. - graph.enableDanglingNodeHandling(); - - performIterations(graph, 150, 1e-18f); - - String[] expectedRanks = getWikiExampleExpectedRanks(); - - Map<String,Float> expectedRanksMap = parseExpectedRanks(expectedRanks); - - validateExpectedRanks(graph, nodeIdsMap, expectedRanksMap); - } - - @Test - public void hubAndSpokeInMemoryTest() throws Exception { - System.out.println(); - System.out.println("Starting hubAndSpokeInMemoryTest"); - - datafu.linkanalysis.PageRank graph = new datafu.linkanalysis.PageRank(); - - String[] edges = getHubAndSpokeEdges(); - - Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges); - - graph.enableDanglingNodeHandling(); - - performIterations(graph, 150, 1e-18f); - - // no need to validate, this is just a perf test for runtime comparison - } - - @Test - public void hubAndSpokeDiskCacheTest() throws Exception { - System.out.println(); - System.out.println("Starting hubAndSpokeDiskCacheTest"); - - datafu.linkanalysis.PageRank graph = new 
datafu.linkanalysis.PageRank(); - - String[] edges = getHubAndSpokeEdges(); - - graph.enableEdgeDiskCaching(); - graph.setEdgeCachingThreshold(5); - - Map<String,Integer> nodeIdsMap = loadGraphFromEdgeList(graph, edges); - - graph.enableDanglingNodeHandling(); - - performIterations(graph, 150, 1e-18f); - - // no need to validate, this is just a perf test for runtime comparison - } - - private String[] getHubAndSpokeEdges() - { - int count = 50000; - String[] edges = new String[count]; - - for (int i=0; i<count; i++) - { - edges[i] = String.format("S%d H", i); - } - return edges; - } - - public static String[] getWikiExampleEdges() - { - // graph taken from: - // http://en.wikipedia.org/wiki/PageRank - String[] edges = { - "B C", - "C B", - "D A", - "D B", - "E D", - "E B", - "E F", - "F E", - "F B", - "P1 B", - "P1 E", - "P2 B", - "P2 E", - "P3 B", - "P3 E", - "P4 E", - "P5 E" - }; - return edges; - } - - public static String[] getWikiExampleExpectedRanks() - { - // these ranks come from the Wikipedia page: - // http://en.wikipedia.org/wiki/PageRank - String[] expectedRanks = { - "A 3.3", - "B 38.4", - "C 34.3", - "D 3.9", - "E 8.1", - "F 3.9", - "P1 1.6", - "P2 1.6", - "P3 1.6", - "P4 1.6", - "P5 1.6" - }; - return expectedRanks; - } - - private Map<String,Integer> loadGraphFromEdgeList(datafu.linkanalysis.PageRank graph, String[] edges) throws IOException - { - Map<Integer,ArrayList<Map<String,Object>>> nodeEdgesMap = new HashMap<Integer,ArrayList<Map<String,Object>>>(); - Map<String,Integer> nodeIdsMap = new HashMap<String,Integer>(); - - for (String edge : edges) - { - String[] parts = edge.split(" "); - assert parts.length == 2 : "Expected two parts"; - - int sourceId = getOrCreateId(parts[0], nodeIdsMap); - int destId = getOrCreateId(parts[1], nodeIdsMap); - - Map<String,Object> edgeMap = new HashMap<String,Object>(); - edgeMap.put("weight", 1.0); - edgeMap.put("dest", destId); - - ArrayList<Map<String,Object>> nodeEdges = null; - - if 
(nodeEdgesMap.containsKey(sourceId)) - { - nodeEdges = nodeEdgesMap.get(sourceId); - } - else - { - nodeEdges = new ArrayList<Map<String,Object>>(); - nodeEdgesMap.put(sourceId, nodeEdges); - } - - nodeEdges.add(edgeMap); - } - - for (Map.Entry<Integer, ArrayList<Map<String,Object>>> e : nodeEdgesMap.entrySet()) - { - graph.addEdges(e.getKey(), e.getValue()); - } - - return nodeIdsMap; - } - - private void performIterations(datafu.linkanalysis.PageRank graph, int maxIters, float tolerance) throws IOException - { - System.out.println(String.format("Beginning iteration (maxIters = %d, tolerance=%e)", maxIters, tolerance)); - - datafu.linkanalysis.PageRank.ProgressIndicator progressIndicator = getDummyProgressIndicator(); - - System.out.println("Initializing graph"); - long startTime = System.nanoTime(); - graph.init(progressIndicator); - System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6)); - - float totalDiff; - int iter = 0; - - System.out.println("Beginning iterations"); - startTime = System.nanoTime(); - do - { - totalDiff = graph.nextIteration(progressIndicator); - iter++; - } while(iter < maxIters && totalDiff > tolerance); - System.out.println(String.format("Done, took %f ms", (System.nanoTime() - startTime)/10.0e6)); - } - - private datafu.linkanalysis.PageRank.ProgressIndicator getDummyProgressIndicator() - { - return new datafu.linkanalysis.PageRank.ProgressIndicator() - { - @Override - public void progress() - { - // do nothing - } - }; - } - - private void validateExpectedRanks(datafu.linkanalysis.PageRank graph, Map<String,Integer> nodeIds, Map<String,Float> expectedRanks) - { - System.out.println("Validating page rank results"); - - for (Map.Entry<String,Integer> e : nodeIds.entrySet()) - { - float rank = graph.getNodeRank(e.getValue()); - - float expectedRank = expectedRanks.get(e.getKey()); - // require 0.1% accuracy - assert (Math.abs(expectedRank - rank*100.0f) < 0.1) : String.format("Did not get expected 
rank for %s", e.getKey()); - } - - System.out.println("All ranks match expected"); - } - - public static Map<String,Float> parseExpectedRanks(String[] expectedRanks) - { - Map<String,Float> expectedRanksMap = new HashMap<String,Float>(); - for (String expectedRankString : expectedRanks) - { - String[] parts = expectedRankString.split(" "); - assert parts.length == 2 : "Expected two parts"; - String name = parts[0]; - Float expectedRank = Float.parseFloat(parts[1]); - expectedRanksMap.put(name, expectedRank); - } - return expectedRanksMap; - } - - private Integer getOrCreateId(String name, Map<String,Integer> nodeIds) - { - if (nodeIds.containsKey(name)) - { - return nodeIds.get(name); - } - else - { - Integer id = nodeIds.size(); - nodeIds.put(name, id); - return id; - } - } -}
