[20/20] git commit: BIGTOP-282. Removing MovieLens data files

mackrorysd Tue, 13 May 2014 11:59:59 -0700

BIGTOP-282. Removing MovieLens data files


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/f7127056
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/f7127056
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/f7127056

Branch: refs/heads/master
Commit: f7127056a2ade0df855c983596a9d0d68f8320d1
Parents: 5b07cef
Author: Sean Mackrory <[email protected]>
Authored: Tue May 13 08:55:03 2014 -0600
Committer: Sean Mackrory <[email protected]>
Committed: Tue May 13 08:55:30 2014 -0600

----------------------------------------------------------------------
 .../hive/src/main/resources/scripts/ql/basic/in |      3 -
 .../src/main/resources/scripts/ql/basic/out     |      6 -
 .../resources/seed_data_files/ml-data/README    |    145 -
 .../resources/seed_data_files/ml-data/allbut.pl |     50 -
 .../resources/seed_data_files/ml-data/mku.sh    |     41 -
 .../resources/seed_data_files/ml-data/u.data    | 100000 ----------------
 .../resources/seed_data_files/ml-data/u.genre   |     20 -
 .../resources/seed_data_files/ml-data/u.info    |      3 -
 .../resources/seed_data_files/ml-data/u.item    |   1682 -
 .../seed_data_files/ml-data/u.occupation        |     21 -
 .../resources/seed_data_files/ml-data/u.user    |    943 -
 .../resources/seed_data_files/ml-data/u1.base   |  80000 -------------
 .../resources/seed_data_files/ml-data/u1.test   |  20000 ----
 .../resources/seed_data_files/ml-data/u2.base   |  80000 -------------
 .../resources/seed_data_files/ml-data/u2.test   |  20000 ----
 .../resources/seed_data_files/ml-data/u3.base   |  80000 -------------
 .../resources/seed_data_files/ml-data/u3.test   |  20000 ----
 .../resources/seed_data_files/ml-data/u4.base   |  80000 -------------
 .../resources/seed_data_files/ml-data/u4.test   |  20000 ----
 .../resources/seed_data_files/ml-data/u5.base   |  80000 -------------
 .../resources/seed_data_files/ml-data/u5.test   |  20000 ----
 .../resources/seed_data_files/ml-data/ua.base   |  90570 --------------
 .../resources/seed_data_files/ml-data/ua.test   |   9430 --
 .../resources/seed_data_files/ml-data/ub.base   |  90570 --------------
 .../resources/seed_data_files/ml-data/ub.test   |   9430 --
 .../hive/src/main/resources/test.hql            |      3 -
 26 files changed, 802917 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in
index 0845daa..56a9d85 100644
--- a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in
+++ b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in
@@ -23,9 +23,6 @@ ROW FORMAT DELIMITED
 FIELDS TERMINATED BY '\t'
 STORED AS TEXTFILE;
 
-LOAD DATA LOCAL INPATH 'seed_data_files/ml-data/u.data'
-OVERWRITE INTO TABLE u_data;
-
 INSERT OVERWRITE DIRECTORY '/tmp/count'
 SELECT COUNT(1) FROM u_data;
 dfs -cat /tmp/count/* ;

http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out
index 87b086b..809a1c1 100644
--- a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out
+++ b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out
@@ -11,12 +11,6 @@ FIELDS TERMINATED BY '\t'
 STORED AS TEXTFILE
 
 
-LOAD DATA LOCAL INPATH 'seed_data_files/ml-data/u.data'
-OVERWRITE INTO TABLE u_data
-Copying file: file:/root/bigtop/bigtop-tests/test-execution/smokes/hive/target/seed_data_files/ml-data/u.data
-Deleted /user/hive/warehouse/u_data
-
-
 INSERT OVERWRITE DIRECTORY '/tmp/count'
 SELECT COUNT(1) FROM u_data
 dfs -cat /tmp/count/*

http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README
deleted file mode 100644
index 8118ee5..0000000
--- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README
+++ /dev/null
@@ -1,145 +0,0 @@
-SUMMARY & USAGE LICENSE
-=============================================
-
-MovieLens data sets were collected by the GroupLens Research Project
-at the University of Minnesota.
- 
-This data set consists of:
-       * 100,000 ratings (1-5) from 943 users on 1682 movies. 
-       * Each user has rated at least 20 movies. 
-        * Simple demographic info for the users (age, gender, occupation, zip)
-
-The data was collected through the MovieLens web site
-(movielens.umn.edu) during the seven-month period from September 19th, 
-1997 through April 22nd, 1998. This data has been cleaned up - users
-who had less than 20 ratings or did not have complete demographic
-information were removed from this data set. Detailed descriptions of
-the data file can be found at the end of this file.
-
-Neither the University of Minnesota nor any of the researchers
-involved can guarantee the correctness of the data, its suitability
-for any particular purpose, or the validity of results based on the
-use of the data set.  The data set may be used for any research
-purposes under the following conditions:
-
-     * The user may not state or imply any endorsement from the
-       University of Minnesota or the GroupLens Research Group.
-
-     * The user must acknowledge the use of the data set in
-       publications resulting from the use of the data set, and must
-       send us an electronic or paper copy of those publications.
-
-     * The user may not redistribute the data without separate
-       permission.
-
-     * The user may not use this information for any commercial or
-       revenue-bearing purposes without first obtaining permission
-       from a faculty member of the GroupLens Research Project at the
-       University of Minnesota.
-
-If you have any further questions or comments, please contact Jon Herlocker
-<[email protected]>. 
-
-ACKNOWLEDGEMENTS
-==============================================
-
-Thanks to Al Borchers for cleaning up this data and writing the
-accompanying scripts.
-
-PUBLISHED WORK THAT HAS USED THIS DATASET
-==============================================
-
-Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic
-Framework for Performing Collaborative Filtering. Proceedings of the
-1999 Conference on Research and Development in Information
-Retrieval. Aug. 1999.
-
-FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT
-==============================================
-
-The GroupLens Research Project is a research group in the Department
-of Computer Science and Engineering at the University of Minnesota.
-Members of the GroupLens Research Project are involved in many
-research projects related to the fields of information filtering,
-collaborative filtering, and recommender systems. The project is lead
-by professors John Riedl and Joseph Konstan. The project began to
-explore automated collaborative filtering in 1992, but is most well
-known for its world wide trial of an automated collaborative filtering
-system for Usenet news in 1996.  The technology developed in the
-Usenet trial formed the base for the formation of Net Perceptions,
-Inc., which was founded by members of GroupLens Research. Since then
-the project has expanded its scope to research overall information
-filtering solutions, integrating in content-based methods as well as
-improving current collaborative filtering technology.
-
-Further information on the GroupLens Research project, including
-research publications, can be found at the following web site:
-        
-        http://www.grouplens.org/
-
-GroupLens Research currently operates a movie recommender based on
-collaborative filtering:
-
-        http://www.movielens.org/
-
-DETAILED DESCRIPTIONS OF DATA FILES
-==============================================
-
-Here are brief descriptions of the data.
-
-ml-data.tar.gz   -- Compressed tar file.  To rebuild the u data files do this:
-                gunzip ml-data.tar.gz
-                tar xvf ml-data.tar
-                mku.sh
-
-u.data     -- The full u data set, 100000 ratings by 943 users on 1682 items.
-              Each user has rated at least 20 movies.  Users and items are
-              numbered consecutively from 1.  The data is randomly
-              ordered. This is a tab separated list of 
-                user id | item id | rating | timestamp. 
-              The time stamps are unix seconds since 1/1/1970 UTC   
-
-u.info     -- The number of users, items, and ratings in the u data set.
-
-u.item     -- Information about the items (movies); this is a tab separated
-              list of
-              movie id | movie title | release date | video release date |
-              IMDb URL | unknown | Action | Adventure | Animation |
-              Children's | Comedy | Crime | Documentary | Drama | Fantasy |
-              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
-              Thriller | War | Western |
-              The last 19 fields are the genres, a 1 indicates the movie
-              is of that genre, a 0 indicates it is not; movies can be in
-              several genres at once.
-              The movie ids are the ones used in the u.data data set.
-
-u.genre    -- A list of the genres.
-
-u.user     -- Demographic information about the users; this is a tab
-              separated list of
-              user id | age | gender | occupation | zip code
-              The user ids are the ones used in the u.data data set.
-
-u.occupation -- A list of the occupations.
-
-u1.base    -- The data sets u1.base and u1.test through u5.base and u5.test
-u1.test       are 80%/20% splits of the u data into training and test data.
-u2.base       Each of u1, ..., u5 have disjoint test sets; this if for
-u2.test       5 fold cross validation (where you repeat your experiment
-u3.base       with each training and test set and average the results).
-u3.test       These data sets can be generated from u.data by mku.sh.
-u4.base
-u4.test
-u5.base
-u5.test
-
-ua.base    -- The data sets ua.base, ua.test, ub.base, and ub.test
-ua.test       split the u data into a training set and a test set with
-ub.base       exactly 10 ratings per user in the test set.  The sets
-ub.test       ua.test and ub.test are disjoint.  These data sets can
-              be generated from u.data by mku.sh.
-
-allbut.pl  -- The script that generates training and test sets where
-              all but n of a users ratings are in the training data.
-
-mku.sh     -- A shell script to generate all the u data sets from u.data.

http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl
deleted file mode 100755
index 3ec46e7..0000000
--- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/usr/local/bin/perl
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# get args
-if (@ARGV < 3) {
-       print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n";
-       exit 1;
-}
-$basename = shift;
-$start = shift;
-$stop = shift;
-$maxtest = shift;
-
-# open files
-open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n";
-open( BASEFILE, ">$basename.base" ) or die "Cannot open $basename.base for writing\n";
-
-# init variables
-$testcnt = 0;
-
-while (<>) {
-       ($user) = split;
-       if (! defined $ratingcnt{$user}) {
-               $ratingcnt{$user} = 0;
-       }
-       ++$ratingcnt{$user};
-       if (($testcnt < $maxtest || $maxtest <= 0)
-       && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) {
-               ++$testcnt;
-               print TESTFILE;
-       }
-       else {
-               print BASEFILE;
-       }
-}

http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh
----------------------------------------------------------------------
diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh
deleted file mode 100755
index 8dd03aa..0000000
--- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-trap `rm -f tmp.$$; exit 1` 1 2 15
-
-for i in 1 2 3 4 5
-do
-       head -`expr $i \* 20000` u.data | tail -20000 > tmp.$$
-       sort -t"        " -k 1,1n -k 2,2n tmp.$$ > u$i.test
-       head -`expr \( $i - 1 \) \* 20000` u.data > tmp.$$
-       tail -`expr \( 5 - $i \) \* 20000` u.data >> tmp.$$
-       sort -t"        " -k 1,1n -k 2,2n tmp.$$ > u$i.base
-done
-
-allbut.pl ua 1 10 100000 u.data
-sort -t"       " -k 1,1n -k 2,2n ua.base > tmp.$$
-mv tmp.$$ ua.base
-sort -t"       " -k 1,1n -k 2,2n ua.test > tmp.$$
-mv tmp.$$ ua.test
-
-allbut.pl ub 11 20 100000 u.data
-sort -t"       " -k 1,1n -k 2,2n ub.base > tmp.$$
-mv tmp.$$ ub.base
-sort -t"       " -k 1,1n -k 2,2n ub.test > tmp.$$
-mv tmp.$$ ub.test
-

[20/20] git commit: BIGTOP-282. Removing MovieLens data files

Reply via email to