BIGTOP-282. Removing MovieLens data files
Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/f7127056 Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/f7127056 Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/f7127056 Branch: refs/heads/master Commit: f7127056a2ade0df855c983596a9d0d68f8320d1 Parents: 5b07cef Author: Sean Mackrory <[email protected]> Authored: Tue May 13 08:55:03 2014 -0600 Committer: Sean Mackrory <[email protected]> Committed: Tue May 13 08:55:30 2014 -0600 ---------------------------------------------------------------------- .../hive/src/main/resources/scripts/ql/basic/in | 3 - .../src/main/resources/scripts/ql/basic/out | 6 - .../resources/seed_data_files/ml-data/README | 145 - .../resources/seed_data_files/ml-data/allbut.pl | 50 - .../resources/seed_data_files/ml-data/mku.sh | 41 - .../resources/seed_data_files/ml-data/u.data | 100000 ---------------- .../resources/seed_data_files/ml-data/u.genre | 20 - .../resources/seed_data_files/ml-data/u.info | 3 - .../resources/seed_data_files/ml-data/u.item | 1682 - .../seed_data_files/ml-data/u.occupation | 21 - .../resources/seed_data_files/ml-data/u.user | 943 - .../resources/seed_data_files/ml-data/u1.base | 80000 ------------- .../resources/seed_data_files/ml-data/u1.test | 20000 ---- .../resources/seed_data_files/ml-data/u2.base | 80000 ------------- .../resources/seed_data_files/ml-data/u2.test | 20000 ---- .../resources/seed_data_files/ml-data/u3.base | 80000 ------------- .../resources/seed_data_files/ml-data/u3.test | 20000 ---- .../resources/seed_data_files/ml-data/u4.base | 80000 ------------- .../resources/seed_data_files/ml-data/u4.test | 20000 ---- .../resources/seed_data_files/ml-data/u5.base | 80000 ------------- .../resources/seed_data_files/ml-data/u5.test | 20000 ---- .../resources/seed_data_files/ml-data/ua.base | 90570 -------------- .../resources/seed_data_files/ml-data/ua.test | 9430 -- .../resources/seed_data_files/ml-data/ub.base | 90570 -------------- .../resources/seed_data_files/ml-data/ub.test | 9430 -- .../hive/src/main/resources/test.hql | 3 - 26 files changed, 802917 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in index 0845daa..56a9d85 100644 --- a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in +++ b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/in @@ -23,9 +23,6 @@ ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE; -LOAD DATA LOCAL INPATH 'seed_data_files/ml-data/u.data' -OVERWRITE INTO TABLE u_data; - INSERT OVERWRITE DIRECTORY '/tmp/count' SELECT COUNT(1) FROM u_data; dfs -cat /tmp/count/* ; http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out index 87b086b..809a1c1 100644 --- a/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out +++ b/bigtop-tests/test-artifacts/hive/src/main/resources/scripts/ql/basic/out @@ -11,12 +11,6 @@ FIELDS TERMINATED BY '\t' STORED AS TEXTFILE -LOAD DATA LOCAL INPATH 'seed_data_files/ml-data/u.data' -OVERWRITE INTO TABLE u_data -Copying file: file:/root/bigtop/bigtop-tests/test-execution/smokes/hive/target/seed_data_files/ml-data/u.data -Deleted /user/hive/warehouse/u_data - - INSERT OVERWRITE DIRECTORY '/tmp/count' SELECT COUNT(1) FROM u_data dfs -cat /tmp/count/* http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README deleted file mode 100644 index 8118ee5..0000000 --- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/README +++ /dev/null @@ -1,145 +0,0 @@ -SUMMARY & USAGE LICENSE -============================================= - -MovieLens data sets were collected by the GroupLens Research Project -at the University of Minnesota. - -This data set consists of: - * 100,000 ratings (1-5) from 943 users on 1682 movies. - * Each user has rated at least 20 movies. - * Simple demographic info for the users (age, gender, occupation, zip) - -The data was collected through the MovieLens web site -(movielens.umn.edu) during the seven-month period from September 19th, -1997 through April 22nd, 1998. This data has been cleaned up - users -who had less than 20 ratings or did not have complete demographic -information were removed from this data set. Detailed descriptions of -the data file can be found at the end of this file. - -Neither the University of Minnesota nor any of the researchers -involved can guarantee the correctness of the data, its suitability -for any particular purpose, or the validity of results based on the -use of the data set. The data set may be used for any research -purposes under the following conditions: - - * The user may not state or imply any endorsement from the - University of Minnesota or the GroupLens Research Group. - - * The user must acknowledge the use of the data set in - publications resulting from the use of the data set, and must - send us an electronic or paper copy of those publications. - - * The user may not redistribute the data without separate - permission. - - * The user may not use this information for any commercial or - revenue-bearing purposes without first obtaining permission - from a faculty member of the GroupLens Research Project at the - University of Minnesota. - -If you have any further questions or comments, please contact Jon Herlocker -<[email protected]>. - -ACKNOWLEDGEMENTS -============================================== - -Thanks to Al Borchers for cleaning up this data and writing the -accompanying scripts. - -PUBLISHED WORK THAT HAS USED THIS DATASET -============================================== - -Herlocker, J., Konstan, J., Borchers, A., Riedl, J.. An Algorithmic -Framework for Performing Collaborative Filtering. Proceedings of the -1999 Conference on Research and Development in Information -Retrieval. Aug. 1999. - -FURTHER INFORMATION ABOUT THE GROUPLENS RESEARCH PROJECT -============================================== - -The GroupLens Research Project is a research group in the Department -of Computer Science and Engineering at the University of Minnesota. -Members of the GroupLens Research Project are involved in many -research projects related to the fields of information filtering, -collaborative filtering, and recommender systems. The project is lead -by professors John Riedl and Joseph Konstan. The project began to -explore automated collaborative filtering in 1992, but is most well -known for its world wide trial of an automated collaborative filtering -system for Usenet news in 1996. The technology developed in the -Usenet trial formed the base for the formation of Net Perceptions, -Inc., which was founded by members of GroupLens Research. Since then -the project has expanded its scope to research overall information -filtering solutions, integrating in content-based methods as well as -improving current collaborative filtering technology. - -Further information on the GroupLens Research project, including -research publications, can be found at the following web site: - - http://www.grouplens.org/ - -GroupLens Research currently operates a movie recommender based on -collaborative filtering: - - http://www.movielens.org/ - -DETAILED DESCRIPTIONS OF DATA FILES -============================================== - -Here are brief descriptions of the data. - -ml-data.tar.gz -- Compressed tar file. To rebuild the u data files do this: - gunzip ml-data.tar.gz - tar xvf ml-data.tar - mku.sh - -u.data -- The full u data set, 100000 ratings by 943 users on 1682 items. - Each user has rated at least 20 movies. Users and items are - numbered consecutively from 1. The data is randomly - ordered. This is a tab separated list of - user id | item id | rating | timestamp. - The time stamps are unix seconds since 1/1/1970 UTC - -u.info -- The number of users, items, and ratings in the u data set. - -u.item -- Information about the items (movies); this is a tab separated - list of - movie id | movie title | release date | video release date | - IMDb URL | unknown | Action | Adventure | Animation | - Children's | Comedy | Crime | Documentary | Drama | Fantasy | - Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | - Thriller | War | Western | - The last 19 fields are the genres, a 1 indicates the movie - is of that genre, a 0 indicates it is not; movies can be in - several genres at once. - The movie ids are the ones used in the u.data data set. - -u.genre -- A list of the genres. - -u.user -- Demographic information about the users; this is a tab - separated list of - user id | age | gender | occupation | zip code - The user ids are the ones used in the u.data data set. - -u.occupation -- A list of the occupations. - -u1.base -- The data sets u1.base and u1.test through u5.base and u5.test -u1.test are 80%/20% splits of the u data into training and test data. -u2.base Each of u1, ..., u5 have disjoint test sets; this if for -u2.test 5 fold cross validation (where you repeat your experiment -u3.base with each training and test set and average the results). -u3.test These data sets can be generated from u.data by mku.sh. -u4.base -u4.test -u5.base -u5.test - -ua.base -- The data sets ua.base, ua.test, ub.base, and ub.test -ua.test split the u data into a training set and a test set with -ub.base exactly 10 ratings per user in the test set. The sets -ub.test ua.test and ub.test are disjoint. These data sets can - be generated from u.data by mku.sh. - -allbut.pl -- The script that generates training and test sets where - all but n of a users ratings are in the training data. - -mku.sh -- A shell script to generate all the u data sets from u.data. http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl deleted file mode 100755 index 3ec46e7..0000000 --- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/allbut.pl +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/local/bin/perl - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# get args -if (@ARGV < 3) { - print STDERR "Usage: $0 base_name start stop max_test [ratings ...]\n"; - exit 1; -} -$basename = shift; -$start = shift; -$stop = shift; -$maxtest = shift; - -# open files -open( TESTFILE, ">$basename.test" ) or die "Cannot open $basename.test for writing\n"; -open( BASEFILE, ">$basename.base" ) or die "Cannot open $basename.base for writing\n"; - -# init variables -$testcnt = 0; - -while (<>) { - ($user) = split; - if (! defined $ratingcnt{$user}) { - $ratingcnt{$user} = 0; - } - ++$ratingcnt{$user}; - if (($testcnt < $maxtest || $maxtest <= 0) - && $ratingcnt{$user} >= $start && $ratingcnt{$user} <= $stop) { - ++$testcnt; - print TESTFILE; - } - else { - print BASEFILE; - } -} http://git-wip-us.apache.org/repos/asf/bigtop/blob/f7127056/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh ---------------------------------------------------------------------- diff --git a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh b/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh deleted file mode 100755 index 8dd03aa..0000000 --- a/bigtop-tests/test-artifacts/hive/src/main/resources/seed_data_files/ml-data/mku.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -trap `rm -f tmp.$$; exit 1` 1 2 15 - -for i in 1 2 3 4 5 -do - head -`expr $i \* 20000` u.data | tail -20000 > tmp.$$ - sort -t" " -k 1,1n -k 2,2n tmp.$$ > u$i.test - head -`expr \( $i - 1 \) \* 20000` u.data > tmp.$$ - tail -`expr \( 5 - $i \) \* 20000` u.data >> tmp.$$ - sort -t" " -k 1,1n -k 2,2n tmp.$$ > u$i.base -done - -allbut.pl ua 1 10 100000 u.data -sort -t" " -k 1,1n -k 2,2n ua.base > tmp.$$ -mv tmp.$$ ua.base -sort -t" " -k 1,1n -k 2,2n ua.test > tmp.$$ -mv tmp.$$ ua.test - -allbut.pl ub 11 20 100000 u.data -sort -t" " -k 1,1n -k 2,2n ub.base > tmp.$$ -mv tmp.$$ ub.base -sort -t" " -k 1,1n -k 2,2n ub.test > tmp.$$ -mv tmp.$$ ub.test -
