http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/.gitignore ---------------------------------------------------------------------- diff --git a/.gitignore b/.gitignore index 0edb4bb..d27bbbc 100644 --- a/.gitignore +++ b/.gitignore @@ -12,11 +12,11 @@ output-asf-email-examples/ target/ examples/bin/tmp output -mr/build/ -mr/input/ -mr/output/ -mr/testdata/ -mr/temp +community/mr/build/ +community/mr/input/ +community/mr/output/ +community/mr/testdata/ +community/mr/temp temp foo math-tests/
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/flink-batch/pom.xml ---------------------------------------------------------------------- diff --git a/community/community-engines/flink-batch/pom.xml b/community/community-engines/flink-batch/pom.xml index 8103b21..64083f7 100644 --- a/community/community-engines/flink-batch/pom.xml +++ b/community/community-engines/flink-batch/pom.xml @@ -27,7 +27,7 @@ <relativePath>../pom.xml</relativePath> </parent> - <artifactId>mahout-flink-batch_2.10</artifactId> + <artifactId>mahout-flink-batch_${scala.compat.version}</artifactId> <name>-- Mahout Flink Engine (Community)</name> <description> Mahout Bindings for Apache Flink @@ -35,7 +35,6 @@ <properties> <flink.version>1.1.4</flink.version> - <scala.compat.version>2.10</scala.compat.version> </properties> <packaging>jar</packaging> @@ -193,11 +192,11 @@ <version>${project.version}</version> </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>1.2.2</version> - </dependency> + <!--<dependency>--> + <!--<groupId>org.bytedeco</groupId>--> + <!--<artifactId>javacpp</artifactId>--> + <!--<version>1.2.2</version>--> + <!--</dependency>--> <!-- enforce current version of kryo as of 0.10.1--> <dependency> http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/pom.xml ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/pom.xml b/community/community-engines/h2o/pom.xml index a5c5e4f..89f560c 100644 --- a/community/community-engines/h2o/pom.xml +++ b/community/community-engines/h2o/pom.xml @@ -27,7 +27,7 @@ <relativePath>../pom.xml</relativePath> </parent> - <artifactId>mahout-h2o_2.10</artifactId> + <artifactId>mahout-h2o_${scala.compat.version}</artifactId> <name>-- Mahout H2O Engine (Community)</name> <description> H2O Backend for Mahout DSL @@ -37,8 +37,6 @@ <properties> 
<h2o.version>0.1.25</h2o.version> - <scala.compat.version>2.10</scala.compat.version> - <scala.version>2.10.5</scala.version> </properties> @@ -162,17 +160,14 @@ </build> <dependencies> - <dependency> - <groupId>org.scala-lang</groupId> - <artifactId>scala-library</artifactId> - <version>${scala.version}</version> - </dependency> - <dependency> - <groupId>org.apache.mahout</groupId> - <artifactId>mahout-math-scala_${scala.compat.version}</artifactId> - <version>${project.version}</version> - </dependency> + + <!--<dependency>--> + <!--<groupId>org.apache.mahout</groupId>--> + <!--<!–<artifactId>mahout-math-scala_${scala.compat.version}</artifactId>–>--> + <!--<artifactId>a</artifactId>--> + <!--<version>${project.version}</version>--> + <!--</dependency>--> <!-- for MatrixWritable and VectorWritable --> <dependency> @@ -219,6 +214,10 @@ <artifactId>scalatest_${scala.compat.version}</artifactId> </dependency> + <dependency> + <groupId>org.scala-lang</groupId> + <artifactId>scala-library</artifactId> + </dependency> </dependencies> <profiles> http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.alphaIDrm.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.alphaIDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.alphaIDrm.drm.crc deleted file mode 100644 index a46b916..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.alphaIDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.isComplementaryDrm.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.isComplementaryDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.isComplementaryDrm.drm.crc deleted file mode 100644 index 
ed8dd37..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.isComplementaryDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.labelIndex.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.labelIndex.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.labelIndex.drm.crc deleted file mode 100644 index 73c0476..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.labelIndex.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.perlabelThetaNormalizerDrm.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.perlabelThetaNormalizerDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.perlabelThetaNormalizerDrm.drm.crc deleted file mode 100644 index f4e564e..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.perlabelThetaNormalizerDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerFeatureDrm.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerFeatureDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerFeatureDrm.drm.crc deleted file mode 100644 index 9e5209b..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerFeatureDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelAndFeatureDrm.drm.crc 
---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelAndFeatureDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelAndFeatureDrm.drm.crc deleted file mode 100644 index bc53d76..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelAndFeatureDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelDrm.drm.crc ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelDrm.drm.crc b/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelDrm.drm.crc deleted file mode 100644 index 514624c..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/.weightsPerLabelDrm.drm.crc and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/alphaIDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/alphaIDrm.drm b/community/community-engines/h2o/tmp/naiveBayesModel/alphaIDrm.drm deleted file mode 100644 index d821c41..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/alphaIDrm.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/isComplementaryDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/isComplementaryDrm.drm b/community/community-engines/h2o/tmp/naiveBayesModel/isComplementaryDrm.drm deleted file mode 100644 index 0e46cf2..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/isComplementaryDrm.drm and 
/dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/labelIndex.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/labelIndex.drm b/community/community-engines/h2o/tmp/naiveBayesModel/labelIndex.drm deleted file mode 100644 index ceca885..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/labelIndex.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/perlabelThetaNormalizerDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/perlabelThetaNormalizerDrm.drm b/community/community-engines/h2o/tmp/naiveBayesModel/perlabelThetaNormalizerDrm.drm deleted file mode 100644 index 38b4904..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/perlabelThetaNormalizerDrm.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerFeatureDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerFeatureDrm.drm b/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerFeatureDrm.drm deleted file mode 100644 index 1c3beea..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerFeatureDrm.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelAndFeatureDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelAndFeatureDrm.drm 
b/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelAndFeatureDrm.drm deleted file mode 100644 index a2a0e13..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelAndFeatureDrm.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelDrm.drm ---------------------------------------------------------------------- diff --git a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelDrm.drm b/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelDrm.drm deleted file mode 100644 index 15c894f..0000000 Binary files a/community/community-engines/h2o/tmp/naiveBayesModel/weightsPerLabelDrm.drm and /dev/null differ http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/pom.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/pom.xml b/community/mahout-mr/pom.xml new file mode 100644 index 0000000..6db69a1 --- /dev/null +++ b/community/mahout-mr/pom.xml @@ -0,0 +1,309 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.mahout</groupId> + <artifactId>community</artifactId> + <version>0.13.1-SNAPSHOT</version> + <relativePath>../pom.xml</relativePath> + </parent> + + <!-- modules inherit parent's group id and version. --> + <artifactId>mahout-mr</artifactId> + <name>- Mahout Classic (Map-Reduce Based Machine Learning)</name> + <description>Scalable machine learning libraries</description> + + <packaging>jar</packaging> + + <properties> + <hadoop.version>2.4.1</hadoop.version> + <lucene.version>5.5.2</lucene.version> + </properties> + <build> + <resources> + <resource> + <directory>src/main/resources</directory> + </resource> + <resource> + <directory>../src/conf</directory> + <includes> + <include>driver.classes.default.props</include> + </includes> + </resource> + </resources> + <plugins> + <!-- copy jars to top directory, which is MAHOUT_HOME --> + <plugin> + <artifactId>maven-antrun-plugin</artifactId> + <version>1.4</version> + <executions> + <execution> + <id>copy</id> + <phase>package</phase> + <configuration> + <tasks> + <copy file="target/mahout-mr-${version}.jar" tofile="../mahout-mr-${version}.jar" /> + </tasks> + </configuration> + <goals> + <goal>run</goal> + </goals> + </execution> + </executions> + </plugin> + <!-- create test jar so other modules can reuse the core test utility classes. 
--> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + + <!-- create core hadoop job jar --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>job</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + <configuration> + <descriptors> + <descriptor>src/main/assembly/job.xml</descriptor> + </descriptors> + </configuration> + </execution> + </executions> + </plugin> + + <plugin> + <artifactId>maven-javadoc-plugin</artifactId> + </plugin> + + <plugin> + <artifactId>maven-source-plugin</artifactId> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-remote-resources-plugin</artifactId> + <configuration> + <appendedResourcesDirectory>src/main/resources</appendedResourcesDirectory> + <resourceBundles> + <resourceBundle>org.apache:apache-jar-resource-bundle:1.4</resourceBundle> + </resourceBundles> + <supplementalModels> + <supplementalModel>supplemental-models.xml</supplementalModel> + </supplementalModels> + </configuration> + </plugin> + <!-- remove jars from top directory on clean --> + <plugin> + <artifactId>maven-clean-plugin</artifactId> + <version>3.0.0</version> + <configuration> + <filesets> + <fileset> + <directory>../</directory> + <includes> + <include>mahout-mr*.jar</include> + </includes> + <followSymlinks>false</followSymlinks> + </fileset> + </filesets> + </configuration> + </plugin> + </plugins> + </build> + + <dependencies> + + <!-- our modules --> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>core</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + 
<scope>test</scope> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + <version>${project.version}</version> + </dependency> + + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>mahout-hdfs</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + + <!-- Third Party --> + + <dependency> + <groupId>com.tdunning</groupId> + <artifactId>t-digest</artifactId> + <version>3.1</version> + </dependency> + + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>11.0.2</version> + </dependency> + + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-client</artifactId> + <version>${hadoop.version}</version> + </dependency> + + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>2.7.4</version> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + <version>${slf4j.version}</version> + </dependency> + + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-jcl</artifactId> + <version>${slf4j.version}</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + <version>3.1</version> + </dependency> + + <dependency> + <groupId>commons-cli</groupId> + <artifactId>commons-cli</artifactId> + <version>1.2</version> + </dependency> + + <dependency> + <groupId>com.thoughtworks.xstream</groupId> + <artifactId>xstream</artifactId> + <version>1.4.4</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>${lucene.version}</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>${lucene.version}</version> + </dependency> + + 
<dependency> + <groupId>org.apache.mahout.commons</groupId> + <artifactId>commons-cli</artifactId> + <version>2.0-mahout</version> + </dependency> + + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.2</version> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>4.12</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.hamcrest</groupId> + <artifactId>hamcrest-all</artifactId> + <version>1.3</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>com.carrotsearch.randomizedtesting</groupId> + <artifactId>randomizedtesting-runner</artifactId> + <version>2.0.15</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.easymock</groupId> + <artifactId>easymock</artifactId> + <version>3.2</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <version>1.0.0</version> + <classifier>hadoop2</classifier> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>commons-httpclient</groupId> + <artifactId>commons-httpclient</artifactId> + <version>3.0.1</version> + <scope>test</scope> + </dependency> + + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-commons-csv</artifactId> + <version>3.5.0</version> + </dependency> + + </dependencies> + +</project> http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/assembly/job.xml ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/assembly/job.xml b/community/mahout-mr/src/main/assembly/job.xml new file mode 100644 index 0000000..2bdb3ce --- /dev/null +++ b/community/mahout-mr/src/main/assembly/job.xml @@ -0,0 +1,61 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor 
license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<assembly + xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 + http://maven.apache.org/xsd/assembly-1.1.0.xsd"> + <id>job</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>true</unpack> + <unpackOptions> + <!-- MAHOUT-1126 --> + <excludes> + <exclude>META-INF/LICENSE</exclude> + </excludes> + </unpackOptions> + <scope>runtime</scope> + <outputDirectory>/</outputDirectory> + <useTransitiveFiltering>true</useTransitiveFiltering> + <excludes> + <exclude>org.apache.hadoop:hadoop-core</exclude> + </excludes> + </dependencySet> + </dependencySets> + <fileSets> + <fileSet> + <directory>${basedir}/target/classes</directory> + <outputDirectory>/</outputDirectory> + <excludes> + <exclude>*.jar</exclude> + </excludes> + </fileSet> + <fileSet> + <directory>${basedir}/target/classes</directory> + <outputDirectory>/</outputDirectory> + <includes> + <include>driver.classes.default.props</include> + </includes> + </fileSet> + </fileSets> +</assembly> 
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/Version.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/Version.java b/community/mahout-mr/src/main/java/org/apache/mahout/Version.java new file mode 100644 index 0000000..5f3c879 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/Version.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout; + +import com.google.common.base.Charsets; +import com.google.common.io.Resources; + +import java.io.IOException; + +public final class Version { + + private Version() { + } + + public static String version() { + return Version.class.getPackage().getImplementationVersion(); + } + + public static String versionFromResource() throws IOException { + return Resources.toString(Resources.getResource("version"), Charsets.UTF_8); + } + + public static void main(String[] args) throws IOException { + System.out.println(version() + ' ' + versionFromResource()); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java new file mode 100644 index 0000000..1ac5b72 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchItemException.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.common; + +public final class NoSuchItemException extends TasteException { + + public NoSuchItemException() { } + + public NoSuchItemException(long itemID) { + this(String.valueOf(itemID)); + } + + public NoSuchItemException(String message) { + super(message); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java new file mode 100644 index 0000000..cbb60fa --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/NoSuchUserException.java @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.common; + +public final class NoSuchUserException extends TasteException { + + public NoSuchUserException() { } + + public NoSuchUserException(long userID) { + this(String.valueOf(userID)); + } + + public NoSuchUserException(String message) { + super(message); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java new file mode 100644 index 0000000..9b26bee --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Refreshable.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.common; + +import java.util.Collection; + +/** + * <p> + * Implementations of this interface have state that can be periodically refreshed. For example, an + * implementation instance might contain some pre-computed information that should be periodically refreshed. 
+ * The {@link #refresh(Collection)} method triggers such a refresh. + * </p> + * + * <p> + * All Taste components implement this. In particular, + * {@link org.apache.mahout.cf.taste.recommender.Recommender}s do. Callers may want to call + * {@link #refresh(Collection)} periodically to re-compute information throughout the system and bring it up + * to date, though this operation may be expensive. + * </p> + */ +public interface Refreshable { + + /** + * <p> + * Triggers "refresh" -- whatever that means -- of the implementation. The general contract is that any + * {@link Refreshable} should always leave itself in a consistent, operational state, and that the refresh + * atomically updates internal state from old to new. + * </p> + * + * @param alreadyRefreshed + * {@link org.apache.mahout.cf.taste.common.Refreshable}s that are known to have already been + * refreshed as a result of an initial call to a {@link #refresh(Collection)} method on some + * object. This ensures that objects in a refresh dependency graph aren't refreshed twice + * needlessly. + */ + void refresh(Collection<Refreshable> alreadyRefreshed); + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java new file mode 100644 index 0000000..1792eff --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/TasteException.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.common; + +/** + * <p> + * An exception thrown when an error occurs inside the Taste engine. + * </p> + */ +public class TasteException extends Exception { + + public TasteException() { } + + public TasteException(String message) { + super(message); + } + + public TasteException(Throwable cause) { + super(cause); + } + + public TasteException(String message, Throwable cause) { + super(message, cause); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java new file mode 100644 index 0000000..4e39617 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/common/Weighting.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.common; + +/** + * <p> + * A simple enum which gives symbolic names to the ideas of "weighted" and "unweighted", to make various API + * calls which take a weighting parameter more readable. + * </p> + */ +public enum Weighting { + + /** Symbolic name for the "weighted" choice of a weighting parameter. */ WEIGHTED, + /** Symbolic name for the "unweighted" choice of a weighting parameter. */ UNWEIGHTED + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java new file mode 100644 index 0000000..875c65e --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/DataModelBuilder.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.eval; + +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; + +/** + * <p> + * Implementations of this interface are simple helper classes which create a {@link DataModel} to be + * used while evaluating a {@link org.apache.mahout.cf.taste.recommender.Recommender}. + * </p> + * @see RecommenderBuilder + * @see RecommenderEvaluator + */ +public interface DataModelBuilder { + + /** + * <p> + * Builds a {@link DataModel} implementation to be used in an evaluation, given training data. + * </p> + * + * @param trainingData + * data to be used in the {@link DataModel}: {@link PreferenceArray}s keyed by ID (presumably user ID -- TODO confirm) + * @return {@link DataModel} based upon the given data + */ + DataModel buildDataModel(FastByIDMap<PreferenceArray> trainingData); + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java new file mode 100644 index 0000000..9c442ff --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/IRStatistics.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements.
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.eval; + +/** + * <p> + * Implementations encapsulate information retrieval-related statistics about a + * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations. + * </p> + * + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>. + * </p> + */ +public interface IRStatistics { + + /** Returns the precision statistic. + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Precision">Precision</a>. + * </p> + */ + double getPrecision(); + + /** Returns the recall statistic. + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Recall">Recall</a>. + * </p> + */ + double getRecall(); + + /** Returns the fall-out statistic. + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#Fall-Out">Fall-Out</a>. + * </p> + */ + double getFallOut(); + + /** Returns the F1 measure. + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>. + * </p> + */ + double getF1Measure(); + + /** Returns the F-measure parameterized by {@code n}; presumably {@code n} is the weight of the general F-measure, with {@link #getF1Measure()} the n = 1 case (TODO confirm). + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval#F-measure">F-measure</a>. + * </p> + */ + double getFNMeasure(double n); + + /** Returns the normalized discounted cumulative gain. + * <p> + * See <a href="http://en.wikipedia.org/wiki/Discounted_cumulative_gain#Normalized_DCG"> + * Normalized Discounted Cumulative Gain</a>.
+ * </p> + */ + double getNormalizedDiscountedCumulativeGain(); + + /** + * @return the fraction of all users for whom recommendations could be produced + */ + double getReach(); + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java new file mode 100644 index 0000000..1805092 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderBuilder.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.eval; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.recommender.Recommender; + +/** + * <p> + * Implementations of this interface are simple helper classes which create a {@link Recommender} to be + * evaluated based on the given {@link DataModel}.
+ * </p> + */ +public interface RecommenderBuilder { + + /** + * <p> + * Builds a {@link Recommender} implementation to be evaluated, using the given {@link DataModel}. {@link RecommenderEvaluator#evaluate} takes such a builder instead of a {@link Recommender} so that the recommender can be built on top of a given {@link DataModel}. + * </p> + * + * @param dataModel + * {@link DataModel} to build the {@link Recommender} on + * @return {@link Recommender} based upon the given {@link DataModel} + * @throws TasteException + * if an error occurs while accessing the {@link DataModel} + */ + Recommender buildRecommender(DataModel dataModel) throws TasteException; + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java new file mode 100644 index 0000000..dcbbcf8 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderEvaluator.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.mahout.cf.taste.eval; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.model.DataModel; + +/** + * <p> + * Implementations of this interface evaluate the quality of a + * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations. + * </p> + */ +public interface RecommenderEvaluator { + + /** + * <p> + * Evaluates the quality of a {@link org.apache.mahout.cf.taste.recommender.Recommender}'s recommendations. + * The range of values that may be returned depends on the implementation, but <em>lower</em> values must + * mean better recommendations, with 0 being the lowest / best possible evaluation, meaning a perfect match. + * This method does not accept a {@link org.apache.mahout.cf.taste.recommender.Recommender} directly, but + * rather a {@link RecommenderBuilder} which can build the + * {@link org.apache.mahout.cf.taste.recommender.Recommender} to test on top of a given {@link DataModel}. + * </p> + * + * <p> + * Implementations will take a certain percentage of the preferences supplied by the given {@link DataModel} + * as "training data". This is typically most of the data, like 90%. This data is used to produce + * recommendations, and the rest of the data is compared against estimated preference values to see how much + * the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s predicted preferences match the user's + * real preferences. Specifically, for each user, this percentage of the user's ratings is used to produce + * recommendations, and for each user, the estimates for the remaining preferences are compared against the user's real + * preferences. + * </p> + * + * <p> + * For large datasets, it may be desirable to only evaluate based on a small percentage of the data. + * {@code evaluationPercentage} controls how many of the {@link DataModel}'s users are used in + * evaluation.
+ * </p> + * + * <p> + * To be clear, {@code trainingPercentage} and {@code evaluationPercentage} are not related. They + * do not need to add up to 1.0, for example. + * </p> + * + * @param recommenderBuilder + * object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test + * @param dataModelBuilder + * {@link DataModelBuilder} to use, or if null, a default {@link DataModel} + * implementation will be used + * @param dataModel + * dataset to test on + * @param trainingPercentage + * percentage of each user's preferences to use to produce recommendations; the rest are compared + * to estimated preference values to evaluate + * {@link org.apache.mahout.cf.taste.recommender.Recommender} performance + * @param evaluationPercentage + * percentage of users to use in evaluation + * @return a "score" representing how well the {@link org.apache.mahout.cf.taste.recommender.Recommender}'s + * estimated preferences match real values; <em>lower</em> scores mean a better match and 0 is a + * perfect match + * @throws TasteException + * if an error occurs while accessing the {@link DataModel} + */ + double evaluate(RecommenderBuilder recommenderBuilder, + DataModelBuilder dataModelBuilder, + DataModel dataModel, + double trainingPercentage, + double evaluationPercentage) throws TasteException; + + /** + * @deprecated see {@link DataModel#getMaxPreference()} + */ + @Deprecated + float getMaxPreference(); + + /** @deprecated deprecated along with {@link #getMaxPreference()}; no replacement is named in this interface */ @Deprecated + void setMaxPreference(float maxPreference); + + /** + * @deprecated see {@link DataModel#getMinPreference()} + */ + @Deprecated + float getMinPreference(); + + /** @deprecated deprecated along with {@link #getMinPreference()}; no replacement is named in this interface */ @Deprecated + void setMinPreference(float minPreference); + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java ---------------------------------------------------------------------- diff --git
a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java new file mode 100644 index 0000000..6e4e9c7 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RecommenderIRStatsEvaluator.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.eval; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.recommender.IDRescorer; + +/** + * <p> + * Implementations collect information retrieval-related statistics on a + * {@link org.apache.mahout.cf.taste.recommender.Recommender}'s performance, including precision, recall and + * f-measure. + * </p> + * + * <p> + * See <a href="http://en.wikipedia.org/wiki/Information_retrieval">Information retrieval</a>. 
+ */ +public interface RecommenderIRStatsEvaluator { + + /** Computes {@link IRStatistics} for the recommender built by {@code recommenderBuilder}. {@code evaluationPercentage} has no tag below; presumably it is the share of users to evaluate, as documented for {@link RecommenderEvaluator#evaluate} (TODO confirm). + * @param recommenderBuilder + * object that can build a {@link org.apache.mahout.cf.taste.recommender.Recommender} to test + * @param dataModelBuilder + * {@link DataModelBuilder} to use, or if null, a default {@link DataModel} implementation will be + * used + * @param dataModel + * dataset to test on + * @param rescorer + * if any, to use when computing recommendations + * @param at + * as in, "precision at 5". The number of recommendations to consider when evaluating precision, + * etc. + * @param relevanceThreshold + * items whose preference value is at least this value are considered "relevant" for the purposes + * of computations + * @return {@link IRStatistics} with resulting precision, recall, etc. + * @throws TasteException + * if an error occurs while accessing the {@link DataModel} + */ + IRStatistics evaluate(RecommenderBuilder recommenderBuilder, + DataModelBuilder dataModelBuilder, + DataModel dataModel, + IDRescorer rescorer, + int at, + double relevanceThreshold, + double evaluationPercentage) throws TasteException; + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java new file mode 100644 index 0000000..da318d5 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/eval/RelevantItemsDataSplitter.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.eval; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; + +/** + * Implementations of this interface determine the items that are considered relevant, + * and split data into a training and test subset, for purposes of precision/recall + * tests as implemented by implementations of {@link RecommenderIRStatsEvaluator}. + */ +public interface RelevantItemsDataSplitter { + + /** + * During testing, relevant items are removed from a particular user's preferences, + * and a model is built using this user's other preferences and all other users. + * {@code userID} and {@code dataModel} are untagged here; presumably the user whose relevant items are sought and the model holding the preferences (TODO confirm). + * @param at Maximum number of items to be removed + * @param relevanceThreshold Minimum strength of preference for an item to be considered + * relevant + * @return IDs of relevant items + */ + FastIDSet getRelevantItemsIDs(long userID, + int at, + double relevanceThreshold, + DataModel dataModel) throws TasteException; + + /** + * Adds a single user and all their preferences to the training model.
+ * + * @param userID ID of user whose preferences we are trying to predict + * @param relevantItemIDs IDs of items considered relevant to that user + * @param trainingUsers the database of training preferences to which we will + * append the ones for otherUserID. + * @param otherUserID ID of the user for whom we are adding preferences to the training model + */ + void processOtherUser(long userID, + FastIDSet relevantItemIDs, + FastByIDMap<PreferenceArray> trainingUsers, + long otherUserID, + DataModel dataModel) throws TasteException; + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java new file mode 100644 index 0000000..e70a675 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityEntityWritable.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.mahout.cf.taste.hadoop; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import com.google.common.primitives.Longs; +import org.apache.hadoop.io.WritableComparable; +import org.apache.mahout.math.Varint; + +/** A {@link WritableComparable} encapsulating two items, held as two {@code long} IDs and ordered by the first ID, then the second. */ +public final class EntityEntityWritable implements WritableComparable<EntityEntityWritable>, Cloneable { + + /** ID of the first entity. */ private long aID; + /** ID of the second entity. */ private long bID; + + /** Creates an instance with both IDs at their default of 0; presumably meant to be filled in by {@link #readFields(DataInput)}. */ public EntityEntityWritable() { + // do nothing + } + + /** Creates an instance holding the given pair of IDs. */ public EntityEntityWritable(long aID, long bID) { + this.aID = aID; + this.bID = bID; + } + + /** Returns the first ID; package-private. */ long getAID() { + return aID; + } + + /** Returns the second ID; package-private. */ long getBID() { + return bID; + } + + /** Writes {@code aID} then {@code bID}, each through {@link Varint#writeSignedVarLong}. */ @Override + public void write(DataOutput out) throws IOException { + Varint.writeSignedVarLong(aID, out); + Varint.writeSignedVarLong(bID, out); + } + + /** Reads {@code aID} then {@code bID}, mirroring the order used by {@link #write(DataOutput)}. */ @Override + public void readFields(DataInput in) throws IOException { + aID = Varint.readSignedVarLong(in); + bID = Varint.readSignedVarLong(in); + } + + /** Orders by {@code aID}; ties are broken by {@code bID}. */ @Override + public int compareTo(EntityEntityWritable that) { + int aCompare = compare(aID, that.getAID()); + return aCompare == 0 ? compare(bID, that.getBID()) : aCompare; + } + + /** Three-way comparison yielding -1, 0 or 1; uses relational operators rather than subtraction. NOTE(review): same sign as {@code Long.compare(a, b)}. */ private static int compare(long a, long b) { + return a < b ? -1 : a > b ?
1 : 0; + } + + /** Combines the hashes of both IDs, the same fields that {@link #equals(Object)} compares. */ @Override + public int hashCode() { + return Longs.hashCode(aID) + 31 * Longs.hashCode(bID); + } + + /** Equal only to another {@code EntityEntityWritable} with the same {@code aID} and {@code bID}. */ @Override + public boolean equals(Object o) { + if (o instanceof EntityEntityWritable) { + EntityEntityWritable that = (EntityEntityWritable) o; + return aID == that.getAID() && bID == that.getBID(); + } + return false; + } + + /** Returns the two IDs separated by a tab character. */ @Override + public String toString() { + return aID + "\t" + bID; + } + + /** Returns a copy built through the two-argument constructor; {@code super.clone()} is not called. */ @Override + public EntityEntityWritable clone() { + return new EntityEntityWritable(aID, bID); + } + +} + http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java new file mode 100644 index 0000000..2aab63c --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/EntityPrefWritable.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.mahout.cf.taste.hadoop; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.mahout.common.RandomUtils; +import org.apache.mahout.math.VarLongWritable; + +/** A {@link org.apache.hadoop.io.Writable} encapsulating an item ID and a preference value. The ID is the {@code long} held by the {@link VarLongWritable} superclass. */ +public final class EntityPrefWritable extends VarLongWritable implements Cloneable { + + /** Preference value associated with the ID. */ private float prefValue; + + /** Creates an instance without setting the ID or preference value here. */ public EntityPrefWritable() { + // do nothing + } + + /** Creates an instance for the given ID and preference value. */ public EntityPrefWritable(long itemID, float prefValue) { + super(itemID); + this.prefValue = prefValue; + } + + /** Copy constructor: takes the ID and preference value from {@code other}. */ public EntityPrefWritable(EntityPrefWritable other) { + this(other.get(), other.getPrefValue()); + } + + /** Returns the ID; same value as the inherited {@code get()}. */ public long getID() { + return get(); + } + + /** Returns the preference value. */ public float getPrefValue() { + return prefValue; + } + + /** Writes the ID through the superclass, then the preference value with {@link DataOutput#writeFloat(float)}. */ @Override + public void write(DataOutput out) throws IOException { + super.write(out); + out.writeFloat(prefValue); + } + + /** Reads the ID through the superclass, then the preference value, mirroring {@link #write(DataOutput)}. */ @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + prefValue = in.readFloat(); + } + + /** XORs the superclass hash with a hash of the preference value. */ @Override + public int hashCode() { + return super.hashCode() ^ RandomUtils.hashFloat(prefValue); + } + + /** Equal only to another {@code EntityPrefWritable} with the same ID and preference value. NOTE(review): the float comparison uses ==, so an instance whose preference value is NaN is not equal even to itself. */ @Override + public boolean equals(Object o) { + if (!(o instanceof EntityPrefWritable)) { + return false; + } + EntityPrefWritable other = (EntityPrefWritable) o; + return get() == other.get() && prefValue == other.getPrefValue(); + } + + /** Returns the ID and the preference value separated by a tab character. */ @Override + public String toString() { + return get() + "\t" + prefValue; + } + + /** Returns a copy built through the two-argument constructor; {@code super.clone()} is not called. */ @Override + public EntityPrefWritable clone() { + return new EntityPrefWritable(get(), prefValue); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java
b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java new file mode 100644 index 0000000..3de272d --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/MutableRecommendedItem.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.hadoop; + +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.common.RandomUtils; + +/** + * Mutable variant of {@link RecommendedItem}: item ID and value can be changed after construction. + */ +public class MutableRecommendedItem implements RecommendedItem { + + /** ID of the recommended item. */ private long itemID; + /** Value reported by {@link #getValue()}. */ private float value; + + /** Creates an instance with item ID 0 and value 0, the field defaults. */ public MutableRecommendedItem() {} + + /** Creates an instance with the given item ID and value. */ public MutableRecommendedItem(long itemID, float value) { + this.itemID = itemID; + this.value = value; + } + + @Override + public long getItemID() { + return itemID; + } + + @Override + public float getValue() { + return value; + } + + /** Replaces the item ID only; the value is left unchanged. */ public void setItemID(long itemID) { + this.itemID = itemID; + } + + /** Replaces both the item ID and the value. */ public void set(long itemID, float value) { + this.itemID = itemID; + this.value = value; + } + + /** Lowers the value to {@code maxValue} if it currently exceeds it; otherwise leaves it unchanged. */ public void capToMaxValue(float maxValue) { + if (value > maxValue) { + value = maxValue; + } + } + + @Override + public String toString() { + return "MutableRecommendedItem[item:" + itemID + ", value:" + value + ']'; + } + + /** XORs the low 32 bits of the item ID (the {@code (int)} cast discards the high bits) with a hash of the value. */ @Override + public int hashCode() { + return (int) itemID ^ RandomUtils.hashFloat(value); + } + + /** Equal only to another {@code MutableRecommendedItem} with the same item ID and value; other {@link RecommendedItem} implementations never compare equal. */ @Override + public boolean equals(Object o) { + if (!(o instanceof MutableRecommendedItem)) { + return false; + } + RecommendedItem other = (RecommendedItem) o; + return itemID == other.getItemID() && value == other.getValue(); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java new file mode 100644 index 0000000..bc832aa --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommendedItemsWritable.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.hadoop; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.hadoop.io.Writable; +import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; +import org.apache.mahout.math.Varint; + +/** + * A {@link Writable} which encapsulates a list of {@link RecommendedItem}s. This is the mapper (and reducer) + * output, and represents items recommended to a user. The first item is the one whose estimated preference is + * highest. 
+ */ +public final class RecommendedItemsWritable implements Writable { + + /** The wrapped list; null after the no-arg constructor until {@link #set(List)} or {@link #readFields(DataInput)} assigns it. */ private List<RecommendedItem> recommended; + + /** Creates an instance with no list assigned yet. */ public RecommendedItemsWritable() { + // do nothing + } + + /** Wraps the given list; the reference is stored as-is, without copying. */ public RecommendedItemsWritable(List<RecommendedItem> recommended) { + this.recommended = recommended; + } + + /** Returns the stored list reference itself, not a copy. */ public List<RecommendedItem> getRecommendedItems() { + return recommended; + } + + /** Replaces the stored list reference, without copying. */ public void set(List<RecommendedItem> recommended) { + this.recommended = recommended; + } + + /** Writes the item count as an int, then for each item its ID as a signed var-long followed by its value as a float. NOTE(review): dereferences the list without a null check. */ @Override + public void write(DataOutput out) throws IOException { + out.writeInt(recommended.size()); + for (RecommendedItem item : recommended) { + Varint.writeSignedVarLong(item.getItemID(), out); + out.writeFloat(item.getValue()); + } + } + + /** Replaces the list with a new {@link ArrayList} of {@link GenericRecommendedItem}s read in the layout produced by {@link #write(DataOutput)}. */ @Override + public void readFields(DataInput in) throws IOException { + int size = in.readInt(); + recommended = new ArrayList<>(size); + for (int i = 0; i < size; i++) { + long itemID = Varint.readSignedVarLong(in); + float value = in.readFloat(); + RecommendedItem recommendedItem = new GenericRecommendedItem(itemID, value); + recommended.add(recommendedItem); + } + } + + /** Formats the list as {@code [id:value,id:value,...]}. NOTE(review): iterates the list without a null check. */ @Override + public String toString() { + StringBuilder result = new StringBuilder(200); + result.append('['); + boolean first = true; + for (RecommendedItem item : recommended) { + if (first) { + first = false; + } else { + result.append(','); + } + result.append(String.valueOf(item.getItemID())); + result.append(':'); + result.append(String.valueOf(item.getValue())); + } + result.append(']'); + return result.toString(); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java new file mode 100644 index 0000000..e3fab29 ---
/dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TasteHadoopUtils.java @@ -0,0 +1,84 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.hadoop; + +import com.google.common.primitives.Longs; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.PathFilters; +import org.apache.mahout.common.iterator.sequencefile.PathType; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; +import org.apache.mahout.math.VarIntWritable; +import org.apache.mahout.math.VarLongWritable; +import org.apache.mahout.math.map.OpenIntLongHashMap; + +import java.util.regex.Pattern; + +/** + * Some helper methods for the hadoop-related stuff in org.apache.mahout.cf.taste + */ +public final class TasteHadoopUtils { + + public static final int USER_ID_POS = 0; + public static final int ITEM_ID_POS = 1; + + /** Standard delimiter of textual preference data */ + private static final Pattern PREFERENCE_TOKEN_DELIMITER = Pattern.compile("[\t,]"); + + private TasteHadoopUtils() {} + + /** + * Splits a preference data line into string 
tokens + */ + public static String[] splitPrefTokens(CharSequence line) { + return PREFERENCE_TOKEN_DELIMITER.split(line); + } + + /** + * Maps a long to an int with range of 0 to Integer.MAX_VALUE-1 + */ + public static int idToIndex(long id) { + return 0x7FFFFFFF & Longs.hashCode(id) % 0x7FFFFFFE; + } + + public static int readID(String token, boolean usesLongIDs) { + return usesLongIDs ? idToIndex(Long.parseLong(token)) : Integer.parseInt(token); + } + + /** + * Reads a binary mapping file + */ + public static OpenIntLongHashMap readIDIndexMap(String idIndexPathStr, Configuration conf) { + OpenIntLongHashMap indexIDMap = new OpenIntLongHashMap(); + Path itemIDIndexPath = new Path(idIndexPathStr); + for (Pair<VarIntWritable,VarLongWritable> record + : new SequenceFileDirIterable<VarIntWritable,VarLongWritable>(itemIDIndexPath, + PathType.LIST, + PathFilters.partFilter(), + null, + true, + conf)) { + indexIDMap.put(record.getFirst().get(), record.getSecond().get()); + } + return indexIDMap; + } + + + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java new file mode 100644 index 0000000..fdb552e --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToEntityPrefsMapper.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.mahout.cf.taste.hadoop.item.RecommenderJob; +import org.apache.mahout.math.VarLongWritable; + +import java.io.IOException; +import java.util.regex.Pattern; + +public abstract class ToEntityPrefsMapper extends + Mapper<LongWritable,Text, VarLongWritable,VarLongWritable> { + + public static final String TRANSPOSE_USER_ITEM = ToEntityPrefsMapper.class + "transposeUserItem"; + public static final String RATING_SHIFT = ToEntityPrefsMapper.class + "shiftRatings"; + + private static final Pattern DELIMITER = Pattern.compile("[\t,]"); + + private boolean booleanData; + private boolean transpose; + private final boolean itemKey; + private float ratingShift; + + ToEntityPrefsMapper(boolean itemKey) { + this.itemKey = itemKey; + } + + @Override + protected void setup(Context context) { + Configuration jobConf = context.getConfiguration(); + booleanData = jobConf.getBoolean(RecommenderJob.BOOLEAN_DATA, false); + transpose = jobConf.getBoolean(TRANSPOSE_USER_ITEM, false); + ratingShift = Float.parseFloat(jobConf.get(RATING_SHIFT, "0.0")); + } + + @Override + public void map(LongWritable key, + Text value, + Context context) throws IOException, InterruptedException { 
+ String[] tokens = DELIMITER.split(value.toString()); + long userID = Long.parseLong(tokens[0]); + long itemID = Long.parseLong(tokens[1]); + if (itemKey ^ transpose) { + // If using items as keys, and not transposing items and users, then users are items! + // Or if not using items as keys (users are, as usual), but transposing items and users, + // then users are items! Confused? + long temp = userID; + userID = itemID; + itemID = temp; + } + if (booleanData) { + context.write(new VarLongWritable(userID), new VarLongWritable(itemID)); + } else { + float prefValue = tokens.length > 2 ? Float.parseFloat(tokens[2]) + ratingShift : 1.0f; + context.write(new VarLongWritable(userID), new EntityPrefWritable(itemID, prefValue)); + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java new file mode 100644 index 0000000..f5f9574 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/ToItemPrefsMapper.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.hadoop; + +/** + * <h1>Input</h1> + * + * <p> + * Intended for use with {@link org.apache.hadoop.mapreduce.lib.input.TextInputFormat}; + * accepts line number / line pairs as + * {@link org.apache.hadoop.io.LongWritable}/{@link org.apache.hadoop.io.Text} pairs. + * </p> + * + * <p> + * Each line is assumed to be of the form {@code userID,itemID,preference}, or {@code userID,itemID}. + * </p> + * + * <h1>Output</h1> + * + * <p> + * Outputs the user ID as a {@link org.apache.mahout.math.VarLongWritable} mapped to the item ID and preference as a + * {@link EntityPrefWritable}. + * </p> + */ +public final class ToItemPrefsMapper extends ToEntityPrefsMapper { + + public ToItemPrefsMapper() { + super(false); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java new file mode 100644 index 0000000..8f563b0 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/TopItemsQueue.java @@ -0,0 +1,60 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.hadoop; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.lucene.util.PriorityQueue; +import org.apache.mahout.cf.taste.recommender.RecommendedItem; + +public class TopItemsQueue extends PriorityQueue<MutableRecommendedItem> { + + private static final long SENTINEL_ID = Long.MIN_VALUE; + + private final int maxSize; + + public TopItemsQueue(int maxSize) { + super(maxSize); + this.maxSize = maxSize; + } + + public List<RecommendedItem> getTopItems() { + List<RecommendedItem> recommendedItems = new ArrayList<>(maxSize); + while (size() > 0) { + MutableRecommendedItem topItem = pop(); + // filter out "sentinel" objects necessary for maintaining an efficient priority queue + if (topItem.getItemID() != SENTINEL_ID) { + recommendedItems.add(topItem); + } + } + Collections.reverse(recommendedItems); + return recommendedItems; + } + + @Override + protected boolean lessThan(MutableRecommendedItem one, MutableRecommendedItem two) { + return one.getValue() < two.getValue(); + } + + @Override + protected MutableRecommendedItem getSentinelObject() { + return new MutableRecommendedItem(SENTINEL_ID, Float.MIN_VALUE); + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java 
---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java new file mode 100644 index 0000000..4bb95ae --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/hadoop/als/ALS.java @@ -0,0 +1,100 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.hadoop.als; + +import com.google.common.base.Preconditions; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.mahout.common.HadoopUtil; +import org.apache.mahout.common.Pair; +import org.apache.mahout.common.iterator.sequencefile.PathFilters; +import org.apache.mahout.common.iterator.sequencefile.PathType; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirIterable; +import org.apache.mahout.common.iterator.sequencefile.SequenceFileDirValueIterator; +import org.apache.mahout.math.Vector; +import org.apache.mahout.math.VectorWritable; +import org.apache.mahout.math.als.AlternatingLeastSquaresSolver; +import org.apache.mahout.math.map.OpenIntObjectHashMap; + +final class ALS { + + private ALS() {} + + static Vector readFirstRow(Path dir, Configuration conf) throws IOException { + Iterator<VectorWritable> iterator = new SequenceFileDirValueIterator<>(dir, PathType.LIST, + PathFilters.partFilter(), null, true, conf); + return iterator.hasNext() ? iterator.next().get() : null; + } + + public static OpenIntObjectHashMap<Vector> readMatrixByRowsFromDistributedCache(int numEntities, + Configuration conf) throws IOException { + + IntWritable rowIndex = new IntWritable(); + VectorWritable row = new VectorWritable(); + + + OpenIntObjectHashMap<Vector> featureMatrix = numEntities > 0 + ? 
new OpenIntObjectHashMap<Vector>(numEntities) : new OpenIntObjectHashMap<Vector>(); + + Path[] cachedFiles = HadoopUtil.getCachedFiles(conf); + LocalFileSystem localFs = FileSystem.getLocal(conf); + + for (Path cachedFile : cachedFiles) { + try (SequenceFile.Reader reader = new SequenceFile.Reader(localFs.getConf(), SequenceFile.Reader.file(cachedFile))) { + while (reader.next(rowIndex, row)) { + featureMatrix.put(rowIndex.get(), row.get()); + } + } + } + + Preconditions.checkState(!featureMatrix.isEmpty(), "Feature matrix is empty"); + return featureMatrix; + } + + public static OpenIntObjectHashMap<Vector> readMatrixByRows(Path dir, Configuration conf) { + OpenIntObjectHashMap<Vector> matrix = new OpenIntObjectHashMap<>(); + for (Pair<IntWritable,VectorWritable> pair + : new SequenceFileDirIterable<IntWritable,VectorWritable>(dir, PathType.LIST, PathFilters.partFilter(), conf)) { + int rowIndex = pair.getFirst().get(); + Vector row = pair.getSecond().get(); + matrix.put(rowIndex, row); + } + return matrix; + } + + public static Vector solveExplicit(VectorWritable ratingsWritable, OpenIntObjectHashMap<Vector> uOrM, + double lambda, int numFeatures) { + Vector ratings = ratingsWritable.get(); + + List<Vector> featureVectors = new ArrayList<>(ratings.getNumNondefaultElements()); + for (Vector.Element e : ratings.nonZeroes()) { + int index = e.index(); + featureVectors.add(uOrM.get(index)); + } + + return AlternatingLeastSquaresSolver.solve(featureVectors, ratings, lambda, numFeatures); + } +}
