Applied minor fixes to the previous commit
Project: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/commit/1db53587 Tree: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/tree/1db53587 Diff: http://git-wip-us.apache.org/repos/asf/incubator-hivemall/diff/1db53587 Branch: refs/heads/master Commit: 1db5358767bb30a8c433e4530c39d8591bc28a36 Parents: 1fbf90a Author: myui <[email protected]> Authored: Wed Jun 7 17:01:47 2017 +0900 Committer: myui <[email protected]> Committed: Wed Jun 7 17:01:47 2017 +0900 ---------------------------------------------------------------------- .../knn/similarity/DIMSUMMapperUDTF.java | 26 +++++++++----------- docs/gitbook/recommend/item_based_cf.md | 6 ++--- 2 files changed, 15 insertions(+), 17 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1db53587/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java b/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java index 73e218f..740b2da 100644 --- a/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java +++ b/core/src/main/java/hivemall/knn/similarity/DIMSUMMapperUDTF.java @@ -29,8 +29,6 @@ import hivemall.utils.lang.Primitives; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.Options; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; @@ -52,25 +50,24 @@ import java.util.Map; name = "dimsum_mapper", value = "_FUNC_(array<string> row, map<int col_id, double norm> colNorms [, const string options]) " + "- Returns column-wise partial similarities") 
-public class DIMSUMMapperUDTF extends UDTFWithOptions { - private static final Log logger = LogFactory.getLog(DIMSUMMapperUDTF.class); +public final class DIMSUMMapperUDTF extends UDTFWithOptions { - protected ListObjectInspector rowOI; - protected MapObjectInspector colNormsOI; + private ListObjectInspector rowOI; + private MapObjectInspector colNormsOI; @Nullable - protected Feature[] probes; + private Feature[] probes; @Nonnull - protected PRNG rnd; + private PRNG rnd; - protected double threshold; - protected double sqrtGamma; - protected boolean symmetricOutput; - protected boolean parseFeatureAsInt; + private double threshold; + private double sqrtGamma; + private boolean symmetricOutput; + private boolean parseFeatureAsInt; - protected Map<Object, Double> colNorms; - protected Map<Object, Double> colProbs; + private Map<Object, Double> colNorms; + private Map<Object, Double> colProbs; @Override protected Options getOptions() { @@ -158,6 +155,7 @@ public class DIMSUMMapperUDTF extends UDTFWithOptions { return ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs); } + @SuppressWarnings("unchecked") @Override public void process(Object[] args) throws HiveException { Feature[] row = parseFeatures(args[0]); http://git-wip-us.apache.org/repos/asf/incubator-hivemall/blob/1db53587/docs/gitbook/recommend/item_based_cf.md ---------------------------------------------------------------------- diff --git a/docs/gitbook/recommend/item_based_cf.md b/docs/gitbook/recommend/item_based_cf.md index 2b9097e..9515184 100644 --- a/docs/gitbook/recommend/item_based_cf.md +++ b/docs/gitbook/recommend/item_based_cf.md @@ -517,7 +517,7 @@ group by # Efficient similarity computation -Since naive similarity computation takes `O(n^2)` computational complexity, utilizing a certain approximation scheme is practically important to improve efficiency and feasibility. 
In particular, Hivemall enables you to use one of two sophisticated approximation schemes, [MinHash](https://en.wikipedia.org/wiki/MinHash#Jaccard_similarity_and_minimum_hash_values) and [DIMSUM](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html). +Since naive similarity computation takes `O(n^2)` computational complexity, utilizing a certain approximation scheme is practically important to improve efficiency and feasibility. In particular, Hivemall enables you to use one of two sophisticated approximation schemes, [MinHash](#minhash-compute-pseudo-jaccard-similarity) and [DIMSUM](#dimsum-approximated-all-pairs-cosine-similarity-computation). ## MinHash: Compute "pseudo" Jaccard similarity @@ -604,12 +604,12 @@ from topk; ``` -## DIMSUM: Approximated all-pairs similarity computation +## DIMSUM: Approximated all-pairs "Cosine" similarity computation > #### Note > This feature is supported from Hivemall v0.5-rc.1 or later. -DIMSUM is a technique to efficiently and approximately compute similarities for all-pairs of items. You can refer to [an article in Twitter's Engineering blog](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html) to learn how DIMSUM reduces running time. +DIMSUM is a technique to efficiently and approximately compute [Cosine similarities](https://en.wikipedia.org/wiki/Cosine_similarity) for all-pairs of items. You can refer to [an article in Twitter's Engineering blog](https://blog.twitter.com/engineering/en_us/a/2014/all-pairs-similarity-via-dimsum.html) to learn how DIMSUM reduces running time. Here, let us begin with the `user_purchased` table. `item_similarity` table can be obtained as follows:
