http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java index d01f9fb..f7caafd 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/L2.java @@ -22,12 +22,14 @@ package datafu.pig.hash.lsh.metric; import org.apache.commons.math.linear.RealVector; /** * A UDF used to find a vector v in a bag such that for query point q, metric m and threshold t - * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away. - * - * It returns one of the tuples of the bag of vectors using {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>}, + * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away. + * + * <p> + * It returns one of the tuples of the bag of vectors using <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L2 distance</a>, * distance between two vectors. This is otherwise known as * the Euclidean distance. - * + * </p> + * * @see datafu.pig.hash.lsh.L2PStableHash L2PStableHash for an example * @author cstella * @@ -37,7 +39,7 @@ public class L2 extends MetricUDF { /** * Create a new L2 Metric UDF with a given dimension. * - * @param sDim + * @param sDim dimension */ public L2(String sDim) { super(sDim);
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java index da00a60..cb6efbb 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/MetricUDF.java @@ -34,10 +34,12 @@ import datafu.pig.hash.lsh.util.DataTypeUtil; /** * A base UDF used to find a vector v in a bag such that for query point q, metric m and threshold t - * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away. - * - * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash. - * + * m(v,q) < t. In other words, find the first vector in the bag within a threshold distance away. + * + * <p> + * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash. + * </p> + * * @see datafu.pig.hash.lsh.CosineDistanceHash * @author cstella * @@ -49,7 +51,7 @@ public abstract class MetricUDF extends EvalFunc<Tuple> /** * Create a new Metric UDF with a given dimension. * - * @param sDim + * @param sDim dimension */ public MetricUDF(String sDim) { @@ -58,8 +60,8 @@ public abstract class MetricUDF extends EvalFunc<Tuple> /** * The distance metric used. Given v1 and v2, compute the distance between those vectors. 
- * @param v1 vector - * @param v2 vector + * @param v1 first vector + * @param v2 second vector * @return the distance between v1 and v2 */ protected abstract double dist(RealVector v1, RealVector v2); @@ -68,9 +70,11 @@ public abstract class MetricUDF extends EvalFunc<Tuple> * This UDF expects a query vector as the first element, a threshold (double) as the second, and a bag of vectors. * Vectors are represented by tuples with doubles as elements or bags of tuples representing position and value * in the case of sparse vectors. - * + * + * <p> * It returns one of the tuples of the bag of vectors. For an example of its use, please see datafu.pig.hash.lsh.CosineDistanceHash. - * + * </p> + * * @see datafu.pig.hash.lsh.CosineDistanceHash */ @Override @@ -109,10 +113,10 @@ public abstract class MetricUDF extends EvalFunc<Tuple> } return null; } - + /** * Create the output schema, based on the input schema. - * + * * @return the output schema, which is a tuple matching the schema of the third input field. */ public Schema outputSchema(Schema input) { @@ -120,15 +124,14 @@ public abstract class MetricUDF extends EvalFunc<Tuple> validateInputSchema(input); FieldSchema fieldSchema = input.getField(2); return fieldSchema.schema; - - }catch (Exception e){ + }catch (Exception e) { throw new RuntimeException("Unable to create output schema", e); } } - + /** * Validate the input schema to ensure that our input is consistent and that we fail fast. 
- * @param input + * @param input input schema * @throws FrontendException */ private void validateInputSchema(Schema input) throws FrontendException @@ -140,18 +143,18 @@ public abstract class MetricUDF extends EvalFunc<Tuple> throw new FrontendException("Invalid vector element: Expected either a tuple or a bag, but found " + vectorSchema); } } - + { FieldSchema distanceSchema = input.getField(1); - if(distanceSchema.type != DataType.DOUBLE - && distanceSchema.type != DataType.INTEGER - && distanceSchema.type != DataType.LONG + if(distanceSchema.type != DataType.DOUBLE + && distanceSchema.type != DataType.INTEGER + && distanceSchema.type != DataType.LONG ) { throw new FrontendException("Invalid distance element: Expected a number, but found " + distanceSchema); } } - + { FieldSchema pointsSchema = input.getField(2); if( pointsSchema.type != DataType.BAG) @@ -166,5 +169,4 @@ public abstract class MetricUDF extends EvalFunc<Tuple> } } } - } http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java index df8dbc7..ce48880 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/metric/package-info.java @@ -18,7 +18,7 @@ */ /** - * UDFs for different {@link <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a>} (and some similarity functions) + * UDFs for different <a href="http://en.wikipedia.org/wiki/Metric_(mathematics)" target="_blank">distance functions</a> (and some similarity functions) * used with Locality Sensitive Hashing. 
*/ package datafu.pig.hash.lsh.metric; http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java index 0f3ba94..21b7306 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/AbstractStableDistributionFunction.java @@ -29,22 +29,25 @@ import datafu.pig.hash.lsh.interfaces.Sampler; /** * This is the base-class for all p-stable based locality sensitive hashes. p-stable locality sensitive - * hashes are defined by a few parameters: a dimension, d , a vector taken from a - * {@link <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a>} + * hashes are defined by a few parameters: a dimension, d , a vector taken from a + * <a href="http://en.wikipedia.org/wiki/Stable_distribution" target="_blank">k-stable distribution</a> * (where k is 1 or 2) and a width of projection, w. + * * <p> * All p-stable LSH functions are parameterized with a quantization parameter (w or r in * the literature , depending on where you look). Consider the following excerpt * from Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". * Proceedings of the Symposium on Computational Geometry. - * - * <pre> - * Decreasing the width of the projection (w) decreases the probability of collision for any two points. + * </p> + * + * <p> + * Decreasing the width of the projection (w) decreases the probability of collision for any two points. * Thus, it has the same effect as increasing k . 
As a result, we would like to set w as small as possible * and in this way decrease the number of projections we need to make. - * </pre> - * + * </p> + * + * <p> * In the literature, the quantization parameter (or width of the projection) is * found empirically given a sample of the data and the likely threshold for * the metric. Tuning this parameter is very important for the performance of @@ -52,24 +55,23 @@ import datafu.pig.hash.lsh.interfaces.Sampler; * P.; Mirrokni, V.S. (2004). * "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". * Proceedings of the Symposium on Computational Geometry. - * + * </p> + * * @author cstella - * + * */ public abstract class AbstractStableDistributionFunction extends LSH { - private double[] a; private double b; double w; - /** * Constructs a new instance. * @param dim The dimension of the vectors to be hashed * @param w A double representing the quantization parameter (also known as the projection width) - * @param rand The random generator used - * @throws MathException + * @param rand The random generator used + * @throws MathException MathException */ public AbstractStableDistributionFunction(int dim, double w, RandomGenerator rand) throws MathException { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java index 79bf7e5..5ac2bfc 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L1LSH.java @@ -28,7 +28,7 @@ import datafu.pig.hash.lsh.interfaces.Sampler; /** * A locality sensitive hash associated with the L1 metric. This uses a 1-stable distribution * to construct the hash. 
- * + * * @author cstella * */ @@ -36,25 +36,29 @@ public class L1LSH extends AbstractStableDistributionFunction implements Sampler { /** * Constructs a new instance. - * @throws MathException + * + * @param dim The dimension of the vectors to be hashed + * @param w A double representing the quantization parameter (also known as the projection width) + * @param rand The random generator used + * @throws MathException MathException */ - public L1LSH(int dim, double d, RandomGenerator rand) throws MathException { - super(dim, d, rand); + public L1LSH(int dim, double w, RandomGenerator rand) throws MathException { + super(dim, w, rand); } /** * Draw a sample s ~ Cauchy(0,1), which is 1-stable. - * + * + * @param randomData random data generator * @return a sample from a cauchy distribution with median 0 and scale 1 + * @throws MathException MathException */ public double sample(RandomDataImpl randomData) throws MathException { - return randomData.nextCauchy(0, 1); - } + @Override protected Sampler getSampler() { return this; } - } http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java index d18b189..95a487a 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/L2LSH.java @@ -28,15 +28,17 @@ import datafu.pig.hash.lsh.interfaces.Sampler; /** * A locality sensitive hash associated with the L2 metric. This uses a 2-stable distribution * to construct the hash. - * - * @author cstella * + * @author cstella */ public class L2LSH extends AbstractStableDistributionFunction implements Sampler { /** * Constructs a new instance. 
- * @throws MathException + * @param dim the dimension of the vectors to be hashed + * @param w a double representing the quantization parameter (also known as the projection width) + * @param rand the random generator + * @throws MathException MathException */ public L2LSH(int dim, double w, RandomGenerator rand) throws MathException { super(dim, w, rand); @@ -44,7 +46,8 @@ public class L2LSH extends AbstractStableDistributionFunction implements Sampler /** * Draw a sample s ~ Gaussian(0,1), which is 2-stable. - * + * + * @param randomData random data generator * @return a sample from a Gaussian distribution with mu of 0 and sigma of 1 */ public double sample(RandomDataImpl randomData) http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java index ec9c313..7d7bb65 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/p_stable/package-info.java @@ -18,8 +18,8 @@ */ /** - * Implementation of {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} for - * {@link <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>}. + * Implementation of <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a> for + * <a href="http://en.wikipedia.org/wiki/Lp_space" target="_blank">L1 and L2 metrics</a>. * * See Datar, M.; Immorlica, N.; Indyk, P.; Mirrokni, V.S. (2004). "Locality-Sensitive Hashing Scheme Based on p-Stable Distributions". Proceedings of the Symposium on Computational Geometry. 
* http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java index 045ed0d..912d72e 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/package-info.java @@ -18,6 +18,6 @@ */ /** - * UDFs for {@link <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>} + * UDFs for <a href="http://en.wikipedia.org/wiki/Locality-sensitive_hashing" target="_blank">Locality Sensitive Hashing</a>. */ package datafu.pig.hash.lsh; http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java index da30179..6f09941 100644 --- a/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java +++ b/datafu-pig/src/main/java/datafu/pig/hash/lsh/util/DataTypeUtil.java @@ -40,15 +40,17 @@ public enum DataTypeUtil { /** * Convert a tuple t into a RealVector of dimension dim. * The tuple can be of a couple of forms: + * * <ul> * <li>A tuple composed of dim numeric types a la (1.0,2.0,3,5.0)</li> * <li>A tuple which contains as its first element a tuple like above a la ( (1.0,2.0,3,5.0), 5) ) would yield (1.0,2.0,3,5.0)</li> * <li>A bag containing tuples where the first element is the position and the second element is the value. 
This is for sparse vectors and it looks like this ( { (0,1.0), (1, 2.0), (3,3), (4,5.0) } ).</li> * </ul> + * * @param t The tuple to convert to a vector * @param dim The dimension of the vector * @return The actual RealVector (which may or may not be sparse) - * @throws PigException + * @throws PigException PigException */ public RealVector convert(Tuple t, int dim) throws PigException { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java index 80ff567..423f890 100644 --- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java +++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRank.java @@ -39,39 +39,39 @@ import org.apache.pig.impl.logicalLayer.FrontendException; import org.apache.pig.impl.logicalLayer.schema.Schema; /** - * A UDF which implements {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}. - * - * <p> - * This is not a distributed implementation. Each graph is stored in memory while running the algorithm, with edges optionally + * A UDF which implements <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>. + * + * <p> + * This is not a distributed implementation. Each graph is stored in memory while running the algorithm, with edges optionally * spilled to disk to conserve memory. This can be used to distribute the execution of PageRank on multiple * reasonably sized graphs. It does not distribute execuion of PageRank for each individual graph. Each graph is identified * by an integer valued topic ID. 
* </p> - * + * * <p> * If the graph is too large to fit in memory than an alternative method must be used, such as an iterative approach which runs * many MapReduce jobs in a sequence to complete the PageRank iterations. * </p> - * + * * <p> * Each graph is represented through a bag of (source,edges) tuples. The 'source' is an integer ID representing the source node. * The 'edges' are the outgoing edges from the source node, represented as a bag of (dest,weight) tuples. The 'dest' is an * integer ID representing the destination node. The weight is a double representing how much the edge should be weighted. * For a standard PageRank implementation just use weight of 1.0. * </p> - * + * * <p> * The output of the UDF is a bag of (source,rank) pairs, where 'rank' is the PageRank value for that source in the graph. * </p> - * + * * <p> * There are several configurable options for this UDF, among them: - * <p> - * + * </p> + * * <ul> * <li> * <b>alpha</b>: Controls the PageRank alpha value. The default is 0.85. A higher value reduces the "random jump" - * factor and causes the rank to be influenced more by edges. + * factor and causes the rank to be influenced more by edges. * </li> * <li> * <b>max_iters</b>: The maximum number of iterations to run. The default is 150. @@ -83,8 +83,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * <li> * <b>tolerance</b>: A threshold which causes iterations to cease. It is measured from the total change in ranks from each of * the nodes in the graph. As the ranks settle on their final values the total change decreases. This can be used - * to stop iterations early. The default is 1e-16. - * </li> + * to stop iterations early. The default is 1e-16. + * </li> * <li> * <b>max_nodes_and_edges</b>: This is a control to prevent running out of memory. As a graph is loaded, if the sum of edges * and nodes exceeds this value then it will stop. It will not fail but PageRank will not be run on this graph. 
Instead a null @@ -92,52 +92,48 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * </li> * <li> * <b>spill_to_edge_disk_storage</b>: Used to conserve memory. When "true" it causes the edge data to be written to disk in a temp file instead - * of being held in memory when the number of edges exceeds a threshold. The nodes are still held in memory however. + * of being held in memory when the number of edges exceeds a threshold. The nodes are still held in memory however. * Each iteration of PageRank will stream through the edges stored on disk. The default is "false". * </li> * <li> * <b>max_edges_in_memory</b>: When spilling edges to disk is enabled, this is the threshold which triggers that behavior. The default is 30M. * </li> * </ul> - * + * * <p> * Parameters are configured by passing them in as a sequence of pairs into the UDF constructor. For example, below the alpha value is set to * 0.87 and dangling nodes are enabled. All arguments must be strings. * </p> - * - * <p> + * * <pre> * {@code * define PageRank datafu.pig.linkanalysis.PageRank('alpha','0.87','dangling_nodes','true'); * } * </pre> - * </p> - * - * <p> + * * Full example: * <pre> * {@code - * + * * topic_edges = LOAD 'input_edges' as (topic:INT,source:INT,dest:INT,weight:DOUBLE); - * + * * topic_edges_grouped = GROUP topic_edges by (topic, source) ; * topic_edges_grouped = FOREACH topic_edges_grouped GENERATE * group.topic as topic, * group.source as source, * topic_edges.(dest,weight) as edges; - * - * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic; - * + * + * topic_edges_grouped_by_topic = GROUP topic_edges_grouped BY topic; + * * topic_ranks = FOREACH topic_edges_grouped_by_topic GENERATE * group as topic, * FLATTEN(PageRank(topic_edges_grouped.(source,edges))) as (source,rank); * * topic_ranks = FOREACH topic_ranks GENERATE * topic, source, rank; - * + * * } * </pre> - * </p> */ public class PageRank extends AccumulatorEvalFunc<DataBag> { 
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java index 5d0b932..7994b0e 100644 --- a/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java +++ b/datafu-pig/src/main/java/datafu/pig/linkanalysis/PageRankImpl.java @@ -39,9 +39,8 @@ import java.util.Map; import com.google.common.collect.AbstractIterator; /** - * An implementation of {@link <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>}, used by the {@link PageRank} UDF. - * It is not intended to be used directly. - * </p> + * An implementation of <a href="http://en.wikipedia.org/wiki/PageRank" target="_blank">PageRank</a>, used by the {@link PageRank} UDF. + * It is not intended to be used directly. 
*/ public class PageRankImpl { @@ -104,7 +103,7 @@ public class PageRankImpl /** * Sets the page rank alpha value (default is 0.85); - * @param alpha + * @param alpha page rank alpha value */ public void setAlpha(float alpha) { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java index de89c4a..ec8375a 100644 --- a/datafu-pig/src/main/java/datafu/pig/random/RandInt.java +++ b/datafu-pig/src/main/java/datafu/pig/random/RandInt.java @@ -35,10 +35,12 @@ import datafu.pig.util.SimpleEvalFunc; public class RandInt extends SimpleEvalFunc<Integer> { private final Random rand = new Random(); - + /** * @param min lower bound for random number * @param max upper bound for random number + * @return random integer between min and max + * @throws IOException IOException */ public Integer call(Integer min, Integer max) throws IOException { @@ -60,6 +62,5 @@ public class RandInt extends SimpleEvalFunc<Integer> { return new Schema(new Schema.FieldSchema("rand", DataType.INTEGER)); } - } http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java index 90ea576..d94a038 100644 --- a/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java +++ b/datafu-pig/src/main/java/datafu/pig/sampling/SampleByKey.java @@ -30,24 +30,24 @@ import org.apache.pig.data.Tuple; * This is essentially equivalent to grouping on the fields, applying SAMPLE, * and then flattening. It is much more efficient though because it does not require * a reduce step. 
- * + * * <p> * The method of sampling is to convert the key to a hash, derive a double value * from this, and then test this against a supplied probability. The double value * derived from a key is uniformly distributed between 0 and 1. * </p> - * + * * <p> * The only required parameter is the sampling probability. This may be followed * by an optional seed value to control the random number generation. * </p> - * + * * <p> * SampleByKey will work deterministically as long as the same seed is provided. * </p> - * - * <p> + * * Example: + * * <pre> * {@code * DEFINE SampleByKey datafu.pig.sampling.SampleByKey('0.5'); @@ -58,12 +58,10 @@ import org.apache.pig.data.Tuple; * output = FILTER data BY SampleByKey(A_id); * * --output: (B,1), (B,3) - * } - * + * } * </pre> - * </p> - * @author evion - * + * + * @author evion */ public class SampleByKey extends FilterFunc http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java index 8e8debf..ee2e796 100644 --- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java +++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSample.java @@ -35,57 +35,69 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; /** * Scalable simple random sampling (ScaSRS). - * <p/> + * + * <p> * This UDF implements a scalable simple random sampling algorithm described in - * + * </p> + * * <pre> * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013. * </pre> - * + * + * <p> * It takes a bag of n items and a sampling probability p as the inputs, and outputs a * simple random sample of size exactly ceil(p*n) in a bag, with probability at least * 99.99%. 
For example, the following script generates a simple random sample with * sampling probability 0.1: - * + * </p> + * * <pre> * DEFINE SRS datafu.pig.sampling.SimpleRandomSample(); - * - * item = LOAD 'input' AS (x:double); + * + * item = LOAD 'input' AS (x:double); * sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01)); * </pre> - * + * + * <p> * Optionally, user can provide a good lower bound of n as the third argument to help * reduce the size of intermediate data, for example: - * + * </p> + * * <pre> * DEFINE SRS datafu.pig.sampling.SimpleRandomSample(); - * - * item = LOAD 'input' AS (x:double); + * + * item = LOAD 'input' AS (x:double); * summary = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count; * sampled = FOREACH (GROUP item ALL) GENERATE FLATTEN(SRS(item, 0.01, summary.count)); * </pre> - * + * + * <p> * This UDF is very useful for stratified sampling. For example, the following script * keeps all positive examples while downsampling negatives with probability 0.1: - * + * </p> + * * <pre> * DEFINE SRS datafu.pig.sampling.SimpleRandomSample(); - * + * * item = LOAD 'input' AS (x:double, label:int); - * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p; + * grouped = FOREACH (GROUP item BY label) GENERATE item, (group == 1 ? 1.0 : 0.1) AS p; * sampled = FOREACH grouped GENERATE FLATTEN(SRS(item, p)); * </pre> - * + * + * <p> * In a Java Hadoop MapReduce job, we can output selected items directly using * MultipleOutputs. However, this feature is not available in a Pig UDF. So we still let * selected items go through the sort phase. However, as long as the sample size is not * huge, this should not be a big problem. - * - * In the first version, the sampling probability is specified in the constructor. This + * </p> + * + * <p> + * In the first version, the sampling probability is specified in the constructor. This * method is deprecated now and will be removed in the next release. 
- * + * </p> + * * @author ximeng - * + * */ public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag> { @@ -104,7 +116,8 @@ public class SimpleRandomSample extends AlgebraicEvalFunc<DataBag> /** * Constructs this UDF with a sampling probability. - * + * + * @param samplingProbability sampling probability * @deprecated Should specify the sampling probability in the function call. */ @Deprecated http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java index 598e58c..0ae5950 100644 --- a/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java +++ b/datafu-pig/src/main/java/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.java @@ -40,11 +40,14 @@ import com.google.common.primitives.Ints; /** * Scalable simple random sampling with replacement (ScaSRSWR). - * <p/> + * + * <p> * This UDF together with {@link SimpleRandomSampleWithReplacementElect} implement a * scalable algorithm for simple random sampling with replacement (SRSWR), which is a * randomized algorithm with a failure rate less than {@value #FAILURE_RATE}. - * <p/> + * </p> + * + * <p> * Let s be the desired sample size. To compute an SRSWR sample of size s, for each output * position in {0, 1, ..., s-1}, we want to select an item from the population uniformly * at random. This algorithm consists of two stages: vote and election. In the vote stage, @@ -52,7 +55,9 @@ import com.google.common.primitives.Ints; * for each position. In the election stage, the paired UDF * {@link SimpleRandomSampleWithReplacementElect} elects one candidate for each position. 
* The algorithm succeeds if we have at least one candidate for each position. - * <p/> + * </p> + * + * <p> * To use this UDF pair, user needs to provide: 1) the desired sample size, 2) a good * lower bound of the population size or the exact size. The input to the vote UDF * {@link SimpleRandomSampleWithReplacementVote} is a tuple that consists of a bag of @@ -62,47 +67,59 @@ import com.google.common.primitives.Ints; * elect UDF {@link SimpleRandomSampleWithReplacementElect} is a tuple that contains all * candidates voted by the vote UDF for some positions. The output from the elect UDF is a * bag of sampled items. - * <p/> + * </p> + * + * <p> * For example, the following script generates a sample of size 100000 with replacement: - * + * </p> + * * <pre> * DEFINE SRSWR_VOTE datafu.pig.sampling.SimpleRandomSampleWithReplacementVote(); * DEFINE SRSWR_ELECT datafu.pig.sampling.SimpleRandomSampleWithReplacementElect(); - * - * item = LOAD 'input' AS (x:double); + * + * item = LOAD 'input' AS (x:double); * summary = FOREACH (GROUP item ALL) GENERATE COUNT(item) AS count; * candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), 100000, summary.count)); * sampled = FOREACH (GROUP candidates BY position PARALLEL 10) GENERATE FLATTEN(SRSWR_ELECT(candidates)); * </pre> - * + * + * <p> * Because for election we only need to group candidates voted for the same position, this * algorithm can use many reducers to consume the candidates. See the "PARALLEL 10" * statement above. If the item to sample is the entire row, use TOBAG(TOTUPLE(*)). - * <p/> + * </p> + * + * <p> * SRSWR is heavily used in bootstrapping. Bootstrapping can be done easily with this UDF * pair. For example, the following script generates 100 bootstrap samples, computes the * mean value for each sample, and then outputs the bootstrap estimates. 
- * + * </p> + * * <pre> * summary = FOREACH (GROUP item ALL) GENERATE AVG(item.x) AS mean, COUNT(item) AS count; * candidates = FOREACH item GENERATE FLATTEN(SRSWR_VOTE(TOBAG(x), summary.count*100, summary.count)); * sampled = FOREACH (GROUP candidates BY (position % 100) PARALLEL 10) GENERATE AVG(SRSWR_ELECT(candidates)) AS mean; * bootstrap = FOREACH (GROUP sampled ALL) GENERATE summary.mean AS mean, sampled.mean AS bootstrapMeans; * </pre> - * + * + * <p> * Another usage of this UDF pair is to generate random pairs or tuples without computing * the cross product, where each pair or tuple consist of items from different input * sources. Let s be the number of random tuples we want to generate. For each input * source, simply use the vote UDF to propose candidates, then join the candidates from * different sources by their positions and for each position use the elect UDF to select * one candidate from each source to form the pair or tuple for that position. - * <p/> + * </p> + * + * <p> * The algorithm is a simple extension to the work - * + * </p> + * * <pre> * X. Meng, Scalable Simple Random Sampling and Stratified Sampling, ICML 2013. * </pre> - * + * + * <p> * Basically, for each output position, it performs a random sort on the population * (associates each item with a random score independently drawn from the uniform * distribution and then sorts items based on the scores), and picks the one that has the @@ -110,34 +127,41 @@ import com.google.common.primitives.Ints; * population. For example, if the population size is one billion and the random score * generated for an item is 0.9, very likely it won't become the smallest and hence we do * not need to propose it as a candidate. - * <p/> + * </p> + * + * <p> * More precisely, let n be the population size, n1 be a good lower bound of n, s be the * sample size, delta be the failure rate, and q be the threshold. For each output * position the probability of all random scores being greater than q is (1-q)^n. 
Thus, if * we throw away items with associated scores greater than q, with probability at least 1 * - s*(1-q)^n, we can still capture the item with the smallest score for each position. * Fix delta = s*(1-q)^n and solve for q, we get q = 1-exp(log(delta/s)/n), Note that - * replacing n by n1 < n can only decrease the failure rate, though at the cost of + * replacing n by n1 < n can only decrease the failure rate, though at the cost of * increased number of candidates. The expected number of candidates is (1 - * exp(log(delta/s)/n1)*s*n. When n1 equals n, this number is approximately * s*log(s/delta). - * <p/> + * </p> + * + * <p> * Generating a random score for each (item, position) pair is very expensive and * unnecessary. For each item, the number of positions for which it gets voted follows a * binomial distribution B(s,q). We can simply draw a number from this distribution, * determine the positions by sampling without replacement, and then generate random * scores for those positions. This reduces the running time significantly. - * <p/> + * </p> + * + * <p> * Since for each position we only need the candidate with the smallest score, we * implement a combiner to reduce the size of intermediate data in the elect UDF + * </p> + * * {@link SimpleRandomSampleWithReplacementElect}. 
- * + * * @see SimpleRandomSampleWithReplacementElect - * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics) target="_blank - * ">Boostrapping (Wikipedia)</a> - * + * @see <a href="http://en.wikipedia.org/wiki/Bootstrapping_(statistics)" target="_blank">Bootstrapping (Wikipedia)</a> + * * @author ximeng - * + * */ public class SimpleRandomSampleWithReplacementVote extends EvalFunc<DataBag> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java index 92af6a3..a5d265c 100644 --- a/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java +++ b/datafu-pig/src/main/java/datafu/pig/sampling/WeightedReservoirSample.java @@ -29,28 +29,27 @@ import org.apache.pig.builtin.Nondeterministic; import org.apache.pig.backend.executionengine.ExecException; /** - * <p> * Performs a weighted random sample using an in-memory reservoir to produce * a weighted random sample of a given size based on the A-Res algorithm described in - * {@link <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>}. - * </p> + * <a href="http://utopia.duth.gr/~pefraimi/research/data/2007EncOfAlg.pdf" target="_blank">paper</a>. + * + * <p> * Species with larger weight have higher probability to be selected in the final sample set. * </p> + * * <p> * This UDF inherits from {@link ReservoirSample} and it is guaranteed to produce * a sample of the given size. Similarly it comes at the cost of scalability. * since it uses internal storage with size equaling the desired sample to guarantee the exact sample size. * </p> - * <p> - * Its constructor takes 2 arguments. 
+ * + * Its constructor takes 2 arguments: * <ul> * <li>The 1st argument specifies the sample size which should be a string of positive integer. * <li>The 2nd argument specifies the index of the weight field in the input tuple, * which should be a string of non-negative integer that is no greater than the input tuple size. * </ul> - * </p> - * <p> + * * Example: * <pre> * {@code @@ -60,7 +59,6 @@ import org.apache.pig.backend.executionengine.ExecException; * sampled = FOREACH input_g GENERATE WeightedSample(input); * } * </pre> - * </p> * @author wjian */ http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java index 52d159b..d81fb48 100644 --- a/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java +++ b/datafu-pig/src/main/java/datafu/pig/sessions/Sessionize.java @@ -47,13 +47,12 @@ import org.joda.time.Period; * session_id, that is a GUID indicating the session of the request. 
* </p> * - * <p> * Example: * <pre> * {@code - * + * * %declare TIME_WINDOW 30m - * + * * define Sessionize datafu.pig.sessions.Sessionize('$TIME_WINDOW'); * * views = LOAD 'views.tsv' AS (visit_date:chararray, member_id:int, url:chararray); @@ -62,7 +61,7 @@ import org.joda.time.Period; * views = GROUP views BY member_id; * sessions = FOREACH views { * visits = ORDER views BY visit_date; - * GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id); + * GENERATE FLATTEN(Sessionize(VISITS)) AS (visit_date,member_id,url,session_id); * } * * -- count the number of sessions hitting the url @@ -70,7 +69,6 @@ import org.joda.time.Period; * result = FOREACH rollup GENERATE group AS url, COUNT(SESSIONS) AS session_cnt; * } * </pre> - * </p> */ @Nondeterministic public class Sessionize extends AccumulatorEvalFunc<DataBag> http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java index c9997f8..90faa1e 100644 --- a/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java +++ b/datafu-pig/src/main/java/datafu/pig/sets/SetOperationsBase.java @@ -26,9 +26,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; /** * Base class for set operations. 
- * - * @author "Matthew Hayes <[email protected]>" - * */ public abstract class SetOperationsBase extends EvalFunc<DataBag> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Median.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Median.java b/datafu-pig/src/main/java/datafu/pig/stats/Median.java index e33a84e..5f9d18d 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/Median.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/Median.java @@ -16,18 +16,18 @@ * specific language governing permissions and limitations * under the License. */ - + package datafu.pig.stats; /** - * Computes the {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>} + * Computes the <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a> * for a <b>sorted</b> input bag, using type R-2 estimation. This is a convenience wrapper around Quantile. * * <p> - * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is + * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is * done (e.g., group by 'day') if the data is too large. That is, this isn't distributed median. 
* </p> - * + * * @see Quantile */ public class Median extends Quantile http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java index 6fd42d3..89621ea 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/Quantile.java @@ -35,7 +35,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import datafu.pig.util.SimpleEvalFunc; /** - * Computes {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>} + * Computes <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a> * for a <b>sorted</b> input bag, using type R-2 estimation. * * <p> @@ -74,7 +74,6 @@ import datafu.pig.util.SimpleEvalFunc; * <li>Quantile('0.0013','0.0228','0.1587','0.5','0.8413','0.9772','0.9987') yields the 0.13th, 2.28th, 15.87th, 50th, 84.13th, 97.72nd, and 99.87th percentiles * </ul> * - * <p> * Example: * <pre> * {@code @@ -91,7 +90,7 @@ import datafu.pig.util.SimpleEvalFunc; * sorted = ORDER input BY val; * GENERATE Quantile(sorted); * } - * }</pre></p> + * }</pre> * * @see Median * @see StreamingQuantile http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java index c6fd36a..e7ba3c9 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/QuantileUtil.java @@ -23,9 +23,6 @@ import java.util.ArrayList; /** * Methods used by {@link Quantile}. 
- * - * @author "Matthew Hayes <[email protected]>" - * */ public class QuantileUtil { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java index c4c3be4..28d887d 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingMedian.java @@ -20,15 +20,15 @@ package datafu.pig.stats; /** - * Computes the approximate {@link <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a>} - * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm. + * Computes the approximate <a href="http://en.wikipedia.org/wiki/Median" target="_blank">median</a> + * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm. * This is a convenience wrapper around StreamingQuantile. * * <p> - * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is + * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is * done (e.g., group by 'day') if the data is too large. That is, this isn't distributed median. 
* </p> - * + * * @see StreamingQuantile */ public class StreamingMedian extends StreamingQuantile http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java index e4a65b4..2e36941 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/StreamingQuantile.java @@ -35,21 +35,21 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; /** - * Computes approximate {@link <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a>} + * Computes approximate <a href="http://en.wikipedia.org/wiki/Quantile" target="_blank">quantiles</a> * for a (not necessarily sorted) input bag, using the Munro-Paterson algorithm. * * <p> * The algorithm is described here: - * {@link <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a>} + * <a href="http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf" target="_blank">http://www.cs.ucsb.edu/~suri/cs290/MunroPat.pdf</a> * </p> * * <p> * The implementation is based on the one in Sawzall, available here: - * {@link <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a>} + * <a href="http://szl.googlecode.com/svn-history/r41/trunk/src/emitters/szlquantile.cc">szlquantile.cc</a> * </p> * * <p> - * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is + * N.B., all the data is pushed to a single reducer per key, so make sure some partitioning is * done (e.g., group by 'day') if the data is too large. That is, this isn't distributed quantiles. 
* </p> * @@ -95,12 +95,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; * GCD of 0.2, 0.7, and 1.0.</li> * <li>If 0.999 is requested the quantiles 0.0, 0.001, 0.002, ... , 0.998, 0.999, 1.0 are computed because 0.001 is * the GCD of 0.999 and 1.0.</li> - * </p> * </ul> - * + * * <p>The error on the approximation goes down as the number of buckets computed goes up.</p> - * - * <p> + * * Example: * <pre> * {@code @@ -115,7 +113,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; * -- produces: (1.0,3.0,5.0,8.0,10.0) * quantiles = FOREACH grouped generate Quantile(input); * } - * </pre></p> + * </pre> * * @see StreamingMedian * @see Quantile http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java index 6f22f25..dd18c56 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/VAR.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/VAR.java @@ -40,25 +40,23 @@ import org.apache.pig.backend.executionengine.ExecException; /** -* Generates the {@link <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a>} +* Generates the <a href="http://en.wikipedia.org/wiki/Variance" target="_blank">Variance</a> * of a set of Values. This UDF uses the fact that variance(x) = average(x^2) - average(x)^2 * This class implements * {@link org.apache.pig.Algebraic}, so if possible the execution will performed in a distributed fashion. * VAR implements the {@link org.apache.pig.Accumulator} interface as well. 
-* +* * Input: Bag of int, long, double, float or bytearray * Output: Double -* -* <p> +* * Example: * <pre> * define VAR datafu.pig.stats.VAR(); -* +* * -- input: 1,2,3,4,10,5,6,7,8,9 * input = LOAD 'input' AS (val:int); * grouped = GROUP input ALL; * variance = FOREACH grouped GENERATE VAR(input.val) AS variance; * </pre> -* </p> */ public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Double> { private static TupleFactory mTupleFactory = TupleFactory.getInstance(); @@ -68,7 +66,7 @@ public class VAR extends EvalFunc<Double> implements Algebraic, Accumulator<Doub try { Double sum = sum(input); Double sumSquare = sumSquare(input); - + if(sum == null) { // either we were handed an empty bag or a bag // filled with nulls - return null in this case http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java index 1448611..85a8df7 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/WilsonBinConf.java @@ -35,13 +35,15 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import datafu.pig.util.SimpleEvalFunc; /** - * Computes the {@link <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>} + * Computes the <a href="http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval" target="_blank">Wilsonian binomial proportion confidence interval</a>. + * * <p> * Constructor requires the confidence interval (alpha) parameter, and the * parameters are the number of positive (success) outcomes and the total * number of observations. 
The UDF returns the (lower,upper) confidence - * interval. - * <p> + * interval. + * </p> + * * Example: * <pre> * {@code @@ -54,7 +56,7 @@ import datafu.pig.util.SimpleEvalFunc; * quux = ORDER bar BY score DESC; * top = LIMIT quux 10; * } - * </pre></p> + * </pre> */ public class WilsonBinConf extends SimpleEvalFunc<Tuple> { @@ -82,6 +84,7 @@ public class WilsonBinConf extends SimpleEvalFunc<Tuple> * @param x The number of positive (success) outcomes * @param n The number of observations * @return The (lower,upper) confidence interval + * @throws IOException IOException */ public Tuple binconf(Long x, Long n) throws IOException { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java index 26b743e..2f0f148 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/CondEntropy.java @@ -34,34 +34,38 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; /** - * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's - * {@link <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>}, + * Calculate conditional entropy H(Y|X) of random variables X and Y following conditional entropy's + * <a href="http://en.wikipedia.org/wiki/Conditional_entropy" target="_blank">wiki definition</a>, * X is the conditional variable and Y is the variable that conditions on X. + * * <p> * Each tuple of the input bag has 2 fields, the 1st field is an object instance of variable X and * the 2nd field is an object instance of variable Y. An exception will be thrown if the number of fields is not 2. 
- * </p> + * </p> + * * <p> * This UDF's constructor definition and parameters are the same as that of {@link datafu.pig.stats.entropy.Entropy} * </p> - * <p> + * * Note: * <ul> * <li>The input bag to this UDF must be <b>sorted</b> on X and Y, with X in the first sort order. * An exception will be thrown if the input bag is not sorted. * <li>The returned entropy value is of double type. * </ul> - * </p> + * * <p> - * How to use: + * How to use: * </p> + * * <p> * This UDF calculates conditional entropy given raw data tuples of X and Y without the need to pre-compute per tuple occurrence frequency. * </p> + * * <p> * It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input. * </p> - * <p> + * * Example: * <pre> * {@code @@ -79,21 +83,20 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * } * } * </pre> - * </p> + * * Use case to calculate mutual information: - * <p> * <pre> * {@code * ------------ * -- calculate mutual information I(X, Y) using conditional entropy UDF and entropy UDF * -- I(X, Y) = H(Y) - H(Y|X) * ------------ - * + * * define CondEntropy datafu.pig.stats.entropy.CondEntropy(); * define Entropy datafu.pig.stats.entropy.Entropy(); - * + * * input = LOAD 'input' AS (grp: chararray, valX: double, valY: double); - * + * * -- calculate the I(X,Y) in each group * input_group_g = GROUP input BY grp; * mutual_information = FOREACH input_group_g { @@ -107,7 +110,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * } * } * </pre> - * </p> * @see Entropy */ public class CondEntropy extends AccumulatorEvalFunc<Double> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java 
b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java index 388b80f..1a6b846 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/EmpiricalCountEntropy.java @@ -41,37 +41,40 @@ import datafu.pig.stats.entropy.EntropyUtil; /** * Calculate the empirical entropy of random variable X given its occurrence frequencies, following entropy's - * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>} + * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>. + * * <p> * This UDF's constructor takes 1 argument: the logarithm base, whose definition is the same as that defined in {@link datafu.pig.stats.entropy.Entropy} * </p> - * <p> - * Note: + * + * Note: * <ul> * <li>Unlike {@link datafu.pig.stats.entropy.Entropy}, which calculates entropy from sorted raw data bag in accumulative mode, - * this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode. + * this UDF calculates entropy from the data's occurrence frequencies which does not need to be sorted, either in accumulative or algebraic mode.</li> * <li>Each tuple of the UDF's input bag <b>must only</b> have 1 field, the occurrence frequency of a data instance, - * and the data type of this field <b>must</b> be int or long. Otherwise, an exception will be thrown. - * <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file. - * <li>The returned entropy value is of double type. + * and the data type of this field <b>must</b> be int or long. 
Otherwise, an exception will be thrown.</li> + * <li>Negative frequency number will be silently discarded and a warning message will be logged in the job's log file.</li> + * <li>The returned entropy value is of double type.</li> * </ul> - * </p> + * * <p> - * How to use: + * How to use: * </p> + * * <p> * To use this UDF, customer needs to pre-compute the occurrence frequency of each data instance, often in an outer GROUP BY * , and then use this UDF to calculate entropy with those frequency numbers in another outer GROUP BY. * </p> + * * <p> * Compared with {@link datafu.pig.stats.entropy.Entropy}, this UDF is more scalable when we need to handle a very large data set, * since it could distribute computation onto mappers and take advantage of combiners to reduce intermedidate output from mappers to reducers. * </p> - * <p> + * * Example: * <pre> * {@code - * + * * define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy(); * * input = LOAD 'input' AS (val: double); @@ -79,48 +82,48 @@ import datafu.pig.stats.entropy.EntropyUtil; * -- calculate the occurrence of each instance * counts_g = GROUP input BY val; * counts = FOREACh counts_g GENERATE COUNT(input) AS cnt; - * - * -- calculate entropy + * + * -- calculate entropy * input_counts_g = GROUP counts ALL; * entropy = FOREACH input_counts_g GENERATE Entropy(counts) AS entropy; * } * </pre> - * </p> + * * Use case to calculate mutual information using EmpiricalCountEntropy: - * <p> + * * <pre> * {@code - * + * * define Entropy datafu.pig.stats.entropy.EmpiricalCountEntropy(); - * + * * input = LOAD 'input' AS (valX: double, valY: double); - * + * * ------------ * -- calculate mutual information I(X, Y) using entropy * -- I(X, Y) = H(X) + H(Y) - H(X, Y) * ------------ - * + * * input_x_y_g = GROUP input BY (valX, valY); * input_x_y_cnt = FOREACH input_x_y_g GENERATE flatten(group) as (valX, valY), COUNT(input) AS cnt; - * + * * input_x_g = GROUP input_x_y_cnt BY valX; * input_x_cnt = FOREACH input_x_g 
GENERATE flatten(group) as valX, SUM(input_x_y_cnt.cnt) AS cnt; - * + * * input_y_g = GROUP input_x_y_cnt BY valY; * input_y_cnt = FOREACH input_y_g GENERATE flatten(group) as valY, SUM(input_x_y_cnt.cnt) AS cnt; - * + * * input_x_y_entropy_g = GROUP input_x_y_cnt ALL; * input_x_y_entropy = FOREACH input_x_y_entropy_g { * input_x_y_entropy_cnt = input_x_y_cnt.cnt; * GENERATE Entropy(input_x_y_entropy_cnt) AS x_y_entropy; * } - * + * * input_x_entropy_g = GROUP input_x_cnt ALL; * input_x_entropy = FOREACH input_x_entropy_g { * input_x_entropy_cnt = input_x_cnt.cnt; * GENERATE Entropy(input_x_entropy_cnt) AS x_entropy; * } - * + * * input_y_entropy_g = GROUP input_y_cnt ALL; * input_y_entropy = FOREACH input_y_entropy_g { * input_y_entropy_cnt = input_y_cnt.cnt; @@ -133,7 +136,6 @@ import datafu.pig.stats.entropy.EntropyUtil; * input_x_y_entropy::x_y_entropy) AS mi; * } * </pre> - * </p> * @see Entropy */ http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java index 9dfff1a..efa1d35 100644 --- a/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java +++ b/datafu-pig/src/main/java/datafu/pig/stats/entropy/Entropy.java @@ -32,53 +32,57 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; /** - * Calculate entropy H(X) of random variable X following entropy's - * {@link <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a>} + * Calculate entropy H(X) of random variable X following entropy's + * <a href="http://en.wikipedia.org/wiki/Entropy_%28information_theory%29" target="_blank">wiki definition</a> + * * <p> * This UDF's constructor takes 2 arguments. 
* </p> - * <p> + * * The 1st argument, the type of entropy estimator algorithm we currently support, includes: * <ul> * <li>empirical (empirical entropy estimator) * <li>chaosh (Chao-Shen entropy estimator) * </ul> - * </p> + * * <p> * The default estimation algorithm is empirical. * </p> + * * <p> * The 2nd argument, the logarithm base we currently support, includes: * </p> - * <p> + * * <ul> * <li>log (use Euler's number as the logarithm base) * <li>log2 (use 2 as the logarithm base) * <li>log10 (use 10 as the logarithm base) * </ul> - * </p> + * * <p> * The default logarithm base is log. * </p> - * <p> + * * Note: * <ul> * <li>The input to this UDF must be a <b>sorted</b> bag of raw data tuples of X. * An exception will be thrown if the input bag is not sorted * <li>The returned entropy value is of double type. * </ul> - * </p> + * * <p> * How to use: * </p> + * * <p> * This UDF calculates entropy from raw data tuples without the need to pre-compute per tuple occurrence frequency. * </p> + * * <p> * It could be used in a nested FOREACH after a GROUP BY, in which we sort the inner bag and use the sorted bag as this UDF's input. 
* </p> + * * Example: - * <p> * <pre> * {@code * --calculate empirical entropy with Euler's number as the logarithm base @@ -95,7 +99,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * } * } * </pre> - * </p> * @see CondEntropy * @see EmpiricalCountEntropy */ http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java index ee2c3f3..3dd4829 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java +++ b/datafu-pig/src/main/java/datafu/pig/util/AliasableEvalFunc.java @@ -49,8 +49,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * such as transposing two fields of the same type. If this contract is violated, say, by attempting to reference * a field that is not present, a meaningful error message may be thrown. * </p> - * - * <p> + * * Example: This example computes the monthly payments for mortgages depending on interest rate. * <pre> * {@code @@ -58,11 +57,11 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * ... 
* public DataBag exec(Tuple input) throws IOException { * DataBag output = BagFactory.getInstance().newDefaultBag(); - * + * * Double principal = getDouble(input, "principal"); // get a value from the input tuple by alias * Integer numPayments = getInteger(input, "num_payments"); * DataBag interestRates = getBag(input, "interest_rates"); - * + * * for (Tuple interestTuple : interestRates) { * Double interest = getDouble(interestTuple, getPrefixedAliasName("interest_rates", "interest_rate")); // get a value from the inner bag tuple by alias * double monthlyPayment = computeMonthlyPayment(principal, numPayments, interest); @@ -73,11 +72,10 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; * } * } * </pre> - * </p> - * + * * @author wvaughan * - * @param <T> + * @param <T> type that the eval func returns */ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T> { @@ -101,8 +99,8 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T> /** * Specify the output schema as in {link EvalFunc#outputSchema(Schema)}. * - * @param input - * @return outputSchema + * @param input input schema + * @return outputSchema output schema */ public abstract Schema getOutputSchema(Schema input); @@ -151,10 +149,10 @@ public abstract class AliasableEvalFunc<T> extends ContextualEvalFunc<T> } /** - * Field aliases are generated from the input schema<br/> - * Each alias maps to a bag position<br/> + * Field aliases are generated from the input schema. + * Each alias maps to a bag position. 
* Inner bags/tuples will have alias of outer.inner.foo - * + * * @return A map of field alias to field position */ public Map<String, Integer> getFieldAliases() http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java index 16f9247..27ae134 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java +++ b/datafu-pig/src/main/java/datafu/pig/util/AssertUDF.java @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ - + package datafu.pig.util; import java.io.IOException; @@ -26,28 +26,26 @@ import org.apache.pig.data.Tuple; /** * Filter function which asserts that a value is true. - * + * * <p> * Unfortunately, the Pig interpreter doesn't recognize boolean expressions nested as function * arguments, so this uses C-style booleans. That is, the first argument should be * an integer. 0 is interpreted as "false", and anything else is considered "true". * The function will cause the Pig script to fail if a "false" value is encountered. * </p> - * + * * <p> * There is a unary and a binary version. The unary version just takes a boolean, and throws out a generic exception message when the * assertion is violated. The binary version takes a String as a second argument and throws that out when the assertion * is violated. * </p> - * - * <p> + * * Example: * <pre> * {@code * FILTER members BY AssertUDF( (member_id >= 0 ? 1 : 0), 'Doh! Some member ID is negative.' 
); * } * </pre> - * </p> */ public class AssertUDF extends FilterFunc { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java index f8e25f4..855b305 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java +++ b/datafu-pig/src/main/java/datafu/pig/util/Coalesce.java @@ -27,9 +27,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; /** - * Returns the first non-null value from a tuple, just like {@link <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a>} in SQL. - * - * <p> + * Returns the first non-null value from a tuple, just like <a href="http://msdn.microsoft.com/en-us/library/ms190349.aspx" target="_blank">COALESCE</a> in SQL. + * * Example: * <pre> * {@code @@ -44,10 +43,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; * * } * </pre> - * </p> - * - * @author "Matthew Hayes <[email protected]>" - * */ public class Coalesce extends AliasableEvalFunc<Object> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java index c534b77..5e26ac1 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java +++ b/datafu-pig/src/main/java/datafu/pig/util/ContextualEvalFunc.java @@ -29,8 +29,8 @@ import org.apache.pig.impl.util.UDFContext; * on the front end which will be available on the back end. 
* For example, properties may be set in the call to outputSchema(), * which will be available when exec() is called. - * - * @param <T> + * + * @param <T> the type the eval function returns */ public abstract class ContextualEvalFunc<T> extends EvalFunc<T> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java index 0066aa8..9aa80ff 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java +++ b/datafu-pig/src/main/java/datafu/pig/util/DataFuException.java @@ -60,7 +60,7 @@ public class DataFuException extends RuntimeException /** * Gets data relevant to this exception. * - * @return data + * @return data the data relevant to this exception */ public Object getData() { @@ -70,7 +70,7 @@ public class DataFuException extends RuntimeException /** * Sets field aliases for a UDF which may be relevant to this exception. * - * @param fieldAliases + * @param fieldAliases field aliases */ public void setFieldAliases(Map<String, Integer> fieldAliases) { @@ -79,7 +79,7 @@ public class DataFuException extends RuntimeException /** * Sets data relevant to this exception. 
- * @param data + * @param data data relevant to this exception */ public void setData(Object data) { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java index 5057285..08d81af 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/InUDF.java +++ b/datafu-pig/src/main/java/datafu/pig/util/InUDF.java @@ -25,26 +25,24 @@ import org.apache.pig.FilterFunc; import org.apache.pig.data.Tuple; /** - * Similar to the SQL IN function, this function provides a convenient way to filter - * using a logical disjunction over many values. + * Similar to the SQL IN function, this function provides a convenient way to filter + * using a logical disjunction over many values. * Returns true when the first value of the tuple is contained within the remainder of the tuple. 
- * - * <p> + * * Example: * <pre> * {@code * define In datafu.pig.util.InUDF(); * -- cars: (alice, red), (bob, blue), (charlie, green), (dave, red); * cars = LOAD cars AS (owner:chararray, color:chararray); - * + * * -- cars: (alice, red), (bob, blue), (dave, red); * red_blue_cars = FILTER cars BY In(color, 'red', 'blue'); - * - * }</pre> - * </p> - * - * @author wvaughan * + * } + * </pre> + * + * @author wvaughan */ public class InUDF extends FilterFunc { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java index f8a39df..d7c2592 100644 --- a/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java +++ b/datafu-pig/src/main/java/datafu/pig/util/TransposeTupleToBag.java @@ -34,8 +34,7 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; /** * Performs a transpose on a tuple, resulting in a bag of key, value fields where * the key is the column name and the value is the value of that column in the tuple. 
- * - * <p> + * * Example: * <pre> * {@code @@ -50,10 +49,6 @@ import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; * * } * </pre> - * </p> - * - * @author "William Vaughan <[email protected]>" - * */ public class TransposeTupleToBag extends AliasableEvalFunc<DataBag> { http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java index ac3e409..f652101 100644 --- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java +++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHPigTest.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ package datafu.test.pig.hash.lsh; import java.io.IOException; http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java index be64bc8..a6615ed 100644 --- a/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java +++ b/datafu-pig/src/test/java/datafu/test/pig/hash/lsh/LSHTest.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ package datafu.test.pig.hash.lsh; import java.util.ArrayList; http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java ---------------------------------------------------------------------- diff --git a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java index 99af987..fd7ff05 100644 --- a/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java +++ b/datafu-pig/src/test/java/datafu/test/pig/util/Base64Test.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ package datafu.test.pig.util; import org.adrianwalker.multilinestring.Multiline; http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle.properties ---------------------------------------------------------------------- diff --git a/gradle.properties b/gradle.properties index 33df918..648af68 100644 --- a/gradle.properties +++ b/gradle.properties @@ -1,2 +1,4 @@ group=org.apache.datafu -version=1.2.1 +version=1.3.0-SNAPSHOT +gradleVersion=1.12 +org.gradle.jvmargs="-XX:MaxPermSize=512m" http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/buildscript.gradle ---------------------------------------------------------------------- diff --git a/gradle/buildscript.gradle b/gradle/buildscript.gradle index 225e0a8..669eb6e 100644 --- a/gradle/buildscript.gradle +++ b/gradle/buildscript.gradle @@ -1,7 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + repositories { repositories { - // For license plugin. maven { + // For gradle-nexus-plugin + url 'http://jcenter.bintray.com/' + } + maven { + // For license plugin. 
url 'http://dl.bintray.com/content/netflixoss/external-gradle-plugins/' } } @@ -9,4 +32,5 @@ repositories { dependencies { classpath 'nl.javadude.gradle.plugins:license-gradle-plugin:0.6.1' + classpath 'org.gradle.api.plugins:gradle-nexus-plugin:0.7.1' } http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/dependency-versions.gradle ---------------------------------------------------------------------- diff --git a/gradle/dependency-versions.gradle b/gradle/dependency-versions.gradle index eb24e4a..3b0835f 100644 --- a/gradle/dependency-versions.gradle +++ b/gradle/dependency-versions.gradle @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + ext { antlrVersion="3.2" avroVersion="1.7.4" http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/0f9b853b/gradle/release.gradle ---------------------------------------------------------------------- diff --git a/gradle/release.gradle b/gradle/release.gradle new file mode 100644 index 0000000..c52b69c --- /dev/null +++ b/gradle/release.gradle @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +project(':') { + apply plugin: 'base' + apply plugin: 'signing' +} + +task sourceRelease(type: Tar) { + description = "Build a source release, specifically excluding the build directories and gradle wrapper files" + compression = Compression.GZIP + + baseName = "datafu-sources-${project.version}-incubating" + + from(project.rootDir) { + exclude '**/build' + exclude 'build' + exclude '.gradle' + exclude 'gradlew' + exclude 'gradlew.bat' + exclude 'gradle/wrapper/gradle-wrapper.jar' + exclude 'gradle/wrapper/gradle-wrapper.properties' + } + + into(baseName) + + // Set destination directory. + destinationDir = file("${project.buildDir}/distribution/source") + + archiveName = "${baseName}.tgz" + doLast { // generate md5 checksum + ant.checksum file:"$destinationDir/$archiveName" + } +} + +signing { + // TODO: this doesn't show up in the 'tasks' for some reason, need to figure out why. + // This creates a task 'signSourceRelease' that builds the source release and signs it. + sign sourceRelease +} + +// Publishing to Apache's Maven repository (Nexus). To install the archives in the +// local repository, run the 'install' task. 
+subprojects { + apply plugin: 'nexus' + + nexus { + attachSources = true + attachTests = false + attachJavadoc = true + sign = true + repositoryUrl = 'https://repository.apache.org/service/local/staging/deploy/maven2' + snapshotRepositoryUrl = 'https://repository.apache.org/content/repositories/snapshots' + } + + modifyPom { + project { + name 'Apache DataFu (incubating)' + description 'Libraries that make it easier to solve data problems using Hadoop and higher level languages based on it.' + url 'http://datafu.incubator.apache.org/' + + scm { + url 'https://git-wip-us.apache.org/repos/asf?p=incubator-datafu.git;a=tree' + connection 'scm:http://git-wip-us.apache.org/repos/asf/incubator-datafu.git' + developerConnection 'scm:https://git-wip-us.apache.org/repos/asf/incubator-datafu.git' + } + + licenses { + license { + name 'The Apache Software License, Version 2.0' + url 'http://www.apache.org/licenses/LICENSE-2.0.txt' + } + } + } + } +}
