cloud-fan commented on a change in pull request #34380:
URL: https://github.com/apache/spark/pull/34380#discussion_r736609256



##########
File path: 
sql/catalyst/src/main/java/org/apache/spark/sql/util/NumericHistogram.java
##########
@@ -0,0 +1,288 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.util;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Random;
+
+
+/**
+ * A generic, re-usable histogram class that supports partial aggregations.
+ * The algorithm is a heuristic adapted from the following paper:
+ * Yael Ben-Haim and Elad Tom-Tov, "A streaming parallel decision tree algorithm",
+ * J. Machine Learning Research 11 (2010), pp. 849--872. Although there are no approximation
+ * guarantees, it appears to work well with adequate data and a large (e.g., 20-80) number
+ * of histogram bins.
+ *
+ * Adapted from Hive's NumericHistogram. Can refer to
+ * https://github.com/apache/hive/blob/master/ql/src/
+ * java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java
+ *
+ * Differences:
+ *   1. Declaring [[Coord]] and its fields as public for
+ *      easy access in the HistogramNumeric class.
+ *   2. Add method [[getNBins()]] for serializing [[NumericHistogram]]
+ *      in [[NumericHistogramSerializer]].
+ *   3. Add method [[setBin()]] for deserializing [[NumericHistogram]]
+ *      in [[NumericHistogramSerializer]].
+ *   4. In Hive's code, the method [[merge()]] is passed a serialized histogram;
+ *      in Spark, this method is passed a deserialized histogram.
+ *      Here we change the bin-merging code accordingly.
+ */
+public class NumericHistogram {
+    /**
+     * The Coord class defines a histogram bin, which is just an (x,y) pair.
+     */
+    public static class Coord implements Comparable {
+        public double x;
+        public double y;
+
+        public int compareTo(Object other) {
+            return Double.compare(x, ((Coord) other).x);
+        }
+    };
+
+    // Class variables
+    private int nbins;
+    private int nusedbins;
+    private ArrayList<Coord> bins;
+    private Random prng;
+
+    /**
+     * Creates a new histogram object. Note that the allocate() or merge()
+     * method must be called before the histogram can be used.
+     */
+    public NumericHistogram() {
+        nbins = 0;
+        nusedbins = 0;
+        bins = null;
+
+        // init the RNG for breaking ties in histogram merging. A fixed seed is specified here
+        // to aid testing, but can be eliminated to use a time-based seed (which would
+        // make the algorithm non-deterministic).
+        prng = new Random(31183);
+    }
+
+    /**
+     * Resets a histogram object to its initial state. allocate() or merge() must be
+     * called again before use.
+     */
+    public void reset() {
+        bins = null;
+        nbins = nusedbins = 0;
+    }
+
+    /**
+     * Returns the number of bins.
+     */
+    public int getNBins() {
+        return nbins;
+    }
+
+    /**
+     * Returns the number of bins currently being used by the histogram.
+     */
+    public int getUsedBins() {
+        return nusedbins;
+    }
+
+    /**
+     * Set the number of bins currently being used by the histogram.
+     */
+    public void setUsedBins(int nusedBins) {
+        this.nusedbins = nusedBins;
+    }
+
+    /**
+     * Returns true if this histogram object has been initialized by calling merge()
+     * or allocate().
+     */
+    public boolean isReady() {
+        return nbins != 0;
+    }
+
+    /**
+     * Returns a particular histogram bin.
+     */
+    public Coord getBin(int b) {
+        return bins.get(b);
+    }
+
+    /**
+     * Set a particular histogram bin with index.
+     */
+    public void setBin(double x, double y, int b) {
+        Coord coord = new Coord();
+        coord.x = x;
+        coord.y = y;
+        bins.add(b, coord);
+    }
+
+    /**
+     * Sets the number of histogram bins to use for approximating data.
+     *
+     * @param num_bins Number of non-uniform-width histogram bins to use
+     */
+    public void allocate(int num_bins) {
+        nbins = num_bins;
+        bins = new ArrayList<Coord>();
+        nusedbins = 0;
+    }
+
+    /**
+     * Takes a serialized histogram created by the serialize() method and 
merges

Review comment:
       We need to update the doc here.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to