github-advanced-security[bot] commented on code in PR #511: URL: https://github.com/apache/datasketches-java/pull/511#discussion_r1500074451
########## src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java: ########## @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.tdigest; + +import java.nio.ByteOrder; +import java.util.function.Function; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.memory.Buffer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableBuffer; +import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.quantilescommon.QuantilesAPI; + +/** + * t-Digest for estimating quantiles and ranks. + * This implementation is based on the following paper: + * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests + * and the following implementation: + * https://github.com/tdunning/t-digest + * This implementation is similar to MergingDigest in the above implementation + */ +public final class TDigestDouble { + + public static final boolean USE_ALTERNATING_SORT = true; + public static final boolean USE_TWO_LEVEL_COMPRESSION = true; + public static final boolean USE_WEIGHT_LIMIT = true; + + private boolean reverseMerge_; + private final int k_; + private final int internalK_; + private double minValue_; + private double maxValue_; + private int centroidsCapacity_; + private int numCentroids_; + private double[] centroidMeans_; + private long[] centroidWeights_; + private long centroidsWeight_; + private int bufferCapacity_; + private int numBuffered_; + private double[] bufferValues_; + private long[] bufferWeights_; + private long bufferedWeight_; + + private static final byte PREAMBLE_LONGS_EMPTY = 1; + private static final byte PREAMBLE_LONGS_NON_EMPTY = 2; + private static final byte SERIAL_VERSION = 1; + private static final byte SKETCH_TYPE = 20; + + private static final int COMPAT_DOUBLE = 1; + private static final int COMPAT_FLOAT = 2; + + enum flags { IS_EMPTY, REVERSE_MERGE }; + + public TDigestDouble(final int k) { + this(false, k, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, null, null, 0); + } + + private TDigestDouble(final boolean reverseMerge, final int k, final double min, final double max, + final double[] means, final long[] weights, final long weight) { + reverseMerge_ = reverseMerge; + k_ = k; + minValue_ = min; + maxValue_ = max; + if (k < 10) throw new SketchesArgumentException("k must be at least 10"); + int fudge = 0; + if (USE_WEIGHT_LIMIT) { + fudge = 10; + if (k < 30) fudge += 20; + } + centroidsCapacity_ = k_ * 2 + fudge; + bufferCapacity_ = centroidsCapacity_ * 5; + double scale = Math.max(1.0, (double) bufferCapacity_ / centroidsCapacity_ - 1.0); + if (!USE_TWO_LEVEL_COMPRESSION) scale = 1; + internalK_ = (int) Math.ceil(Math.sqrt(scale) * k_); + centroidsCapacity_ = Math.max(centroidsCapacity_, internalK_ + fudge); + bufferCapacity_ = Math.max(bufferCapacity_, centroidsCapacity_ * 2); + centroidMeans_ = new double[centroidsCapacity_]; + centroidWeights_ = new long[centroidsCapacity_]; + bufferValues_ = new double[bufferCapacity_]; + bufferWeights_ = new long[bufferCapacity_]; + numCentroids_ = 0; + numBuffered_ = 0; + centroidsWeight_ = weight; + bufferedWeight_ = 0; + if (weight > 0) { + System.arraycopy(means, 0, centroidMeans_, 0, means.length); Review Comment: ## Dereferenced variable may be null Variable [means](1) may be null at this access because of [this](2) null argument. [Show more details](https://github.com/apache/datasketches-java/security/code-scanning/683) ########## src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java: ########## @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.tdigest; + +import java.nio.ByteOrder; +import java.util.function.Function; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.SketchesStateException; +import org.apache.datasketches.memory.Buffer; +import org.apache.datasketches.memory.Memory; +import org.apache.datasketches.memory.WritableBuffer; +import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.quantilescommon.QuantilesAPI; + +/** + * t-Digest for estimating quantiles and ranks. + * This implementation is based on the following paper: + * Ted Dunning, Otmar Ertl. Extremely Accurate Quantiles Using t-Digests + * and the following implementation: + * https://github.com/tdunning/t-digest + * This implementation is similar to MergingDigest in the above implementation + */ +public final class TDigestDouble { + + public static final boolean USE_ALTERNATING_SORT = true; + public static final boolean USE_TWO_LEVEL_COMPRESSION = true; + public static final boolean USE_WEIGHT_LIMIT = true; + + private boolean reverseMerge_; + private final int k_; + private final int internalK_; + private double minValue_; + private double maxValue_; + private int centroidsCapacity_; + private int numCentroids_; + private double[] centroidMeans_; + private long[] centroidWeights_; + private long centroidsWeight_; + private int bufferCapacity_; + private int numBuffered_; + private double[] bufferValues_; + private long[] bufferWeights_; + private long bufferedWeight_; + + private static final byte PREAMBLE_LONGS_EMPTY = 1; + private static final byte PREAMBLE_LONGS_NON_EMPTY = 2; + private static final byte SERIAL_VERSION = 1; + private static final byte SKETCH_TYPE = 20; + + private static final int COMPAT_DOUBLE = 1; + private static final int COMPAT_FLOAT = 2; + + enum flags { IS_EMPTY, REVERSE_MERGE }; + + public TDigestDouble(final int k) { + this(false, k, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, null, null, 0); + } + + private TDigestDouble(final boolean reverseMerge, final int k, final double min, final double max, + final double[] means, final long[] weights, final long weight) { + reverseMerge_ = reverseMerge; + k_ = k; + minValue_ = min; + maxValue_ = max; + if (k < 10) throw new SketchesArgumentException("k must be at least 10"); + int fudge = 0; + if (USE_WEIGHT_LIMIT) { + fudge = 10; + if (k < 30) fudge += 20; + } + centroidsCapacity_ = k_ * 2 + fudge; + bufferCapacity_ = centroidsCapacity_ * 5; + double scale = Math.max(1.0, (double) bufferCapacity_ / centroidsCapacity_ - 1.0); + if (!USE_TWO_LEVEL_COMPRESSION) scale = 1; + internalK_ = (int) Math.ceil(Math.sqrt(scale) * k_); + centroidsCapacity_ = Math.max(centroidsCapacity_, internalK_ + fudge); + bufferCapacity_ = Math.max(bufferCapacity_, centroidsCapacity_ * 2); + centroidMeans_ = new double[centroidsCapacity_]; + centroidWeights_ = new long[centroidsCapacity_]; + bufferValues_ = new double[bufferCapacity_]; + bufferWeights_ = new long[bufferCapacity_]; + numCentroids_ = 0; + numBuffered_ = 0; + centroidsWeight_ = weight; + bufferedWeight_ = 0; + if (weight > 0) { + System.arraycopy(means, 0, centroidMeans_, 0, means.length); + System.arraycopy(weights, 0, centroidWeights_, 0, weights.length); Review Comment: ## Dereferenced variable may be null Variable [weights](1) may be null at this access because of [this](2) null argument. [Show more details](https://github.com/apache/datasketches-java/security/code-scanning/684) ########## src/main/java/org/apache/datasketches/tdigest/Sort.java: ########## @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.tdigest; + +import java.util.Random; + +public final class Sort { + private static final Random prng = new Random(); + + public static void stableSort(final double[] keys, final long[] values, final int n) { + stableLimitedQuickSort(keys, values, 0, n, 64); + stableLimitedInsertionSort(keys, values, 0, n, 64); + } + + private static void stableLimitedQuickSort(final double[] keys, final long[] values, int start, int end, final int limit) { + // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases + while (end - start > limit) { + + final int pivotIndex = start + prng.nextInt(end - start); + double pivotValue = keys[pivotIndex]; + + // move pivot to beginning of array + swap(keys, start, pivotIndex); + swap(values, start, pivotIndex); + + // use a three way partition because many duplicate values is an important case + int low = start + 1; // low points to first value not known to be equal to pivotValue + int high = end; // high points to first value > pivotValue + int i = low; // i scans the array + while (i < high) { + // invariant: values[k] == pivotValue for k in [0..low) + // invariant: values[k] < pivotValue for k in [low..i) + // invariant: values[k] > pivotValue for k in [high..end) + // in-loop: i < high + // in-loop: low < high + // in-loop: i >= low + final double vi = keys[i]; + if (vi == pivotValue && i == pivotIndex) { + if (low != i) { + swap(keys, low, i); + swap(values, low, i); + } else { + i++; + } + low++; + } else if (vi > pivotValue || (vi == pivotValue && i > pivotIndex)) { + high--; + swap(keys, i, high); + swap(values, i, high); + } else { + i++; + } + } + // assert i == high || low == high therefore, we are done with partition + // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are > + // we have to move the values equal to the pivot into the middle. To do this, we swap pivot + // values into the top end of the [low,high) range stopping when we run out of destinations + // or when we run out of values to copy + int from = start; + int to = high - 1; + for (i = 0; from < low && to >= low; i++) { + swap(keys, from, to); + swap(values, from++, to--); + } + if (from == low) { + // ran out of things to copy. This means that the last destination is the boundary + low = to + 1; + } else { + // ran out of places to copy to. This means that there are uncopied pivots and the + // boundary is at the beginning of those + low = from; + } + + // now recurse, but arrange it to handle the longer limit by tail recursion + // we have to sort the pivot values because they may have different weights + // we can't do that, however until we know how much weight is in the left and right + if (low - start < end - high) { + // left side is smaller + stableLimitedQuickSort(keys, values, start, low, limit); + // this is really a way to do + // quickSort(keys, values, high, end, limit); + start = high; + } else { + stableLimitedQuickSort(keys, values, high, end, limit); + // this is really a way to do + // quickSort(keys, values, start, low, limit); + end = low; + } + } + } + + private static void stableLimitedInsertionSort(final double[] keys, final long[] values, int from, int to, final int limit) { + for (int i = from + 1; i < to; i++) { + final double k = keys[i]; + final long v = values[i]; + final int m = Math.max(i - limit, from); + // values in [from, i) are ordered + // scan backwards to find where to stick the current key + for (int j = i; j >= m; j--) { + if (j == 0 || keys[j - 1] < k || (keys[j - 1] == k && (j - 1 <= i))) { Review Comment: ## Useless comparison test Test is always true. [Show more details](https://github.com/apache/datasketches-java/security/code-scanning/682) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
