This is an automated email from the git hooks/post-receive script. bengen pushed a commit to branch master in repository t-digest.
commit 69830b7c8eb9c89d98e32d8a6ec4f88b19bc59c9 Author: Hilko Bengen <[email protected]> Date: Mon May 11 04:04:53 2015 +0200 Reverted to t-digest 3.0 because 3.1 had introduced grave API/ABI incompatibility issues. (Closes: #784947) --- .travis.yml | 4 - README.md | 13 +- docs/t-digest-paper/deviation-array.pdf | Bin 5675 -> 0 bytes docs/t-digest-paper/deviation-avl.pdf | Bin 5829 -> 0 bytes docs/t-digest-paper/deviation-tree.pdf | Bin 5859 -> 0 bytes docs/t-digest-paper/error-array.pdf | Bin 6105 -> 0 bytes docs/t-digest-paper/error-avl.pdf | Bin 5829 -> 0 bytes docs/t-digest-paper/error-tree.pdf | Bin 5852 -> 0 bytes docs/t-digest-paper/histo.pdf | Bin 1494120 -> 1493366 bytes docs/t-digest-paper/histo.tex | 13 +- docs/t-digest-paper/k-q-diagram.graffle | 919 --------------------- docs/t-digest-paper/k-q-diagram.pdf | Bin 15697 -> 0 bytes docs/t-digest-paper/sizes-array.pdf | Bin 454980 -> 0 bytes docs/t-digest-paper/sizes-avl.pdf | Bin 101900 -> 0 bytes docs/t-digest-paper/sizes-tree.pdf | Bin 101965 -> 0 bytes pom.xml | 18 +- .../java/com/tdunning/math/stats/AVLGroupTree.java | 7 +- .../com/tdunning/math/stats/AVLTreeDigest.java | 34 +- .../com/tdunning/math/stats/AbstractTDigest.java | 42 +- .../java/com/tdunning/math/stats/ArrayDigest.java | 108 ++- .../java/com/tdunning/math/stats/Centroid.java | 9 +- .../java/com/tdunning/math/stats/GroupTree.java | 10 +- .../java/com/tdunning/math/stats/IntAVLTree.java | 2 +- .../com/tdunning/math/stats/MergingDigest.java | 640 -------------- src/main/java/com/tdunning/math/stats/Sort.java | 247 ------ src/main/java/com/tdunning/math/stats/TDigest.java | 39 +- .../java/com/tdunning/math/stats/TreeDigest.java | 21 +- .../com/tdunning/math/stats/AVLTreeDigestTest.java | 123 +-- .../com/tdunning/math/stats/ArrayDigestTest.java | 34 +- .../com/tdunning/math/stats/MergingDigestTest.java | 566 ------------- .../java/com/tdunning/math/stats/SortTest.java | 149 ---- .../com/tdunning/math/stats/TDigestBenchmark.java | 4 +- .../math/stats/TDigestMemoryBenchmark.java | 2 +- .../java/com/tdunning/math/stats/TDigestTest.java | 77 +- .../com/tdunning/math/stats/TreeDigestTest.java | 39 +- 35 files changed, 222 insertions(+), 2898 deletions(-) diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 10904f3..0000000 --- a/.travis.yml +++ /dev/null @@ -1,4 +0,0 @@ -language: java -jdk: - - oraclejdk7 - - openjdk7 diff --git a/README.md b/README.md index ee489fd..bb169eb 100644 --- a/README.md +++ b/README.md @@ -56,11 +56,11 @@ The data from these tests are stored in a variety of data files in the root dire quite large. To visualize the contents of these files, copy all of them into the t-digest-paper directory so that they are accessible to the R scripts there: - cp *.?sv docs/t-digest-paper/ + cp *.?sv docs/theory/t-digest-paper/ At this point you can run the R analysis scripts: - cd docs/t-digest-paper/ + cd docs/theory/t-digest-paper/ for i in *.r; do (R --slave -f $i; echo $i complete) & echo $i started; done Most of these scripts will complete almost instantaneously; one or two will take a few tens of seconds. @@ -69,13 +69,4 @@ The output of these scripts are a collection of PNG image files that can be view such as Preview on a Mac. Many of these images are used as figures in the paper in the same directory with the R scripts. -Continuous Integration -================= -The t-digest project makes use of Travis integration with github for testing whenever a change is made. - -You can see the reports at: - - https://travis-ci.org/tdunning/t-digest - -travis update diff --git a/docs/t-digest-paper/deviation-array.pdf b/docs/t-digest-paper/deviation-array.pdf deleted file mode 100644 index 0124349..0000000 Binary files a/docs/t-digest-paper/deviation-array.pdf and /dev/null differ diff --git a/docs/t-digest-paper/deviation-avl.pdf b/docs/t-digest-paper/deviation-avl.pdf deleted file mode 100644 index 1a1a379..0000000 Binary files a/docs/t-digest-paper/deviation-avl.pdf and /dev/null differ diff --git a/docs/t-digest-paper/deviation-tree.pdf b/docs/t-digest-paper/deviation-tree.pdf deleted file mode 100644 index 758132f..0000000 Binary files a/docs/t-digest-paper/deviation-tree.pdf and /dev/null differ diff --git a/docs/t-digest-paper/error-array.pdf b/docs/t-digest-paper/error-array.pdf deleted file mode 100644 index 0c3b7d7..0000000 Binary files a/docs/t-digest-paper/error-array.pdf and /dev/null differ diff --git a/docs/t-digest-paper/error-avl.pdf b/docs/t-digest-paper/error-avl.pdf deleted file mode 100644 index e223ab9..0000000 Binary files a/docs/t-digest-paper/error-avl.pdf and /dev/null differ diff --git a/docs/t-digest-paper/error-tree.pdf b/docs/t-digest-paper/error-tree.pdf deleted file mode 100644 index 6b10be5..0000000 Binary files a/docs/t-digest-paper/error-tree.pdf and /dev/null differ diff --git a/docs/t-digest-paper/histo.pdf b/docs/t-digest-paper/histo.pdf index 85c92e0..dff7144 100644 Binary files a/docs/t-digest-paper/histo.pdf and b/docs/t-digest-paper/histo.pdf differ diff --git a/docs/t-digest-paper/histo.tex b/docs/t-digest-paper/histo.tex index f6643cd..26ccc95 100644 --- a/docs/t-digest-paper/histo.tex +++ b/docs/t-digest-paper/histo.tex @@ -20,8 +20,6 @@ \author{Ted Dunning} \address{Ted Dunning \\ MapR Technologies, Inc \\ San Jose, CA} \email{[email protected]} -\author{Otmar Ertl} -\email{[email protected]} \date{} % Activate to display a given date or no date \begin{document} \begin{abstract} @@ -58,10 +56,10 @@ The basic outline of the algorithm for constructing a $t$-digest is quite simple \KwOut{final set $C=[c_1 \ldots c_m]$ of weighted centroids} \For{$(x_n, w_n) \in X$} { $z = \min | c_i.\mathtt{mean} - x |$\; - $S = \lbrace c_i : |c_i.\mathtt{mean} - x| = z \wedge c_i.\mathtt{count} + w_n \le 4n\delta q(c_i) (1-q(c_i)) \rbrace $\; + $S = \lbrace c_i : |c_i.\mathtt{mean} - x| = z \wedge c_i.\mathtt{count} + 1 \le 4n\delta q(c_i) (1-q(c_i)) \rbrace $\; \While {$S \ne \lbrace \rbrace\wedge w_n > 0$} { Sample $c_j \sim \mathrm{Uniform}( S)$\; - $\Delta w = \min(4n\delta q(c_j) (1-q(c_j))-c_j.\mathtt{count}, w_n)$\; + $\Delta w = \min(4n\delta q(c_i) (1-q(c_i))-c_j.\mathtt{count}, w_n)$\; $c_j.\mathtt{count} \gets c_j.\mathtt{count} + \Delta w$\; $c_j.\mathtt{mean} \gets c_j.\mathtt{mean} + \Delta w (x_n - c_j.\mathtt{mean})/c_j.\mathtt{count}$\; $w_n \gets w_n - \Delta w$\; @@ -133,7 +131,7 @@ $t = 0$, $N = \sum_i k_i$\; } $z = \max(-1, (x-m_i)/\Delta)$\; \If {$z < 1$} { - \Return $( \frac{t}{N} + \frac{k_i} N \frac{z+1} {2})$ + \Return $(t + \frac{k_i} N \frac{z+1} {2})$ } $t \gets t + k_i$\; } @@ -160,8 +158,7 @@ Computing an approximation of the $q$ quantile of the data points seen so far ca $t = 0$, $q \gets q \sum c_i.\mathtt{count}$\; \For {$i \in 1\ldots m$} { $k_i = c_i.\mathtt{count}$\; - $m_i = c_i.\mathtt{mean}$\; - \If {$q < t + k_i$} { + \If {$q < k_i$} { \uIf {$i = 1$} { $\Delta \gets (c_{i+1}.\mathtt{mean} - c_i.\mathtt{mean})$\; } \uElseIf {$i = m$} { @@ -298,4 +295,4 @@ The $t$-digest algorithm is available in the form of an open source, well-tested \bibliographystyle{alpha} \bibliography{refs}{} -\end{document} +\end{document} \ No newline at end of file diff --git a/docs/t-digest-paper/k-q-diagram.graffle b/docs/t-digest-paper/k-q-diagram.graffle deleted file mode 100644 index 389e98c..0000000 --- a/docs/t-digest-paper/k-q-diagram.graffle +++ /dev/null @@ -1,919 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> -<plist version="1.0"> -<dict> - <key>ActiveLayerIndex</key> - <integer>0</integer> - <key>ApplicationVersion</key> - <array> - <string>com.omnigroup.OmniGrafflePro</string> - <string>139.18.0.187838</string> - </array> - <key>AutoAdjust</key> - <false/> - <key>BackgroundGraphic</key> - <dict> - <key>Bounds</key> - <string>{{0, 0}, {288, 100.79999542236328}}</string> - <key>Class</key> - <string>SolidGraphic</string> - <key>ID</key> - <integer>2</integer> - <key>Style</key> - <dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - </dict> - <key>BaseZoom</key> - <integer>0</integer> - <key>CanvasOrigin</key> - <string>{0, 0}</string> - <key>CanvasSize</key> - <string>{288, 100.79999542236328}</string> - <key>ColumnAlign</key> - <integer>1</integer> - <key>ColumnSpacing</key> - <real>36</real> - <key>CreationDate</key> - <string>2015-02-22 20:45:24 +0000</string> - <key>Creator</key> - <string>Ted Dunning</string> - <key>DisplayScale</key> - <string>1 0/72 in = 1 0/72 in</string> - <key>GraphDocumentVersion</key> - <integer>8</integer> - <key>GraphicsList</key> - <array> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>ID</key> - <integer>42</integer> - <key>Points</key> - <array> - <string>{59.250002239288655, 44.546972851488952}</string> - <string>{59.250002239288655, 50.451896280717193}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>ID</key> - <integer>41</integer> - <key>Points</key> - <array> - <string>{247.62525268014707, 44.546972851488952}</string> - <string>{247.62525268014707, 50.451896280717193}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Bounds</key> - <string>{{52.750003814697266, 50.201896667480469}, {14, 16}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FitText</key> - <string>YES</string> - <key>Flow</key> - <string>Resize</string> - <key>ID</key> - <integer>40</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\i\fs26 \cf0 i -\i0 / -\i n}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - <key>Wrap</key> - <string>NO</string> - </dict> - <dict> - <key>Bounds</key> - <string>{{229.30534221912285, 50.226128628121799}, {50.677060582390851, 16}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FitText</key> - <string>Vertical</string> - <key>Flow</key> - <string>Resize</string> - <key>ID</key> - <integer>39</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\fs26 \cf0 ( -\i i -\i0 + 3)/ -\i n}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - </dict> - <dict> - <key>Bounds</key> - <string>{{164.98982866365355, 50.226128628121799}, {50.677060582390851, 16}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FitText</key> - <string>Vertical</string> - <key>Flow</key> - <string>Resize</string> - <key>ID</key> - <integer>38</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\fs26 \cf0 ( -\i i -\i0 + 2)/ -\i n}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - </dict> - <dict> - <key>Bounds</key> - <string>{{101.75269699096684, 50.226128628121799}, {50.677060582390851, 16}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FitText</key> - <string>Vertical</string> - <key>Flow</key> - <string>Resize</string> - <key>ID</key> - <integer>37</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\fs26 \cf0 ( -\i i -\i0 + 1)/ -\i n}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>ID</key> - <integer>29</integer> - <key>Points</key> - <array> - <string>{184.83350253319429, 44.546972851488952}</string> - <string>{184.83350253319429, 50.451896280717193}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>ID</key> - <integer>27</integer> - <key>Points</key> - <array> - <string>{122.04175238624138, 44.546972851488952}</string> - <string>{122.04175238624138, 50.451896280717193}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Bounds</key> - <string>{{3.6106719970702734, 6.7104722792607694}, {18.552361645744838, 9.7577002053388089}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>18</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\i\fs26 \cf0 k+1}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - <key>Wrap</key> - <string>NO</string> - </dict> - <dict> - <key>Bounds</key> - <string>{{17.06303638347142, 27.508726899383952}, {5.044661260808553, 9.7577002053388089}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>19</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\i\fs26 \cf0 k}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - <key>Wrap</key> - <string>NO</string> - </dict> - <dict> - <key>Bounds</key> - <string>{{212.34956182880438, 76.25}, {12.139229705714891, 13.552361396303901}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>20</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\i\fs26 \cf0 q -\fs22 \dn6 \sub 2}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - <key>Wrap</key> - <string>NO</string> - </dict> - <dict> - <key>Bounds</key> - <string>{{78.75, 76.25}, {12.139229705714891, 13.552361396303901}}</string> - <key>Class</key> - <string>ShapedGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>21</integer> - <key>Shape</key> - <string>Rectangle</string> - <key>Style</key> - <dict> - <key>fill</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>shadow</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - <key>stroke</key> - <dict> - <key>Draws</key> - <string>NO</string> - </dict> - </dict> - <key>Text</key> - <dict> - <key>Align</key> - <integer>0</integer> - <key>Pad</key> - <integer>0</integer> - <key>Text</key> - <string>{\rtf1\ansi\ansicpg1252\cocoartf1265\cocoasubrtf210 -\cocoascreenfonts1{\fonttbl\f0\froman\fcharset0 Times-Roman;} -{\colortbl;\red255\green255\blue255;} -\deftab720 -\pard\pardeftab720\sa240 - -\f0\i\fs26 \cf0 q -\fs22 \dn6 \sub 1}</string> - <key>VerticalPad</key> - <integer>0</integer> - </dict> - <key>Wrap</key> - <string>NO</string> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>22</integer> - <key>Points</key> - <array> - <string>{215.79283685438187, 73.580473729566677}</string> - <string>{215.79283685438187, 7.0836755646817817}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>23</integer> - <key>Points</key> - <array> - <string>{82.376349995506715, 73.749996365463033}</string> - <string>{82.376349995506715, 27.508726899383952}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>24</integer> - <key>Points</key> - <array> - <string>{26.787882521546127, 13.152464065708429}</string> - <string>{234.49999494830382, 13.152464065708429}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>25</integer> - <key>Points</key> - <array> - <string>{26.787882521545946, 33.523100616016436}</string> - <string>{131.91188170756305, 33.523100616016436}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>0.5</real> - </dict> - </dict> - </dict> - <dict> - <key>Class</key> - <string>LineGraphic</string> - <key>ControlPoints</key> - <array> - <string>{54.175745194548277, -20.19301848049281}</string> - <string>{-14.293175327923377, 18.566735112936335}</string> - </array> - <key>FontInfo</key> - <dict> - <key>Font</key> - <string>Times-Italic</string> - <key>Size</key> - <real>15</real> - </dict> - <key>ID</key> - <integer>26</integer> - <key>Points</key> - <array> - <string>{42.003198193206401, 43.66632443531828}</string> - <string>{234.49999494830365, 4.4999999999999964}</string> - </array> - <key>Style</key> - <dict> - <key>stroke</key> - <dict> - <key>Bezier</key> - <true/> - <key>Color</key> - <dict> - <key>a</key> - <string>0.55</string> - <key>b</key> - <string>0</string> - <key>g</key> - <string>0</string> - <key>r</key> - <string>0</string> - </dict> - <key>HeadArrow</key> - <string>0</string> - <key>Legacy</key> - <true/> - <key>LineType</key> - <integer>1</integer> - <key>TailArrow</key> - <string>0</string> - <key>Width</key> - <real>3</real> - </dict> - </dict> - </dict> - </array> - <key>GridInfo</key> - <dict/> - <key>GuidesLocked</key> - <string>NO</string> - <key>GuidesVisible</key> - <string>YES</string> - <key>HPages</key> - <integer>1</integer> - <key>ImageCounter</key> - <integer>1</integer> - <key>KeepToScale</key> - <false/> - <key>Layers</key> - <array> - <dict> - <key>Lock</key> - <string>NO</string> - <key>Name</key> - <string>Layer 1</string> - <key>Print</key> - <string>YES</string> - <key>View</key> - <string>YES</string> - </dict> - </array> - <key>LayoutInfo</key> - <dict> - <key>Animate</key> - <string>NO</string> - <key>circoMinDist</key> - <real>18</real> - <key>circoSeparation</key> - <real>0.0</real> - <key>layoutEngine</key> - <string>dot</string> - <key>neatoSeparation</key> - <real>0.0</real> - <key>twopiSeparation</key> - <real>0.0</real> - </dict> - <key>LinksVisible</key> - <string>NO</string> - <key>MagnetsVisible</key> - <string>NO</string> - <key>MasterSheets</key> - <array/> - <key>ModificationDate</key> - <string>2015-02-23 09:57:36 +0000</string> - <key>Modifier</key> - <string>Ted Dunning</string> - <key>NotesVisible</key> - <string>NO</string> - <key>Orientation</key> - <integer>2</integer> - <key>OriginVisible</key> - <string>NO</string> - <key>PageBreaks</key> - <string>YES</string> - <key>PrintInfo</key> - <dict> - <key>NSBottomMargin</key> - <array> - <string>float</string> - <string>41</string> - </array> - <key>NSHorizonalPagination</key> - <array> - <string>coded</string> - <string>BAtzdHJlYW10eXBlZIHoA4QBQISEhAhOU051bWJlcgCEhAdOU1ZhbHVlAISECE5TT2JqZWN0AIWEASqEhAFxlwCG</string> - </array> - <key>NSLeftMargin</key> - <array> - <string>float</string> - <string>18</string> - </array> - <key>NSPaperSize</key> - <array> - <string>size</string> - <string>{612, 792}</string> - </array> - <key>NSPrintReverseOrientation</key> - <array> - <string>int</string> - <string>0</string> - </array> - <key>NSRightMargin</key> - <array> - <string>float</string> - <string>18</string> - </array> - <key>NSTopMargin</key> - <array> - <string>float</string> - <string>18</string> - </array> - </dict> - <key>PrintOnePage</key> - <false/> - <key>ReadOnly</key> - <string>NO</string> - <key>RowAlign</key> - <integer>1</integer> - <key>RowSpacing</key> - <real>36</real> - <key>SheetTitle</key> - <string>Canvas 1</string> - <key>SmartAlignmentGuidesActive</key> - <string>YES</string> - <key>SmartDistanceGuidesActive</key> - <string>YES</string> - <key>UniqueID</key> - <integer>1</integer> - <key>UseEntirePage</key> - <false/> - <key>VPages</key> - <integer>1</integer> - <key>WindowInfo</key> - <dict> - <key>CurrentSheet</key> - <integer>0</integer> - <key>ExpandedCanvases</key> - <array> - <dict> - <key>name</key> - <string>Canvas 1</string> - </dict> - </array> - <key>Frame</key> - <string>{{0, 4}, {1280, 774}}</string> - <key>ListView</key> - <true/> - <key>OutlineWidth</key> - <integer>142</integer> - <key>RightSidebar</key> - <false/> - <key>ShowRuler</key> - <true/> - <key>Sidebar</key> - <true/> - <key>SidebarWidth</key> - <integer>120</integer> - <key>VisibleRegion</key> - <string>{{-142, -104}, {572.5, 309.5}}</string> - <key>Zoom</key> - <real>2</real> - <key>ZoomValues</key> - <array> - <array> - <string>Canvas 1</string> - <real>2</real> - <real>4</real> - </array> - </array> - </dict> -</dict> -</plist> diff --git a/docs/t-digest-paper/k-q-diagram.pdf b/docs/t-digest-paper/k-q-diagram.pdf deleted file mode 100644 index c938749..0000000 Binary files a/docs/t-digest-paper/k-q-diagram.pdf and /dev/null differ diff --git a/docs/t-digest-paper/sizes-array.pdf b/docs/t-digest-paper/sizes-array.pdf deleted file mode 100644 index fc36139..0000000 Binary files a/docs/t-digest-paper/sizes-array.pdf and /dev/null differ diff --git a/docs/t-digest-paper/sizes-avl.pdf b/docs/t-digest-paper/sizes-avl.pdf deleted file mode 100644 index 97682db..0000000 Binary files a/docs/t-digest-paper/sizes-avl.pdf and /dev/null differ diff --git a/docs/t-digest-paper/sizes-tree.pdf b/docs/t-digest-paper/sizes-tree.pdf deleted file mode 100644 index b0eb91b..0000000 Binary files a/docs/t-digest-paper/sizes-tree.pdf and /dev/null differ diff --git a/pom.xml b/pom.xml index 928132b..d32e7f7 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ <groupId>com.tdunning</groupId> <artifactId>t-digest</artifactId> - <version>3.1</version> + <version>3.0</version> <packaging>jar</packaging> <name>T-Digest</name> @@ -22,7 +22,7 @@ <scm> <connection>scm:git:https://github.com/tdunning/t-digest.git</connection> <developerConnection>scm:git:https://github.com/tdunning/t-digest.git</developerConnection> - <tag>t-digest-3.1</tag> + <tag>t-digest-3.0</tag> <url>https://github.com/tdunning/t-digest</url> </scm> @@ -61,7 +61,7 @@ <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> - <version>4.12</version> + <version>4.8.2</version> <scope>test</scope> </dependency> <dependency> @@ -73,7 +73,7 @@ <dependency> <groupId>com.carrotsearch</groupId> <artifactId>java-sizeof</artifactId> - <version>0.0.5</version> + <version>0.0.4</version> <scope>test</scope> </dependency> </dependencies> @@ -94,7 +94,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-surefire-plugin</artifactId> - <version>2.18.1</version> + <version>2.16</version> <configuration> <parallel>methods</parallel> <perCoreThreadCount>true</perCoreThreadCount> @@ -104,7 +104,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> - <version>3.3</version> + <version>3.0</version> <configuration> <verbose>true</verbose> <compilerVersion>1.7</compilerVersion> @@ -115,7 +115,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-source-plugin</artifactId> - <version>2.4</version> + <version>2.2.1</version> <executions> <execution> <id>attach-sources</id> @@ -128,7 +128,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-javadoc-plugin</artifactId> - <version>2.10.2</version> + <version>2.9.1</version> <executions> <execution> @@ -142,7 +142,7 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-release-plugin</artifactId> - <version>2.5.1</version> + <version>2.4.2</version> <configuration> <arguments>-Dgpg.keyname=${gpg.keyname}</arguments> <arguments>-Dgpg.passphrase=${gpg.passphrase}</arguments> diff --git a/src/main/java/com/tdunning/math/stats/AVLGroupTree.java b/src/main/java/com/tdunning/math/stats/AVLGroupTree.java index 864746c..c5da2d4 100644 --- a/src/main/java/com/tdunning/math/stats/AVLGroupTree.java +++ b/src/main/java/com/tdunning/math/stats/AVLGroupTree.java @@ -17,7 +17,6 @@ package com.tdunning.math.stats; -import java.util.AbstractCollection; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; @@ -26,7 +25,7 @@ import java.util.List; /** * A tree of t-digest centroids. */ -final class AVLGroupTree extends AbstractCollection<Centroid> { +final class AVLGroupTree implements Iterable<Centroid> { /* For insertions into the tree */ private double centroid; @@ -154,10 +153,8 @@ final class AVLGroupTree extends AbstractCollection<Centroid> { tree.add(); } - @Override - public boolean add(Centroid centroid) { + public void add(Centroid centroid) { add(centroid.mean(), centroid.count(), centroid.data()); - return true; } /** diff --git a/src/main/java/com/tdunning/math/stats/AVLTreeDigest.java b/src/main/java/com/tdunning/math/stats/AVLTreeDigest.java index b6d00c1..6720864 100644 --- a/src/main/java/com/tdunning/math/stats/AVLTreeDigest.java +++ b/src/main/java/com/tdunning/math/stats/AVLTreeDigest.java @@ -18,8 +18,6 @@ package com.tdunning.math.stats; import java.nio.ByteBuffer; -import java.util.Collection; -import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -126,8 +124,8 @@ public class AVLTreeDigest extends AbstractTDigest { d.addAll(data); } } - centroid = weightedAverage(centroid, count, x, w); count += w; + centroid += w * (x - centroid) / count; summary.update(closest, centroid, count, d); } count += w; @@ -151,7 +149,7 @@ public class AVLTreeDigest extends AbstractTDigest { final int[] nodes = new int[centroids.size()]; nodes[0] = centroids.first(); for (int i = 1; i < nodes.length; ++i) { - nodes[i] = centroids.next(nodes[i - 1]); + nodes[i] = centroids.next(nodes[i-1]); assert nodes[i] != IntAVLTree.NIL; } assert centroids.next(nodes[nodes.length - 1]) == IntAVLTree.NIL; @@ -168,6 +166,11 @@ public class AVLTreeDigest extends AbstractTDigest { } } + @Override + public void compress(GroupTree other) { + throw new UnsupportedOperationException(); + } + /** * Returns the number of samples represented in this histogram. If you want to know how many * centroids are being used, try centroids().size(). @@ -207,20 +210,20 @@ public class AVLTreeDigest extends AbstractTDigest { // scan to next to last element while (it.hasNext()) { if (x < a.mean() + right) { - double value = (r + a.count() * interpolate(x, a.mean() - left, a.mean() + right)) / count; - return value > 0.0 ? value : 0.0; + return (r + a.count() * interpolate(x, a.mean() - left, a.mean() + right)) / count; } - r += a.count(); a = b; - left = right; - b = it.next(); + + left = right; right = (b.mean() - a.mean()) / 2; } // for the last element, assume right width is same as left + left = right; + a = b; if (x < a.mean() + right) { return (r + a.count() * interpolate(x, a.mean() - left, a.mean() + right)) / count; } else { @@ -275,14 +278,14 @@ public class AVLTreeDigest extends AbstractTDigest { } // common case: we found two centroids previous and next so that the desired quantile is // after 'previous' but before 'next' - return quantile(index, previousIndex, nextIndex, previousMean, values.mean(next)); + return quantile(previousIndex, index, nextIndex, previousMean, values.mean(next)); } else if (values.next(next) == IntAVLTree.NIL) { // special case 2: the index we are interested in is beyond the last centroid // again, assume values grow linearly between index previousIndex and (count - 1) // which is the highest possible index final double nextIndex2 = count - 1; final double nextMean2 = (values.mean(next) * (nextIndex2 - previousIndex) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex); - return quantile(index, nextIndex, nextIndex2, values.mean(next), nextMean2); + return quantile(nextIndex, index, nextIndex2, values.mean(next), nextMean2); } total += values.count(next); previousMean = values.mean(next); @@ -292,8 +295,13 @@ public class AVLTreeDigest extends AbstractTDigest { } @Override - public Collection<Centroid> centroids() { - return Collections.unmodifiableCollection(summary); + public int centroidCount() { + return summary.size(); + } + + @Override + public Iterable<? extends Centroid> centroids() { + return summary; } @Override diff --git a/src/main/java/com/tdunning/math/stats/AbstractTDigest.java b/src/main/java/com/tdunning/math/stats/AbstractTDigest.java index ffb714c..8d3a5e2 100644 --- a/src/main/java/com/tdunning/math/stats/AbstractTDigest.java +++ b/src/main/java/com/tdunning/math/stats/AbstractTDigest.java @@ -27,32 +27,6 @@ public abstract class AbstractTDigest extends TDigest { protected Random gen = new Random(); protected boolean recordAllData = false; - /** - * Same as {@link #weightedAverageSorted(double, int, double, int)} but flips - * the order of the variables if <code>x2</code> is greater than - * <code>x1</code>. - */ - public static double weightedAverage(double x1, int w1, double x2, int w2) { - if (x1 <= x2) { - return weightedAverageSorted(x1, w1, x2, w2); - } else { - return weightedAverageSorted(x2, w2, x1, w1); - } - } - - /** - * Compute the weighted average between <code>x1</code> with a weight of - * <code>w1</code> and <code>x2</code> with a weight of <code>w2</code>. - * This expects <code>x1</code> to be less than or equal to <code>x2</code> - * and is guaranteed to return a number between <code>x1</code> and - * <code>x2</code>. - */ - public static double weightedAverageSorted(double x1, int w1, double x2, int w2) { - assert x1 <= x2; - final double x = (x1 * w1 + x2 * w2) / (w1 + w2); - return Math.max(x1, Math.min(x, x2)); - } - public static double interpolate(double x, double x0, double x1) { return (x - x0) / (x1 - x0); } @@ -111,19 +85,9 @@ public abstract class AbstractTDigest extends TDigest { return r; } - /** - * Computes an interpolated value of a quantile that is between two centroids. - * - * Index is the quantile desired multiplied by the total number of samples - 1. - * - * @param index Denormalized quantile desired - * @param previousIndex The denormalized quantile corresponding to the center of the previous centroid. - * @param nextIndex The denormalized quantile corresponding to the center of the following centroid. - * @param previousMean The mean of the previous centroid. - * @param nextMean The mean of the following centroid. - * @return The interpolated mean. - */ - static double quantile(double index, double previousIndex, double nextIndex, double previousMean, double nextMean) { + public abstract void compress(GroupTree other); + + static double quantile(double previousIndex, double index, double nextIndex, double previousMean, double nextMean) { final double delta = nextIndex - previousIndex; final double previousWeight = (nextIndex - index) / delta; final double nextWeight = (index - previousIndex) / delta; diff --git a/src/main/java/com/tdunning/math/stats/ArrayDigest.java b/src/main/java/com/tdunning/math/stats/ArrayDigest.java index 9a6339a..ce425ae 100644 --- a/src/main/java/com/tdunning/math/stats/ArrayDigest.java +++ b/src/main/java/com/tdunning/math/stats/ArrayDigest.java @@ -18,9 +18,7 @@ package com.tdunning.math.stats; import java.nio.ByteBuffer; -import java.util.AbstractCollection; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -107,14 +105,13 @@ public class ArrayDigest extends AbstractTDigest { if (n == 1) { // if the nearest point was unique, centroid ordering cannot change Page p = data.get(closest.page); - p.centroids[closest.subPage] = weightedAverage(p.centroids[closest.subPage], p.counts[closest.subPage], x, w); p.counts[closest.subPage] += w; p.totalCount += w; + p.centroids[closest.subPage] += (x - p.centroids[closest.subPage]) / p.counts[closest.subPage]; if (p.history != null && p.history.get(closest.subPage) != null) { p.history.get(closest.subPage).add(x); } totalWeight += w; - assert p.sorted(); } else { // if the nearest point was not unique, then we may not be modifying the first copy // which means that ordering can change @@ -175,6 +172,25 @@ public class ArrayDigest extends AbstractTDigest { return r; } + /** + * Returns the number of centroids strictly before the limit. + */ + private int headCount(Index limit) { + int r = 0; + + for (int i = 0; i < limit.page; i++) { + r += data.get(i).active; + } + + if (limit.page < data.size()) { + for (int j = 0; j < limit.subPage; j++) { + r++; + } + } + + return r; + } + public double mean(Index index) { return data.get(index.page).centroids[index.subPage]; } @@ -205,6 +221,11 @@ public class ArrayDigest extends AbstractTDigest { } @Override + public void compress(GroupTree other) { + throw new UnsupportedOperationException("Default operation"); + } + + @Override public long size() { return totalWeight; } @@ -260,9 +281,9 @@ public class ArrayDigest extends AbstractTDigest { throw new IllegalArgumentException("q should be in [0,1], got " + q); } - if (centroidCount == 0) { + if (centroidCount() == 0) { return Double.NaN; - } else if (centroidCount == 1) { + } else if (centroidCount() == 1) { return data.get(0).centroids[0]; } @@ -307,14 +328,14 @@ public class ArrayDigest extends AbstractTDigest { } // common case: we found two centroids previous and next so that the desired quantile is // after 'previous' but before 'next' - return quantile(index, previousIndex, nextIndex, previousMean, next.mean()); + return quantile(previousIndex, index, nextIndex, previousMean, next.mean()); } else if (!it.hasNext()) { // special case 2: the index we are interested in is beyond the last centroid // again, assume values grow linearly between index previousIndex and (count - 1) // which is the highest possible index final double nextIndex2 = size() - 1; final double nextMean2 = (next.mean() * (nextIndex2 - previousIndex) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex); - return quantile(index, nextIndex, nextIndex2, next.mean(), nextMean2); + return quantile(nextIndex, index, nextIndex2, next.mean(), nextMean2); } total += next.count(); previousMean = next.mean(); @@ -323,44 +344,26 @@ public class ArrayDigest extends AbstractTDigest { } @Override - public Collection<Centroid> centroids() { - return new AbstractCollection<Centroid>() { - - @Override - public Iterator<Centroid> iterator() { - final Iterator<Index> ix = ArrayDigest.this.iterator(0, 0); - return new Iterator<Centroid>() { - - @Override - public boolean hasNext() { - return ix.hasNext(); - } - - @Override - public Centroid next() { - final Index index = ix.next(); - final Page current = data.get(index.page); - Centroid centroid = new Centroid(current.centroids[index.subPage], current.counts[index.subPage]); - if (current.history != null) { - for (double x : current.history.get(index.subPage)) { - centroid.insertData(x); - } - } - return centroid; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } + public int centroidCount() { + return centroidCount; + } - @Override - public int size() { - return centroidCount; + @Override + public Iterable<? extends Centroid> centroids() { + List<Centroid> r = new ArrayList<Centroid>(); + Iterator<Index> ix = iterator(0, 0); + while (ix.hasNext()) { + Index index = ix.next(); + Page current = data.get(index.page); + Centroid centroid = new Centroid(current.centroids[index.subPage], current.counts[index.subPage]); + if (current.history != null) { + for (double x : current.history.get(index.subPage)) { + centroid.insertData(x); + } } - }; + r.add(centroid); + } + return r; } public Iterator<Index> allAfter(double x) { @@ -784,15 +787,6 @@ public class ArrayDigest extends AbstractTDigest { history = this.recordAllData ? new ArrayList<List<Double>>() : null; } - boolean sorted() { - for (int i = 1; i < active; ++i) { - if (centroids[i] < centroids[i - 1]) { - return false; - } - } - return true; - } - public Page add(double x, int w, List<Double> history) { for (int i = 0; i < active; i++) { if (centroids[i] >= x) { @@ -805,12 +799,9 @@ public class ArrayDigest extends AbstractTDigest { } else { newPage.addAt(i - pageSize / 2, x, w, history); } - assert sorted(); - assert newPage.sorted(); return newPage; } else { addAt(i, x, w, history); - assert sorted(); return null; } } @@ -821,12 +812,9 @@ public class ArrayDigest extends AbstractTDigest { // split page Page newPage = split(); newPage.addAt(newPage.active, x, w, history); - assert sorted(); - assert newPage.sorted(); return newPage; } else { addAt(active, x, w, history); - assert sorted(); return null; } } @@ -842,7 +830,6 @@ public class ArrayDigest extends AbstractTDigest { centroids[i] = x; counts[i] = w; } else { - assert i == active; centroids[active] = x; counts[active] = w; if (this.history != null) { @@ -891,7 +878,6 @@ public class ArrayDigest extends AbstractTDigest { } active--; totalCount -= w; - assert sorted(); } } } diff --git a/src/main/java/com/tdunning/math/stats/Centroid.java b/src/main/java/com/tdunning/math/stats/Centroid.java index cf94e83..2c4d720 100644 --- a/src/main/java/com/tdunning/math/stats/Centroid.java +++ b/src/main/java/com/tdunning/math/stats/Centroid.java @@ -34,7 +34,7 @@ public class Centroid implements Comparable<Centroid> { private List<Double> actualData = null; Centroid(boolean record) { - id = uniqueCount.getAndIncrement(); + id = uniqueCount.incrementAndGet(); if (record) { actualData = new ArrayList<Double>(); } @@ -60,11 +60,6 @@ public class Centroid implements Comparable<Centroid> { start(x, 1, id); } - Centroid(double x, int w, List<Double> data) { - this(x, w); - actualData = data; - } - private void start(double x, int w, int id) { this.id = id; add(x, w); @@ -139,7 +134,7 @@ public class Centroid implements Comparable<Centroid> { actualData.add(x); } } - centroid = AbstractTDigest.weightedAverage(centroid, count, x, w); count += w; + centroid += w * (x - centroid) / count; } } diff --git a/src/main/java/com/tdunning/math/stats/GroupTree.java b/src/main/java/com/tdunning/math/stats/GroupTree.java index 031e9c0..fe107ff 100644 --- a/src/main/java/com/tdunning/math/stats/GroupTree.java +++ b/src/main/java/com/tdunning/math/stats/GroupTree.java @@ -17,7 +17,6 @@ package com.tdunning.math.stats; -import java.util.AbstractCollection; import java.util.ArrayDeque; import java.util.Deque; import java.util.Iterator; @@ -27,7 +26,7 @@ import java.util.NoSuchElementException; * A tree containing TDigest.Centroid. This adds to the normal NavigableSet the * ability to sum up the size of elements to the left of a particular group. */ -public class GroupTree extends AbstractCollection<Centroid> { +public class GroupTree implements Iterable<Centroid> { private long count; private int size; private int depth; @@ -56,13 +55,13 @@ public class GroupTree extends AbstractCollection<Centroid> { leaf = this.right.first(); } - public boolean add(Centroid centroid) { + public void add(Centroid centroid) { if (size == 0) { leaf = centroid; depth = 1; count = centroid.count(); size = 1; - return true; + return; } else if (size == 1) { int order = centroid.compareTo(leaf); if (order < 0) { @@ -83,7 +82,6 @@ public class GroupTree extends AbstractCollection<Centroid> { depth = Math.max(left.depth, right.depth) + 1; rebalance(); - return true; } /** @@ -227,7 +225,7 @@ public class GroupTree extends AbstractCollection<Centroid> { if (next == null) { next = computeNext(); } - return next != null && next != end; + return next != end; } @Override diff --git a/src/main/java/com/tdunning/math/stats/IntAVLTree.java b/src/main/java/com/tdunning/math/stats/IntAVLTree.java index 3e39015..2fa28fc 100644 --- a/src/main/java/com/tdunning/math/stats/IntAVLTree.java +++ b/src/main/java/com/tdunning/math/stats/IntAVLTree.java @@ -196,7 +196,7 @@ abstract class IntAVLTree { return true; } else { int node = root; assert parent(root) == NIL; - int parent; + int parent = NIL; int cmp; do { cmp = compare(node); diff --git a/src/main/java/com/tdunning/math/stats/MergingDigest.java b/src/main/java/com/tdunning/math/stats/MergingDigest.java deleted file mode 100644 index 65204c0..0000000 --- a/src/main/java/com/tdunning/math/stats/MergingDigest.java +++ /dev/null @@ -1,640 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.tdunning.math.stats; - -import java.nio.ByteBuffer; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.List; - -/** - * Maintains a t-digest by collecting new points in a buffer that is then sorted occasionally and merged - * into a sorted array that contains previously computed centroids. - * <p/> - * This can be very fast because the cost of sorting and merging is amortized over several insertion. If - * we keep N centroids total and have the input array is k long, then the amortized cost is something like - * <p/> - * N/k + log k - * <p/> - * These costs even out when N/k = log k. Balancing costs is often a good place to start in optimizing an - * algorithm. For different values of compression factor, the following table shows estimated asymptotic - * values of N and suggested values of k: - * <table> - * <thead> - * <tr><td>Compression</td><td>N</td><td>k</td></tr> - * </thead> - * <tbody> - * <tr><td>50</td><td>78</td><td>25</td></tr> - * <tr><td>100</td><td>157</td><td>42</td></tr> - * <tr><td>200</td><td>314</td><td>73</td></tr> - * </tbody> - * </table> - * <p/> - * The virtues of this kind of t-digest implementation include: - * <ul> - * <li>No allocation is required after initialization</li> - * <li>The data structure automatically compresses existing centroids when possible</li> - * <li>No Java object overhead is incurred for centroids since data is kept in primitive arrays</li> - * </ul> - * <p/> - * The current implementation takes the liberty of using ping-pong buffers for implementing the merge resulting - * in a substantial memory penalty, but the complexity of an in place merge was not considered as worthwhile - * since even with the overhead, the memory cost is less than 40 bytes per centroid which is much less than half - * what the AVLTreeDigest uses. Speed tests are still not complete so it is uncertain whether the merge - * strategy is faster than the tree strategy. - */ -public class MergingDigest extends AbstractTDigest { - private final double compression; - - // points to the centroid that is currently being merged - // if weight[lastUsedCell] == 0, then this is the number of centroids - // else the number is lastUsedCell+1 - private int lastUsedCell; - - // sum_i weight[i] See also unmergedWeight - private double totalWeight = 0; - - // number of points that have been added to each merged centroid - private double[] weight; - // mean of points added to each merged centroid - private double[] mean; - // absolute min and max samples seen - private double min, max; - - // history of all data added to centroids (for testing purposes) - private List<List<Double>> data = null; - - // buffers for merging - private double[] mergeWeight; - private double[] mergeMean; - private List<List<Double>> mergeData = null; - - // sum_i tempWeight[i] - private double unmergedWeight = 0; - - // this is the index of the next temporary centroid - // this is a more Java-like convention than lastUsedCell uses - private int tempUsed = 0; - private double[] tempWeight; - private double[] tempMean; - private List<List<Double>> tempData = null; - - - // array used for sorting the temp centroids. This is a field - // to avoid allocations during operation - private int[] order; - - /** - * Allocates a buffer merging t-digest. This is the normally used constructor that - * allocates default sized internal arrays. Other versions are available, but should - * only be used for special cases. - * - * @param compression The compression factor - */ - public MergingDigest(double compression) { - // magic formula created by regressing against known sizes for sample compression values - this(compression, estimateBufferSize(compression)); - } - - private static int estimateBufferSize(double compression) { - if (compression < 20) { - compression = 20; - } - if (compression > 1000) { - compression = 1000; - } - return (int) (7.5 + 0.37 * compression - 2e-4 * compression * compression); - } - - /** - * If you know the size of the temporary buffer for incoming points, you can use this entry point. - * - * @param compression Compression factor for t-digest. Same as 1/\delta in the paper. - * @param bufferSize How many samples to retain before merging. - */ - public MergingDigest(double compression, int bufferSize) { - // should only need ceiling(compression * PI / 2). Double the allocation for now for safety - this(compression, bufferSize, (int) (Math.PI * compression + 0.5)); - } - - /** - * Fully specified constructor. Normally only used for deserializing a buffer t-digest. - * - * @param compression Compression factor - * @param bufferSize Number of temporary centroids - * @param size Size of main buffer - */ - public MergingDigest(double compression, int bufferSize, int size) { - this.compression = compression; - - weight = new double[size]; - mean = new double[size]; - min = Double.MAX_VALUE; - max = -Double.MAX_VALUE; - - mergeWeight = new double[size]; - mergeMean = new double[size]; - - tempWeight = new double[bufferSize]; - tempMean = new double[bufferSize]; - order = new int[bufferSize]; - - lastUsedCell = 0; - } - - /** - * Turns on internal data recording. - */ - @Override - public TDigest recordAllData() { - super.recordAllData(); - data = new ArrayList<List<Double>>(); - mergeData = new ArrayList<List<Double>>(); - return this; - } - - @Override - void add(double x, int w, Centroid base) { - add(x, w, base.data()); - } - - @Override - public void add(double x, int w) { - add(x, w, (List<Double>) null); - } - - public void add(double x, int w, List<Double> history) { - if (Double.isNaN(x)) { - throw new IllegalArgumentException("Cannot add NaN to t-digest"); - } - if (tempUsed >= tempWeight.length) { - mergeNewValues(); - } - int where = tempUsed++; - tempWeight[where] = w; - tempMean[where] = x; - unmergedWeight += w; - - if (data != null) { - if (tempData == null) { - tempData = new ArrayList<List<Double>>(); - } - while (tempData.size() <= where) { - tempData.add(new ArrayList<Double>()); - } - if (history == null) { - history = Collections.singletonList(x); - } - tempData.get(where).addAll(history); - } - } - - private void mergeNewValues() { - if (unmergedWeight > 0) { - Sort.sort(order, tempMean, tempUsed); - - double wSoFar = 0; - double k1 = 0; - int i = 0; - int j = 0; - int n = 0; - if (totalWeight > 0) { - if (weight[lastUsedCell] > 0) { - n = lastUsedCell + 1; - } else { - n = lastUsedCell; - } - } - lastUsedCell = 0; - totalWeight += unmergedWeight; - unmergedWeight = 0; - - // merge tempWeight,tempMean and weight,mean into mergeWeight,mergeMean - while (i < tempUsed && j < n) { - int ix = order[i]; - if (tempMean[ix] <= mean[j]) { - wSoFar += tempWeight[ix]; - k1 = mergeCentroid(wSoFar, k1, tempWeight[ix], tempMean[ix], tempData != null ? tempData.get(ix) : null); - i++; - } else { - wSoFar += weight[j]; - k1 = mergeCentroid(wSoFar, k1, weight[j], mean[j], data != null ? data.get(j) : null); - j++; - } - } - - while (i < tempUsed) { - int ix = order[i]; - wSoFar += tempWeight[ix]; - k1 = mergeCentroid(wSoFar, k1, tempWeight[ix], tempMean[ix], tempData != null ? tempData.get(ix) : null); - i++; - } - - while (j < n) { - wSoFar += weight[j]; - k1 = mergeCentroid(wSoFar, k1, weight[j], mean[j], data != null ? data.get(j) : null); - j++; - } - tempUsed = 0; - - // swap pointers for working space and merge space - double[] z = weight; - weight = mergeWeight; - mergeWeight = z; - Arrays.fill(mergeWeight, 0); - - z = mean; - mean = mergeMean; - mergeMean = z; - - if (totalWeight > 0) { - min = Math.min(min, mean[0]); - if (weight[lastUsedCell] > 0) { - max = Math.max(max, mean[lastUsedCell]); - } else { - max = Math.max(max, mean[lastUsedCell - 1]); - } - } - - if (data != null) { - data = mergeData; - mergeData = new ArrayList<List<Double>>(); - tempData = new ArrayList<List<Double>>(); - } - } - } - - private double mergeCentroid(double wSoFar, double k1, double w, double m, List<Double> newData) { - double k2 = integratedLocation(wSoFar / totalWeight); - if (k2 - k1 <= 1 || mergeWeight[lastUsedCell] == 0) { - // merge into existing centroid - mergeWeight[lastUsedCell] += w; - mergeMean[lastUsedCell] = mergeMean[lastUsedCell] + (m - mergeMean[lastUsedCell]) * w / mergeWeight[lastUsedCell]; - } else { - // create new centroid - lastUsedCell++; - mergeMean[lastUsedCell] = m; - mergeWeight[lastUsedCell] = w; - - k1 = integratedLocation((wSoFar - w) / totalWeight); - } - if (mergeData != null) { - while (mergeData.size() <= lastUsedCell) { - mergeData.add(new ArrayList<Double>()); - } - mergeData.get(lastUsedCell).addAll(newData); - } - - return k1; - } - - /** - * Exposed for testing. - */ - int checkWeights() { - return checkWeights(weight, totalWeight, lastUsedCell); - } - - private int checkWeights(double[] w, double total, int last) { - int badCount = 0; - - int n = last; - if (w[n] > 0) { - n++; - } - - double k1 = 0; - double q = 0; - for (int i = 0; i < n; i++) { - double dq = w[i] / total; - double k2 = integratedLocation(q + dq); - if (k2 - k1 > 1 && w[i] != 1) { - System.out.printf("Oversize centroid at %d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f\n", i, k1, k2, k2 - k1, w[i], q); - badCount++; - } - if (k2 - k1 > 1.5 && w[i] != 1) { - throw new IllegalStateException(String.format("Egregiously oversized centroid at %d, k0=%.2f, k1=%.2f, dk=%.2f, w=%.2f, q=%.4f\n", i, k1, k2, k2 - k1, w[i], q)); - } - q += dq; - k1 = k2; - } - - return badCount; - } - - /** - * Converts a quantile into a centroid scale value. The centroid scale is nominally - * the number k of the centroid that a quantile point q should belong to. Due to - * round-offs, however, we can't align things perfectly without splitting points - * and centroids. We don't want to do that, so we have to allow for offsets. - * In the end, the criterion is that any quantile range that spans a centroid - * scale range more than one should be split across more than one centroid if - * possible. This won't be possible if the quantile range refers to a single point - * or an already existing centroid. - * <p/> - * This mapping is steep near q=0 or q=1 so each centroid there will correspond to - * less q range. Near q=0.5, the mapping is flatter so that centroids there will - * represent a larger chunk of quantiles. - * - * @param q The quantile scale value to be mapped. - * @return The centroid scale value corresponding to q. - */ - private double integratedLocation(double q) { - return compression * (Math.asin(2 * q - 1) + Math.PI / 2) / Math.PI; - } - - @Override - public void compress() { - mergeNewValues(); - } - - @Override - public long size() { - return (long) (totalWeight + unmergedWeight); - } - - @Override - public double cdf(double x) { - mergeNewValues(); - - if (lastUsedCell == 0) { - if (weight[lastUsedCell] == 0) { - // no data to examine - return Double.NaN; - } else { - // exactly one centroid, probably have max==min - if (x < min) { - return 0; - } else if (x > max) { - return 1; - } else if (max - min < Double.MIN_NORMAL) { - // x lands right on our only sample - return 0.5; - } else { - // interpolate - return (x - min) / (max - min); - } - } - } else { - int n = lastUsedCell; - if (weight[n] > 0) { - n++; - } - - if (x < min) { - return 0; - } - - if (x >= max) { - return 1; - } - - // we now know that there are at least two centroids - double r = 0; - // we scan a across the centroids - double a = min; - double aCount; - - double b = min; - double bCount = 0; - - double left; - double right = 0; - - // to find enclosing pair of centroids (counting min as a virtual centroid - for (int it = 0; it < n; it++) { - left = b - (a + right); - a = b; - aCount = bCount; - - b = mean[it]; - bCount = weight[it]; - right = (b - a) * aCount / (aCount + bCount); - - // we know that x >= a-left - if (x < a + right) { - double value = (r + aCount * interpolate(x, a - left, a + right)) / totalWeight; - return value > 0.0 ? value : 0.0; - } - - r += aCount; - } - - left = b - (a + right); - a = b; - aCount = bCount; - right = max - a; - - // for the last element, use max to determine right - if (x < a + right) { - return (r + aCount * interpolate(x, a - left, a + right)) / totalWeight; - } else { - return 1; - } - } - } - - @Override - public double quantile(double q) { - if (q < 0 || q > 1) { - throw new IllegalArgumentException("q should be in [0,1], got " + q); - } - mergeNewValues(); - - if (lastUsedCell == 0 && weight[lastUsedCell] == 0) { - return Double.NaN; - } else if (lastUsedCell == 0) { - return mean[0]; - } - - // we know that there are at least two centroids now - - int n = lastUsedCell; - if (weight[n] > 0) { - n++; - } - - // if values were stored in a sorted array, index would be the offset we are interested in - final double index = q * totalWeight; - - double left; - double a; - double aCount; - - double right = min; - double b = mean[0]; - double bCount = weight[0]; - - double weightSoFar = 0; - for (int it = 1; it < n; it++) { - a = b; - aCount = bCount; - left = right; - - b = mean[it]; - bCount = weight[it]; - right = (bCount * a + aCount * b) / (aCount + bCount); - - if (index < weightSoFar + aCount) { - // belongs to left side of a - double p = (index - weightSoFar) / aCount; - return left * (1 - p) + right * p; - } - weightSoFar += aCount; - } - - left = right; - aCount = bCount; - right = max; - - if (index < weightSoFar + aCount) { - // belongs to left side of a - double p = (index - weightSoFar) / aCount; - return left * (1 - p) + right * p; - } else { - return max; - } - } - - @Override - public Collection<Centroid> centroids() { - // we don't actually keep centroid structures around so we have to fake it - compress(); - List<Centroid> r = new ArrayList<Centroid>(); - for (int i = 0; i <= lastUsedCell; i++) { - r.add(new Centroid(mean[i], (int) weight[i], data != null ? data.get(i) : null)); - } - return r; - } - - @Override - public double compression() { - return compression; - } - - @Override - public int byteSize() { - compress(); - // format code, compression(float), buffer-size(int), temp-size(int), #centroids-1(int), - // then two doubles per centroid - return (lastUsedCell + 1) * 16 + 40; - } - - @Override - public int smallByteSize() { - compress(); - // format code(int), compression(float), buffer-size(short), temp-size(short), #centroids-1(short), - // then two floats per centroid - return lastUsedCell * 8 + 38; - } - - /** - * Over-ride the min and max values for testing purposes - */ - void setMinMax(double min, double max) { - this.min = min; - this.max = max; - } - - public enum Encoding { - VERBOSE_ENCODING(1), SMALL_ENCODING(2); - - private final int code; - - Encoding(int code) { - this.code = code; - } - } - - @Override - public void asBytes(ByteBuffer buf) { - compress(); - buf.putInt(Encoding.VERBOSE_ENCODING.code); - buf.putDouble(min); - buf.putDouble(max); - buf.putFloat((float) compression); - buf.putFloat((float) compression); - buf.putInt(mean.length); - buf.putInt(tempMean.length); - buf.putInt(lastUsedCell); - for (int i = 0; i <= lastUsedCell; i++) { - buf.putDouble(weight[i]); - buf.putDouble(mean[i]); - } - } - - @Override - public void asSmallBytes(ByteBuffer buf) { - compress(); - buf.putInt(Encoding.SMALL_ENCODING.code); - buf.putDouble(min); - buf.putDouble(max); - buf.putFloat((float) compression); - buf.putShort((short) mean.length); - buf.putShort((short) tempMean.length); - buf.putShort((short) lastUsedCell); - for (int i = 0; i <= lastUsedCell; i++) { - buf.putFloat((float) weight[i]); - buf.putFloat((float) mean[i]); - } - } - - public static MergingDigest fromBytes(ByteBuffer buf) { - int encoding = buf.getInt(); - if (encoding == Encoding.VERBOSE_ENCODING.code) { - double min = buf.getDouble(); - double max = buf.getDouble(); - double compression = buf.getFloat(); - int n = buf.getInt(); - int bufferSize = buf.getInt(); - MergingDigest r = new MergingDigest(compression, bufferSize, n); - r.min = min; - r.max = max; - r.lastUsedCell = buf.getInt(); - for (int i = 0; i <= r.lastUsedCell; i++) { - r.weight[i] = buf.getDouble(); - r.mean[i] = buf.getDouble(); - - r.totalWeight += r.weight[i]; - } - return r; - } else if (encoding == Encoding.SMALL_ENCODING.code) { - double min = buf.getDouble(); - double max = buf.getDouble(); - double compression = buf.getFloat(); - int n = buf.getShort(); - int bufferSize = buf.getShort(); - MergingDigest r = new MergingDigest(compression, bufferSize, n); - r.min = min; - r.max = max; - r.lastUsedCell = buf.getShort(); - for (int i = 0; i <= r.lastUsedCell; i++) { - r.weight[i] = buf.getFloat(); - r.mean[i] = buf.getFloat(); - - r.totalWeight += r.weight[i]; - } - return r; - } else { - throw new IllegalStateException("Invalid format for serialized histogram"); - } - - } -} diff --git a/src/main/java/com/tdunning/math/stats/Sort.java b/src/main/java/com/tdunning/math/stats/Sort.java deleted file mode 100644 index 7f26c66..0000000 --- a/src/main/java/com/tdunning/math/stats/Sort.java +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.tdunning.math.stats; - -/** - * Static sorting methods - */ -public class Sort { - /** - * Quick sort using an index array. On return, the - * values[order[i]] is in order as i goes 0..values.length - * - * @param order Indexes into values - * @param values The values to sort. - */ - public static void sort(int[] order, double[] values) { - sort(order, values, values.length); - } - - /** - * Quick sort using an index array. On return, the - * values[order[i]] is in order as i goes 0..values.length - * - * @param order Indexes into values - * @param values The values to sort. - * @param n The number of values to sort - */ - public static void sort(int[] order, double[] values, int n) { - for (int i = 0; i < n; i++) { - order[i] = i; - } - quickSort(order, values, 0, n, 8); - insertionSort(order, values, n, 8); - } - - /** - * Standard quick sort except that sorting is done on an index array rather than the values themselves - * - * @param order The pre-allocated index array - * @param values The values to sort - * @param start The beginning of the values to sort - * @param end The value after the last value to sort - * @param limit The minimum size to recurse down to. - */ - private static void quickSort(int[] order, double[] values, int start, int end, int limit) { - // the while loop implements tail-recursion to avoid excessive stack calls on nasty cases - while (end - start > limit) { - - // median of three values for the pivot - int a = start; - int b = (start + end) / 2; - int c = end - 1; - - int pivotIndex; - double pivotValue; - double va = values[order[a]]; - double vb = values[order[b]]; - double vc = values[order[c]]; - if (va > vb) { - if (vc > va) { - // vc > va > vb - pivotIndex = a; - pivotValue = va; - } else { - // va > vb, va >= vc - if (vc < vb) { - // va > vb > vc - pivotIndex = b; - pivotValue = vb; - } else { - // va >= vc >= vb - pivotIndex = c; - pivotValue = vc; - } - } - } else { - // vb >= va - if (vc > vb) { - // vc > vb >= va - pivotIndex = b; - pivotValue = vb; - } else { - // vb >= va, vb >= vc - if (vc < va) { - // vb >= va > vc - pivotIndex = a; - pivotValue = va; - } else { - // vb >= vc >= va - pivotIndex = c; - pivotValue = vc; - } - } - } - - // move pivot to beginning of array - swap(order, start, pivotIndex); - - // we use a three way partition because many duplicate values is an important case - - int low = start + 1; // low points to first value not known to be equal to pivotValue - int high = end; // high points to first value > pivotValue - int i = low; // i scans the array - while (i < high) { - // invariant: values[order[k]] == pivotValue for k in [0..low) - // invariant: values[order[k]] < pivotValue for k in [low..i) - // invariant: values[order[k]] > pivotValue for k in [high..end) - // in-loop: i < high - // in-loop: low < high - // in-loop: i >= low - double vi = values[order[i]]; - if (vi == pivotValue) { - if (low != i) { - swap(order, low, i); - } else { - i++; - } - low++; - } else if (vi > pivotValue) { - high--; - swap(order, i, high); - } else { - // vi < pivotValue - i++; - } - } - // invariant: values[order[k]] == pivotValue for k in [0..low) - // invariant: values[order[k]] < pivotValue for k in [low..i) - // invariant: values[order[k]] > pivotValue for k in [high..end) - // assert i == high || low == high therefore, we are done with partition - - // at this point, i==high, from [start,low) are == pivot, [low,high) are < and [high,end) are > - // we have to move the values equal to the pivot into the middle. To do this, we swap pivot - // values into the top end of the [low,high) range stopping when we run out of destinations - // or when we run out of values to copy - int from = start; - int to = high - 1; - for (i = 0; from < low && to >= low; i++) { - swap(order, from++, to--); - } - if (from == low) { - // ran out of things to copy. This means that the the last destination is the boundary - low = to + 1; - } else { - // ran out of places to copy to. This means that there are uncopied pivots and the - // boundary is at the beginning of those - low = from; - } - -// checkPartition(order, values, pivotValue, start, low, high, end); - - // now recurse, but arrange it so we handle the longer limit by tail recursion - if (low - start < end - high) { - quickSort(order, values, start, low, limit); - - // this is really a way to do - // quickSort(order, values, high, end, limit); - start = high; - } else { - quickSort(order, values, high, end, limit); - // this is really a way to do - // quickSort(order, values, start, low, limit); - end = low; - } - } - } - - private static void swap(int[] order, int i, int j) { - int t = order[i]; - order[i] = order[j]; - order[j] = t; - } - - /** - * Check that a partition step was done correctly. For debugging and testing. - */ - @SuppressWarnings("UnusedDeclaration") - public static void checkPartition(int[] order, double[] values, double pivotValue, int start, int low, int high, int end) { - if (order.length != values.length) { - throw new IllegalArgumentException("Arguments must be same size"); - } - - if (!(start >= 0 && low >= start && high >= low && end >= high)) { - throw new IllegalArgumentException(String.format("Invalid indices %d, %d, %d, %d", start, low, high, end)); - } - - for (int i = 0; i < low; i++) { - double v = values[order[i]]; - if (v >= pivotValue) { - throw new IllegalArgumentException(String.format("Value greater than pivot at %d", i)); - } - } - - for (int i = low; i < high; i++) { - if (values[order[i]] != pivotValue) { - throw new IllegalArgumentException(String.format("Non-pivot at %d", i)); - } - } - - for (int i = high; i < end; i++) { - double v = values[order[i]]; - if (v <= pivotValue) { - throw new IllegalArgumentException(String.format("Value less than pivot at %d", i)); - } - } - } - - /** - * Limited range insertion sort. We assume that no element has to move more than limit steps - * because quick sort has done its thing. - * - * @param order The permutation index - * @param values The values we are sorting - * @param limit The largest amount of disorder - */ - private static void insertionSort(int[] order, double[] values, int n, int limit) { - for (int i = 1; i < n; i++) { - int t = order[i]; - double v = values[order[i]]; - int m = Math.max(i - limit, 0); - for (int j = i; j >= m; j--) { - if (j == 0 || values[order[j - 1]] <= v) { - if (j < i) { - System.arraycopy(order, j, order, j + 1, i - j); - order[j] = t; - } - break; - } - } - } - } -} diff --git a/src/main/java/com/tdunning/math/stats/TDigest.java b/src/main/java/com/tdunning/math/stats/TDigest.java index f027081..b4e699a 100644 --- a/src/main/java/com/tdunning/math/stats/TDigest.java +++ b/src/main/java/com/tdunning/math/stats/TDigest.java @@ -18,7 +18,6 @@ package com.tdunning.math.stats; import java.nio.ByteBuffer; -import java.util.Collection; /** * Adaptive histogram based on something like streaming k-means crossed with Q-digest. @@ -64,7 +63,7 @@ public abstract class TDigest { } /** - * Creates a TreeDigest. Going forward, AVLTreeDigest should be preferred to the TreeDigest since they are + * Creates a TreeDigest. Going forward, ArrayDigests should be preferred to the TreeDigest since they are * uniformly faster and require less memory while producing nearly identical results. * * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large. @@ -76,29 +75,6 @@ public abstract class TDigest { } /** - * Creates an AVLTreeDigest. AVLTreeDigest is generally the best known implementation right now. - * - * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large. - * The number of centroids retained will be a smallish (usually less than 10) multiple of this number. - * @return the TreeDigest - */ - public static TDigest createAvlTreeDigest(double compression) { - return new AVLTreeDigest(compression); - } - - /** - * Creates a TreeDigest of whichever type is the currently recommended type. AVLTreeDigest is generally the best - * known implementation right now. - * - * @param compression The compression parameter. 100 is a common value for normal uses. 1000 is extremely large. - * The number of centroids retained will be a smallish (usually less than 10) multiple of this number. - * @return the TreeDigest - */ - public static TDigest createDigest(double compression) { - return createAvlTreeDigest(compression); - } - - /** * Adds a sample to a histogram. * * @param x The value to add. @@ -145,12 +121,19 @@ public abstract class TDigest { public abstract double quantile(double q); /** - * A {@link Collection} that lets you go through the centroids in ascending order by mean. Centroids + * The number of centroids currently in the TDigest. + * + * @return The number of centroids + */ + public abstract int centroidCount(); + + /** + * An iterable that lets you go through the centroids in ascending order by mean. Centroids * returned will not be re-used, but may or may not share storage with this TDigest. * - * @return The centroids in the form of a Collection. + * @return The centroids in the form of an Iterable. */ - public abstract Collection<Centroid> centroids(); + public abstract Iterable<? extends Centroid> centroids(); /** * Returns the current compression factor. diff --git a/src/main/java/com/tdunning/math/stats/TreeDigest.java b/src/main/java/com/tdunning/math/stats/TreeDigest.java index 01950df..86de5ad 100644 --- a/src/main/java/com/tdunning/math/stats/TreeDigest.java +++ b/src/main/java/com/tdunning/math/stats/TreeDigest.java @@ -19,7 +19,6 @@ package com.tdunning.math.stats; import java.nio.ByteBuffer; import java.util.ArrayList; -import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -149,12 +148,17 @@ public class TreeDigest extends AbstractTDigest { @Override public void compress() { + compress(summary); + } + + @Override + public void compress(GroupTree other) { TreeDigest reduced = new TreeDigest(compression); if (recordAllData) { reduced.recordAllData(); } List<Centroid> tmp = new ArrayList<Centroid>(); - for (Centroid centroid : summary) { + for (Centroid centroid : other) { tmp.add(centroid); } Collections.shuffle(tmp, gen); @@ -266,14 +270,14 @@ public class TreeDigest extends AbstractTDigest { } // common case: we found two centroids previous and next so that the desired quantile is // after 'previous' but before 'next' - return quantile(index, previousIndex, nextIndex, previousMean, next.mean()); + return quantile(previousIndex, index, nextIndex, previousMean, next.mean()); } else if (!it.hasNext()) { // special case 2: the index we are interested in is beyond the last centroid // again, assume values grow linearly between index previousIndex and (count - 1) // which is the highest possible index final double nextIndex2 = count - 1; final double nextMean2 = (next.mean() * (nextIndex2 - previousIndex) - previousMean * (nextIndex2 - nextIndex)) / (nextIndex - previousIndex); - return quantile(index, nextIndex, nextIndex2, next.mean(), nextMean2); + return quantile(nextIndex, index, nextIndex2, next.mean(), nextMean2); } total += next.count(); previousMean = next.mean(); @@ -282,8 +286,13 @@ public class TreeDigest extends AbstractTDigest { } @Override - public Collection<Centroid> centroids() { - return Collections.unmodifiableCollection(summary); + public int centroidCount() { + return summary.size(); + } + + @Override + public Iterable<? extends Centroid> centroids() { + return summary; } @Override diff --git a/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java b/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java index 64f626d..0e57a44 100644 --- a/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java +++ b/src/test/java/com/tdunning/math/stats/AVLTreeDigestTest.java @@ -20,7 +20,6 @@ package com.tdunning.math.stats; import com.clearspring.analytics.stream.quantile.QDigest; import com.google.common.collect.Lists; -import com.google.common.collect.Sets; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; import org.apache.mahout.math.jet.random.Gamma; @@ -37,14 +36,10 @@ import static org.junit.Assert.*; import static org.junit.Assume.assumeTrue; public class AVLTreeDigestTest extends TDigestTest { - @BeforeClass - public static void setup() throws IOException { - TDigestTest.setup("avl"); - } - private AvlDigestFactory factory = new AvlDigestFactory() { + private DigestFactory<TDigest> factory = new DigestFactory<TDigest>() { @Override - public AVLTreeDigest create() { + public TDigest create() { return new AVLTreeDigest(100); } }; @@ -54,6 +49,13 @@ public class AVLTreeDigestTest extends TDigestTest { RandomUtils.useTestSeed(); } + @After + public void flush() { + sizeDump.flush(); + errorDump.flush(); + deviationDump.flush(); + } + @Test public void testUniform() { Random gen = RandomUtils.getRandom(); @@ -131,10 +133,10 @@ public class AVLTreeDigestTest extends TDigestTest { } System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000); - System.out.printf("# %d centroids\n", dist.centroids().size()); + System.out.printf("# %d centroids\n", dist.centroidCount()); // I would be happier with 5x compression, but repeated values make things kind of weird - assertTrue("Summary is too large: " + dist.centroids().size(), dist.centroids().size() < 10 * (double) 1000); + assertTrue("Summary is too large: " + dist.centroidCount(), dist.centroidCount() < 10 * (double) 1000); // all quantiles should round to nearest actual value for (int i = 0; i < 10; i++) { @@ -156,14 +158,14 @@ public class AVLTreeDigestTest extends TDigestTest { public void testSequentialPoints() { for (int i = 0; i < repeats(); i++) { runTest(factory, new AbstractContinousDistribution() { - double base = 0; + double base = 0; - @Override - public double nextDouble() { - base += Math.PI * 1e-5; - return base; - } - }, 100, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, + @Override + public double nextDouble() { + base += Math.PI * 1e-5; + return base; + } + }, 100, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, "sequential", true); } } @@ -192,7 +194,7 @@ public class AVLTreeDigestTest extends TDigestTest { buf.flip(); AVLTreeDigest dist2 = AVLTreeDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); + assertEquals(dist.centroidCount(), dist2.centroidCount()); assertEquals(dist.compression(), dist2.compression(), 0); assertEquals(dist.size(), dist2.size()); @@ -214,7 +216,7 @@ public class AVLTreeDigestTest extends TDigestTest { buf.flip(); dist2 = AVLTreeDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); + assertEquals(dist.centroidCount(), dist2.centroidCount()); assertEquals(dist.compression(), dist2.compression(), 0); assertEquals(dist.size(), dist2.size()); @@ -334,26 +336,19 @@ public class AVLTreeDigestTest extends TDigestTest { @Test() public void testSizeControl() throws IOException, InterruptedException, ExecutionException { - runSizeControl("scaling-avl.tsv", new AvlDigestFactory()); - - } - - private void runSizeControl(String name, final DigestFactory<? extends TDigest> factory) throws FileNotFoundException, InterruptedException { // very slow running data generator. Don't want to run this normally. To run slow tests use // mvn test -DrunSlowTests=true assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests"))); final Random gen0 = RandomUtils.getRandom(); - final PrintWriter out = new PrintWriter(new FileOutputStream(name)); + final PrintWriter out = new PrintWriter(new FileOutputStream("scaling.tsv")); out.printf("k\tsamples\tcompression\tsize1\tsize2\n"); - ExecutorService pool = Executors.newFixedThreadPool(20); - - Set<Future<String>> tasks = Sets.newHashSet(); - for (final int size : new int[]{10000, 1000, 100, 10}) { - for (int k = 0; k < 20; k++) { + List<Callable<String>> tasks = Lists.newArrayList(); + for (int k = 0; k < 20; k++) { + for (final int size : new int[]{10, 100, 1000, 10000}) { final int currentK = k; - Callable<String> task = new Callable<String>() { + tasks.add(new Callable<String>() { Random gen = new Random(gen0.nextLong()); @Override @@ -362,7 +357,7 @@ public class AVLTreeDigestTest extends TDigestTest { StringWriter s = new StringWriter(); PrintWriter out = new PrintWriter(s); for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000}) { - TDigest dist = factory.create(compression); + AVLTreeDigest dist = new AVLTreeDigest(compression); for (int i = 0; i < size * 1000; i++) { dist.add(gen.nextDouble()); } @@ -370,47 +365,19 @@ public class AVLTreeDigestTest extends TDigestTest { out.flush(); } out.close(); - System.out.printf("Done with %d,%d\n", currentK, size); return s.toString(); } - }; - - tasks.add(pool.submit(task)); + }); } } - // this idiom allows us to abort if there are any exceptions - // using pool.invokeAll(...) would require us to wait for all tasks - try { - while (tasks.size() > 0) { - List<Future<String>> done = Lists.newArrayList(); - for (Future<String> result : tasks) { - if (result.isDone()) { - done.add(result); - } - } - tasks.removeAll(done); - Thread.sleep(100); - } - } finally { - pool.shutdownNow(); + for (Future<String> result : Executors.newFixedThreadPool(20).invokeAll(tasks)) { + out.write(result.get()); } out.close(); } - private static class AvlDigestFactory extends DigestFactory<AVLTreeDigest> { - @Override - public AVLTreeDigest create() { - return create(50); - } - - @Override - public AVLTreeDigest create(double compression) { - return new AVLTreeDigest(compression); - } - } - @Test public void testScaling() throws FileNotFoundException, InterruptedException, ExecutionException { final Random gen0 = RandomUtils.getRandom(); @@ -507,9 +474,13 @@ public class AVLTreeDigestTest extends TDigestTest { // answer to extreme quantiles in that case ('extreme' in the sense that the // quantile is either before the first node or after the last one) AVLTreeDigest digest = new AVLTreeDigest(100); - digest.add(10, 3); - digest.add(20, 1); - digest.add(40, 5); + // we need to create the GroupTree manually + AVLGroupTree tree = (AVLGroupTree) digest.centroids(); + Centroid g = new Centroid(10); + tree.add(10, 3, null); + tree.add(20, 1, null); + tree.add(40, 5, null); + digest.count = 3 + 1 + 5; // this group tree is roughly equivalent to the following sorted array: // [ ?, 10, ?, 20, ?, ?, 50, ?, ? ] // and we expect it to compute approximate missing values: @@ -522,28 +493,6 @@ public class AVLTreeDigestTest extends TDigestTest { } @Test - public void testCDFNonDecreasing() { - AVLTreeDigest digest = new AVLTreeDigest(100); - - digest.add(25., 1); - digest.add(14., 1); - digest.add(10., 1); - digest.add(13., 1); - digest.add( 5., 1); - digest.add(20., 1); - digest.add(27., 1); - digest.compress(); - - double max = 0.; - for(int num = 0; num < 30; ++num) { - double cdf = digest.cdf((double)num); - System.out.printf("# cdf (%d) = %.3f\n", num, cdf); - assertTrue(max <= cdf); - max = cdf; - } - } - - @Test public void testSorted() { final TDigest digest = factory.create(); sorted(digest); diff --git a/src/test/java/com/tdunning/math/stats/ArrayDigestTest.java b/src/test/java/com/tdunning/math/stats/ArrayDigestTest.java index 5292dd2..683355b 100644 --- a/src/test/java/com/tdunning/math/stats/ArrayDigestTest.java +++ b/src/test/java/com/tdunning/math/stats/ArrayDigestTest.java @@ -25,7 +25,6 @@ import org.apache.mahout.math.jet.random.AbstractContinousDistribution; import org.apache.mahout.math.jet.random.Gamma; import org.apache.mahout.math.jet.random.Normal; import org.apache.mahout.math.jet.random.Uniform; -import org.junit.BeforeClass; import org.junit.Test; import java.io.FileNotFoundException; @@ -50,11 +49,6 @@ import static org.junit.Assert.*; import static org.junit.Assume.assumeTrue; public class ArrayDigestTest extends TDigestTest { - @BeforeClass - public static void setup() throws IOException { - TDigestTest.setup("array"); - } - private DigestFactory<ArrayDigest> factory = new DigestFactory<ArrayDigest>() { @Override public ArrayDigest create() { @@ -126,7 +120,7 @@ public class ArrayDigestTest extends TDigestTest { } assertEquals(totalWeight, ad.size()); - assertEquals(1001, ad.centroids().size()); + assertEquals(1001, ad.centroidCount()); for (int i = 0; i < 1000; i++) { int w = random.nextInt(5) + 2; @@ -137,7 +131,7 @@ public class ArrayDigestTest extends TDigestTest { } assertEquals(totalWeight, ad.size()); - assertEquals(2001, ad.centroids().size()); + assertEquals(2001, ad.centroidCount()); Collections.sort(ref); @@ -151,10 +145,10 @@ public class ArrayDigestTest extends TDigestTest { } assertEquals(0, Lists.newArrayList(ad.allBefore(0)).size()); - assertEquals(ad.centroids().size(), Lists.newArrayList(ad.allBefore(1)).size()); + assertEquals(ad.centroidCount(), Lists.newArrayList(ad.allBefore(1)).size()); assertEquals(0, Lists.newArrayList(ad.allAfter(1)).size()); - assertEquals(ad.centroids().size(), Lists.newArrayList(ad.allAfter(0)).size()); + assertEquals(ad.centroidCount(), Lists.newArrayList(ad.allAfter(0)).size()); for (int k = 0; k < 1000; k++) { final double split = random.nextDouble(); @@ -172,7 +166,7 @@ public class ArrayDigestTest extends TDigestTest { i++; } - assertEquals("Bad counts for split " + split, ad.centroids().size(), z1.size() + z2.size()); + assertEquals("Bad counts for split " + split, ad.centroidCount(), z1.size() + z2.size()); } } @@ -295,10 +289,10 @@ public class ArrayDigestTest extends TDigestTest { dist.compress(); System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000); - System.out.printf("# %d centroids\n", dist.centroids().size()); + System.out.printf("# %d centroids\n", dist.centroidCount()); // I would be happier with 5x compression, but repeated values make things kind of weird - assertTrue(String.format("Summary is too large, got %d, wanted < %.1f", dist.centroids().size(), 10 * 1000.0), dist.centroids().size() < 10 * (double) 1000); + assertTrue(String.format("Summary is too large, got %d, wanted < %.1f", dist.centroidCount(), 10 * 1000.0), dist.centroidCount() < 10 * (double) 1000); // all quantiles should round to nearest actual value for (int i = 0; i < 10; i++) { @@ -350,7 +344,7 @@ public class ArrayDigestTest extends TDigestTest { buf.flip(); TDigest dist2 = ArrayDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); + assertEquals(dist.centroidCount(), dist2.centroidCount()); assertEquals(dist.compression(), dist2.compression(), 0); assertEquals(dist.size(), dist2.size()); @@ -364,7 +358,7 @@ public class ArrayDigestTest extends TDigestTest { buf.flip(); TDigest dist3 = ArrayDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist3.centroids().size()); + assertEquals(dist.centroidCount(), dist3.centroidCount()); assertEquals(dist.compression(), dist3.compression(), 0); assertEquals(dist.size(), dist3.size()); @@ -386,7 +380,7 @@ public class ArrayDigestTest extends TDigestTest { buf.flip(); dist3 = ArrayDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist3.centroids().size()); + assertEquals(dist.centroidCount(), dist3.centroidCount()); assertEquals(dist.compression(), dist3.compression(), 0); assertEquals(dist.size(), dist3.size()); @@ -512,12 +506,12 @@ public class ArrayDigestTest extends TDigestTest { assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests"))); final Random gen0 = RandomUtils.getRandom(); - final PrintWriter out = new PrintWriter(new FileOutputStream("scaling-array.tsv")); + final PrintWriter out = new PrintWriter(new FileOutputStream("scaling.tsv")); out.printf("k\tsamples\tcompression\tsize1\tsize2\n"); List<Callable<String>> tasks = Lists.newArrayList(); - for (final int size : new int[]{10000, 1000, 10, 100}) { - for (int k = 0; k < 20; k++) { + for (int k = 0; k < 20; k++) { + for (final int size : new int[]{10, 100, 1000, 10000}) { final int currentK = k; tasks.add(new Callable<String>() { Random gen = new Random(gen0.nextLong()); @@ -536,8 +530,6 @@ public class ArrayDigestTest extends TDigestTest { out.flush(); } out.close(); - System.out.printf("Finished %d,%d\n", currentK, size); - return s.toString(); } }); diff --git a/src/test/java/com/tdunning/math/stats/MergingDigestTest.java b/src/test/java/com/tdunning/math/stats/MergingDigestTest.java deleted file mode 100644 index ff99fbe..0000000 --- a/src/test/java/com/tdunning/math/stats/MergingDigestTest.java +++ /dev/null @@ -1,566 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.tdunning.math.stats; - -import com.clearspring.analytics.stream.quantile.QDigest; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import org.apache.mahout.common.RandomUtils; -import org.apache.mahout.math.jet.random.AbstractContinousDistribution; -import org.apache.mahout.math.jet.random.Gamma; -import org.apache.mahout.math.jet.random.Normal; -import org.apache.mahout.math.jet.random.Uniform; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.*; -import java.nio.ByteBuffer; -import java.util.*; -import java.util.concurrent.*; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; -import static org.junit.Assume.assumeTrue; - -public class MergingDigestTest extends TDigestTest { - @BeforeClass - public static void setup() throws IOException { - TDigestTest.setup("merge"); - } - - private MergingDigestFactory factory = new MergingDigestFactory() { - @Override - public MergingDigest create() { - return new MergingDigest(500); - } - }; - - @Before - public void testSetUp() { - RandomUtils.useTestSeed(); - } - - @Test - public void testUniform() { - Random gen = RandomUtils.getRandom(); - for (int i = 0; i < repeats(); i++) { - runTest(factory, new Uniform(0, 1, gen), 200, - new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, - "uniform", true); - } - } - - @Test - public void testGamma() { - // this Gamma distribution is very heavily skewed. The 0.1%-ile is 6.07e-30 while - // the median is 0.006 and the 99.9th %-ile is 33.6 while the mean is 1. - // this severe skew means that we have to have positional accuracy that - // varies by over 11 orders of magnitude. - Random gen = RandomUtils.getRandom(); - for (int i = 0; i < repeats(); i++) { - runTest(factory, new Gamma(0.1, 0.1, gen), 200, -// new double[]{6.0730483624079e-30, 6.0730483624079e-20, 6.0730483627432e-10, 5.9339110446023e-03, -// 2.6615455373884e+00, 1.5884778179295e+01, 3.3636770117188e+01}, - new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, - "gamma", true); - } - } - - @Test - public void testNarrowNormal() { - // this mixture of a uniform and normal distribution has a very narrow peak which is centered - // near the median. Our system should be scale invariant and work well regardless. - final Random gen = RandomUtils.getRandom(); - AbstractContinousDistribution mix = new AbstractContinousDistribution() { - AbstractContinousDistribution normal = new Normal(0, 1e-5, gen); - AbstractContinousDistribution uniform = new Uniform(-1, 1, gen); - - @Override - public double nextDouble() { - double x; - if (gen.nextDouble() < 0.5) { - x = uniform.nextDouble(); - } else { - x = normal.nextDouble(); - } - return x; - } - }; - - for (int i = 0; i < repeats(); i++) { - runTest(factory, mix, 200, new double[]{0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9, 0.99, 0.999}, "mixture", false); - } - } - - @Test - public void testRepeatedValues() { - final Random gen = RandomUtils.getRandom(); - - // 5% of samples will be 0 or 1.0. 10% for each of the values 0.1 through 0.9 - AbstractContinousDistribution mix = new AbstractContinousDistribution() { - @Override - public double nextDouble() { - return Math.rint(gen.nextDouble() * 10) / 10.0; - } - }; - - MergingDigest dist = new MergingDigest(500); - List<Double> data = Lists.newArrayList(); - for (int i = 0; i < 100000; i++) { - double x = mix.nextDouble(); - data.add(x); - } - - long t0 = System.nanoTime(); - for (double x : data) { - dist.add(x); - } - - System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000); - System.out.printf("# %d centroids\n", dist.centroids().size()); - - assertTrue("Summary is too large: " + dist.centroids().size(), dist.centroids().size() < 5000); - - // all quantiles should round to nearest actual value - for (int i = 0; i < 10; i++) { - double z = i / 10.0; - // we skip over troublesome points that are nearly halfway between - for (double delta : new double[]{0.01, 0.02, 0.03, 0.07, 0.08, 0.09}) { - double q = z + delta; - double cdf = dist.cdf(q); - // we also relax the tolerances for repeated values - assertEquals(String.format("z=%.1f, q = %.3f, cdf = %.3f", z, q, cdf), z + 0.05, cdf, 0.01); - - double estimate = dist.quantile(q); - assertEquals(String.format("z=%.1f, q = %.3f, cdf = %.3f, estimate = %.3f", z, q, cdf, estimate), Math.rint(q * 10) / 10.0, estimate, 0.001); - } - } - } - - @Test - public void testSequentialPoints() { - for (int i = 0; i < repeats(); i++) { - runTest(factory, new AbstractContinousDistribution() { - double base = 0; - - @Override - public double nextDouble() { - base += Math.PI * 1e-5; - return base; - } - }, 200, new double[]{0.001, 0.01, 0.1, 0.5, 0.9, 0.99, 0.999}, - "sequential", true); - } - } - - @Test - public void testSerialization() { - Random gen = RandomUtils.getRandom(); - MergingDigest dist = new MergingDigest(100); - for (int i = 0; i < 100000; i++) { - double x = gen.nextDouble(); - dist.add(x); - } - dist.compress(); - - ByteBuffer buf = ByteBuffer.allocate(20000); - dist.asBytes(buf); - assertTrue(buf.position() < 11000); - assertEquals(buf.position(), dist.byteSize()); - buf.clear(); - - dist.asSmallBytes(buf); - assertTrue(buf.position() < 6000); - assertEquals(buf.position(), dist.smallByteSize()); - - System.out.printf("# big %d bytes\n", buf.position()); - - buf.flip(); - MergingDigest dist2 = MergingDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); - assertEquals(dist.compression(), dist2.compression(), 0); - assertEquals(dist.size(), dist2.size()); - - for (double q = 0; q < 1; q += 0.01) { - double quantile = dist2.quantile(q); - if (Double.isNaN(quantile)) { - System.out.printf("saw NaN for %.3f\n", dist2.quantile(q)); - } - assertEquals(dist.quantile(q), quantile, 1e-6); - } - - Iterator<? extends Centroid> ix = dist2.centroids().iterator(); - for (Centroid centroid : dist.centroids()) { - assertTrue(ix.hasNext()); - assertEquals(centroid.count(), ix.next().count()); - } - assertFalse(ix.hasNext()); - - buf.flip(); - dist.asSmallBytes(buf); - assertTrue(buf.position() < 6000); - System.out.printf("# small %d bytes\n", buf.position()); - - buf.flip(); - dist2 = MergingDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); - assertEquals(dist.compression(), dist2.compression(), 0); - assertEquals(dist.size(), dist2.size()); - - for (double q = 0; q < 1; q += 0.01) { - assertEquals(dist.quantile(q), dist2.quantile(q), 1e-6); - } - - ix = dist2.centroids().iterator(); - for (Centroid centroid : dist.centroids()) { - assertTrue(ix.hasNext()); - assertEquals(centroid.count(), ix.next().count()); - } - assertFalse(ix.hasNext()); - } - - @Test - public void testIntEncoding() { - Random gen = RandomUtils.getRandom(); - ByteBuffer buf = ByteBuffer.allocate(10000); - List<Integer> ref = Lists.newArrayList(); - for (int i = 0; i < 3000; i++) { - int n = gen.nextInt(); - n = n >>> (i / 100); - ref.add(n); - AbstractTDigest.encode(buf, n); - } - - buf.flip(); - - for (int i = 0; i < 3000; i++) { - int n = AbstractTDigest.decode(buf); - assertEquals(String.format("%d:", i), ref.get(i).intValue(), n); - } - } - - @Test - public void compareToQDigest() throws FileNotFoundException { - Random rand = RandomUtils.getRandom(); - PrintWriter out = new PrintWriter(new FileOutputStream("qd-tree-comparison.csv")); - try { - for (int i = 0; i < repeats(); i++) { - compareQD(out, new Gamma(0.1, 0.1, rand), "gamma", 1L << 48); - compareQD(out, new Uniform(0, 1, rand), "uniform", 1L << 48); - } - } finally { - out.close(); - } - } - - private void compareQD(PrintWriter out, AbstractContinousDistribution gen, String tag, long scale) { - for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}) { - QDigest qd = new QDigest(compression); - MergingDigest dist = new MergingDigest(compression); - List<Double> data = Lists.newArrayList(); - for (int i = 0; i < 100000; i++) { - double x = gen.nextDouble(); - dist.add(x); - qd.offer((long) (x * scale)); - data.add(x); - } - dist.compress(); - Collections.sort(data); - - for (double q : new double[]{0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999}) { - double x1 = dist.quantile(q); - double x2 = (double) qd.getQuantile(q) / scale; - double e1 = cdf(x1, data) - q; - out.printf("%s\t%.0f\t%.8f\t%.10g\t%.10g\t%d\t%d\n", tag, compression, q, e1, cdf(x2, data) - q, dist.smallByteSize(), QDigest.serialize(qd).length); - - } - } - } - - @Test - public void compareToStreamingQuantile() throws FileNotFoundException { - Random rand = RandomUtils.getRandom(); - - PrintWriter out = new PrintWriter(new FileOutputStream("sq-tree-comparison.csv")); - try { - - for (int i = 0; i < repeats(); i++) { - compareSQ(out, new Gamma(0.1, 0.1, rand), "gamma"); - compareSQ(out, new Uniform(0, 1, rand), "uniform"); - } - } finally { - out.close(); - } - } - - private void compareSQ(PrintWriter out, AbstractContinousDistribution gen, String tag) { - double[] quantiles = {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.7, 0.8, 0.9, 0.99, 0.999}; - for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000}) { - QuantileEstimator sq = new QuantileEstimator(1001); - MergingDigest dist = new MergingDigest(compression); - List<Double> data = Lists.newArrayList(); - for (int i = 0; i < 100000; i++) { - double x = gen.nextDouble(); - dist.add(x); - sq.add(x); - data.add(x); - } - dist.compress(); - Collections.sort(data); - - List<Double> qz = sq.getQuantiles(); - for (double q : quantiles) { - double x1 = dist.quantile(q); - double x2 = qz.get((int) (q * 1000 + 0.5)); - double e1 = cdf(x1, data) - q; - double e2 = cdf(x2, data) - q; - out.printf("%s\t%.0f\t%.8f\t%.10g\t%.10g\t%d\t%d\n", - tag, compression, q, e1, e2, dist.smallByteSize(), sq.serializedSize()); - - } - } - } - - @Test() - public void testSizeControl() throws IOException, InterruptedException, ExecutionException { - runSizeControl("scaling-merging.tsv", new MergingDigestFactory()); - - } - - private void runSizeControl(String name, final DigestFactory<? extends TDigest> factory) throws FileNotFoundException, InterruptedException { - // very slow running data generator. Don't want to run this normally. To run slow tests use - // mvn test -DrunSlowTests=true - assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests"))); - - final Random gen0 = RandomUtils.getRandom(); - final PrintWriter out = new PrintWriter(new FileOutputStream(name)); - out.printf("k\tsamples\tcompression\tsize1\tsize2\n"); - - ExecutorService pool = Executors.newFixedThreadPool(20); - - Set<Future<String>> tasks = Sets.newHashSet(); - for (final int size : new int[]{10000, 1000, 100, 10}) { - for (int k = 0; k < 20; k++) { - final int currentK = k; - Callable<String> task = new Callable<String>() { - Random gen = new Random(gen0.nextLong()); - - @Override - public String call() throws Exception { - System.out.printf("Starting %d,%d\n", currentK, size); - StringWriter s = new StringWriter(); - PrintWriter out = new PrintWriter(s); - for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000}) { - TDigest dist = factory.create(compression); - for (int i = 0; i < size * 1000; i++) { - dist.add(gen.nextDouble()); - } - out.printf("%d\t%d\t%.0f\t%d\t%d\n", currentK, size, compression, dist.smallByteSize(), dist.byteSize()); - out.flush(); - } - out.close(); - System.out.printf("Done with %d,%d\n", currentK, size); - return s.toString(); - } - }; - - tasks.add(pool.submit(task)); - } - } - - // this idiom allows us to abort if there are any exceptions - // using pool.invokeAll(...) would require us to wait for all tasks - try { - while (tasks.size() > 0) { - List<Future<String>> done = Lists.newArrayList(); - for (Future<String> result : tasks) { - if (result.isDone()) { - done.add(result); - } - } - tasks.removeAll(done); - Thread.sleep(100); - } - } finally { - pool.shutdownNow(); - } - - out.close(); - } - - private static class MergingDigestFactory extends DigestFactory<MergingDigest> { - @Override - public MergingDigest create() { - return create(50); - } - - @Override - public MergingDigest create(double compression) { - return new MergingDigest(compression); - } - } - - @Test - public void testScaling() throws FileNotFoundException, InterruptedException, ExecutionException { - final Random gen0 = RandomUtils.getRandom(); - - PrintWriter out = new PrintWriter(new FileOutputStream("error-scaling.tsv")); - try { - out.printf("pass\tcompression\tq\terror\tsize\n"); - - Collection<Callable<String>> tasks = Lists.newArrayList(); - int n = Math.max(3, repeats() * repeats()); - for (int k = 0; k < n; k++) { - final int currentK = k; - tasks.add(new Callable<String>() { - Random gen = new Random(gen0.nextLong()); - - @Override - public String call() throws Exception { - System.out.printf("Start %d\n", currentK); - StringWriter s = new StringWriter(); - PrintWriter out = new PrintWriter(s); - - List<Double> data = Lists.newArrayList(); - for (int i = 0; i < 100000; i++) { - data.add(gen.nextDouble()); - } - Collections.sort(data); - - for (double compression : new double[]{2, 5, 10, 20, 50, 100, 200, 500, 1000}) { - MergingDigest dist = new MergingDigest(compression); - for (Double x : data) { - dist.add(x); - } - dist.compress(); - - for (double q : new double[]{0.001, 0.01, 0.1, 0.5}) { - double estimate = dist.quantile(q); - double actual = data.get((int) (q * data.size())); - out.printf("%d\t%.0f\t%.3f\t%.9f\t%d\n", currentK, compression, q, estimate - actual, dist.byteSize()); - out.flush(); - } - } - out.close(); - System.out.printf("Finish %d\n", currentK); - - return s.toString(); - } - }); - } - - ExecutorService exec = Executors.newFixedThreadPool(16); - for (Future<String> result : exec.invokeAll(tasks)) { - out.write(result.get()); - } - } finally { - out.close(); - } - } - - @Test - public void testMerge() throws FileNotFoundException, InterruptedException, ExecutionException { - merge(new DigestFactory<MergingDigest>() { - @Override - public MergingDigest create() { - return new MergingDigest(1000); - } - }); - } - - @Test - public void testEmpty() { - empty(new MergingDigest(100)); - } - - @Test - public void testSingleValue() { - singleValue(new MergingDigest(100)); - } - - @Test - public void testFewValues() { - final TDigest digest = new MergingDigest(100); - fewValues(digest); - } - - @Test - public void testMoreThan2BValues() { - final TDigest digest = new MergingDigest(100); - moreThan2BValues(digest); - } - - @Test - public void testExtremeQuantiles() { - // t-digest shouldn't merge extreme nodes, but let's still test how it would - // answer to extreme quantiles in that case ('extreme' in the sense that the - // quantile is either before the first node or after the last one) - MergingDigest digest = new MergingDigest(100); - digest.add(10, 3); - digest.add(20, 1); - digest.add(40, 5); - digest.setMinMax(5, 70); - // this group tree is roughly equivalent to the following sorted array: - // [ ?, 10, ?, 20, ?, ?, 50, ?, ? ] - // and we expect it to compute approximate missing values: - // [ 5, 10, 15, 20, 30, 40, 50, 60, 70] - List<Double> values = Arrays.asList(5., 10., 15., 20., 30., 40., 50., 60., 70.); - for (int i = 0; i < digest.size(); ++i) { - final double q = i / (digest.size() - 1.0); // a quantile that matches an array index - double q1 = quantile(q, values); - double q2 = digest.quantile(q); - assertEquals(String.format("Expected quantile(%.2f) = %.3f, got %.3f", q, q1, q2), q1, q2, 5); - } - } - - @Test - public void testCDFNonDecreasing() { - MergingDigest digest = new MergingDigest(100); - - digest.add(25., 1); - digest.add(14., 1); - digest.add(10., 1); - digest.add(13., 1); - digest.add( 5., 1); - digest.add(20., 1); - digest.add(27., 1); - digest.compress(); - - double max = 0.; - for(int num = 0; num < 30; ++num) { - double cdf = digest.cdf((double)num); - System.out.printf("# cdf (%d) = %.3f vs %.3f\n", num, cdf, max); - assertTrue(max <= cdf); - max = cdf; - } - } - - @Test - public void testSorted() { - final TDigest digest = factory.create(); - sorted(digest); - } - - @Test - public void testNaN() { - final TDigest digest = factory.create(); - nan(digest); - } -} \ No newline at end of file diff --git a/src/test/java/com/tdunning/math/stats/SortTest.java b/src/test/java/com/tdunning/math/stats/SortTest.java deleted file mode 100644 index 35eacff..0000000 --- a/src/test/java/com/tdunning/math/stats/SortTest.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.tdunning.math.stats; - -import com.google.common.collect.HashMultiset; -import com.google.common.collect.Multiset; -import java.util.Random; -import junit.framework.TestCase; -import org.junit.Assert; - -public class SortTest extends TestCase { - public void testEmpty() { - Sort.sort(new int[]{}, new double[]{}); - } - - public void testOne() { - int[] order = new int[1]; - Sort.sort(order, new double[]{1}); - Assert.assertEquals(0, order[0]); - } - - public void testIdentical() { - int[] order = new int[6]; - double[] values = new double[6]; - - Sort.sort(order, values); - checkOrder(order, values); - } - - public void testRepeated() { - int n = 50; - int[] order = new int[n]; - double[] values = new double[n]; - for (int i = 0; i < n; i++) { - values[i] = Math.rint(10 * ((double) i / n)) / 10.0; - } - - Sort.sort(order, values); - checkOrder(order, values); - } - - public void testShort() { - int[] order = new int[6]; - double[] values = new double[6]; - - // all duplicates - for (int i = 0; i < 6; i++) { - values[i] = 1; - } - - Sort.sort(order, values); - checkOrder(order, values); - - values[0] = 0.8; - values[1] = 0.3; - - Sort.sort(order, values); - checkOrder(order, values); - - values[5] = 1.5; - values[4] = 1.2; - - Sort.sort(order, values); - checkOrder(order, values); - } - - public void testLonger() { - int[] order = new int[20]; - double[] values= new double[20]; - for (int i = 0; i < 20; i++) { - values[i] = (i * 13) % 20; - } - Sort.sort(order, values); - checkOrder(order, values); - } - - public void testMultiPivots() { - // more pivots than low split on first pass - // multiple pivots, but more low data on second part of recursion - int[] order = new int[30]; - double[] values= new double[30]; - for (int i = 0; i < 9; i++) { - values[i] = i + 20 * (i % 2); - } - - for (int i = 9; i < 20; i++) { - values[i] = 10; - } - - for (int i = 20; i < 30; i++) { - values[i] = i - 20 * (i % 2); - } - values[29] = 29; - values[24] = 25; - values[26] = 25; - - Sort.sort(order, values); - checkOrder(order, values); - } - - public void testRandomized() { - Random rand = new Random(); - - for (int k = 0; k < 100; k++) { - int[] order = new int[30]; - double[] values= new double[30]; - for (int i = 0; i < 30; i++) { - values[i] = rand.nextDouble(); - } - - Sort.sort(order, values); - checkOrder(order, values); - } - } - - private void checkOrder(int[] order, double[] values) { - double previous = -Double.MAX_VALUE; - Multiset<Integer> counts = HashMultiset.create(); - for (int i = 0; i < values.length; i++) { - counts.add(i); - double v = values[order[i]]; - if (v < previous) { - throw new IllegalArgumentException("Values out of order at %d"); - } - previous = v; - } - - Assert.assertEquals(order.length, counts.size()); - - for (Integer count : counts) { - Assert.assertEquals(1, counts.count(count)); - } - } -} \ No newline at end of file diff --git a/src/test/java/com/tdunning/math/stats/TDigestBenchmark.java b/src/test/java/com/tdunning/math/stats/TDigestBenchmark.java index 578262d..b8a3e13 100644 --- a/src/test/java/com/tdunning/math/stats/TDigestBenchmark.java +++ b/src/test/java/com/tdunning/math/stats/TDigestBenchmark.java @@ -17,7 +17,7 @@ public class TDigestBenchmark extends Benchmark { private static final int ARRAY_PAGE_SIZE = 32; - enum TDigestFactory { + static enum TDigestFactory { ARRAY { @Override TDigest create(double compression) { @@ -40,7 +40,7 @@ public class TDigestBenchmark extends Benchmark { abstract TDigest create(double compression); } - private enum DistributionFactory { + private static enum DistributionFactory { UNIFORM { @Override AbstractDistribution create(Random random) { diff --git a/src/test/java/com/tdunning/math/stats/TDigestMemoryBenchmark.java b/src/test/java/com/tdunning/math/stats/TDigestMemoryBenchmark.java index 1ab72fb..87ccac4 100644 --- a/src/test/java/com/tdunning/math/stats/TDigestMemoryBenchmark.java +++ b/src/test/java/com/tdunning/math/stats/TDigestMemoryBenchmark.java @@ -26,7 +26,7 @@ public class TDigestMemoryBenchmark { private static double memoryUsagePerCentroid(TDigest tdigest) { final long totalSize = RamUsageEstimator.sizeOf(tdigest); - return (double) totalSize / tdigest.centroids().size(); + return (double) totalSize / tdigest.centroidCount(); } } diff --git a/src/test/java/com/tdunning/math/stats/TDigestTest.java b/src/test/java/com/tdunning/math/stats/TDigestTest.java index 7fd9b29..8373731 100644 --- a/src/test/java/com/tdunning/math/stats/TDigestTest.java +++ b/src/test/java/com/tdunning/math/stats/TDigestTest.java @@ -22,7 +22,8 @@ import com.google.common.collect.Lists; import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.jet.random.AbstractContinousDistribution; -import org.junit.*; +import org.junit.AfterClass; +import org.junit.BeforeClass; import java.io.File; import java.io.FileNotFoundException; @@ -50,28 +51,22 @@ import static org.junit.Assert.fail; * Common test methods for TDigests */ public class TDigestTest { - protected static PrintWriter sizeDump = null; - protected static PrintWriter errorDump = null; - protected static PrintWriter deviationDump = null; + protected static PrintWriter sizeDump; + protected static PrintWriter errorDump; + protected static PrintWriter deviationDump; - public static void setup(String digestName) throws IOException { - sizeDump = new PrintWriter(new FileWriter("sizes-" + digestName + ".csv")); + @BeforeClass + public static void setup() throws IOException { + sizeDump = new PrintWriter(new FileWriter("sizes.csv")); sizeDump.printf("tag\ti\tq\tk\tactual\n"); - errorDump = new PrintWriter((new FileWriter("errors-" + digestName + ".csv"))); + errorDump = new PrintWriter((new FileWriter("errors.csv"))); errorDump.printf("dist\ttag\tx\tQ\terror\n"); - deviationDump = new PrintWriter((new FileWriter("deviation-" + digestName + ".csv"))); + deviationDump = new PrintWriter((new FileWriter("deviation.csv"))); deviationDump.printf("tag\tQ\tk\tx\tmean\tleft\tright\tdeviation\n"); } - @After - public void flush() { - sizeDump.flush(); - errorDump.flush(); - deviationDump.flush(); - } - @AfterClass public static void teardown() { sizeDump.close(); @@ -148,21 +143,13 @@ public class TDigestTest { assertEquals(ix.next(), x); } - if (dist instanceof MergingDigest) { - ((MergingDigest) dist).checkWeights(); - ((MergingDigest) dist2).checkWeights(); - for (TDigest sub : subs) { - ((MergingDigest) sub).checkWeights(); - } - } - for (double q : new double[]{0.001, 0.01, 0.1, 0.2, 0.3, 0.5}) { double z = quantile(q, data); double e1 = dist.quantile(q) - z; double e2 = dist2.quantile(q) - z; out.printf("quantile\t%d\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\n", parts, q, z - q, e1, e2, Math.abs(e2) / q); - assertTrue(String.format("Relative error: parts=%d, q=%.4f, e1=%.5f, e2=%.5f, rel=%.4f", parts, q, e1, e2, Math.abs(e2) / q), Math.abs(e2) / q < 0.3); - assertTrue(String.format("Absolute error: parts=%d, q=%.4f, e1=%.5f, e2=%.5f, rel=%.4f", parts, q, e1, e2, Math.abs(e2) / q), Math.abs(e2) < 0.015); + assertTrue(String.format("parts=%d, q=%.4f, e1=%.5f, e2=%.5f, rel=%.4f", parts, q, e1, e2, Math.abs(e2) / q), Math.abs(e2) / q < 0.1); + assertTrue(String.format("parts=%d, q=%.4f, e1=%.5f, e2=%.5f, rel=%.4f", parts, q, e1, e2, Math.abs(e2) / q), Math.abs(e2) < 0.015); } for (double x : new double[]{0.001, 0.01, 0.1, 0.2, 0.3, 0.5}) { @@ -171,8 +158,8 @@ public class TDigestTest { double e2 = dist2.cdf(x) - z; out.printf("cdf\t%d\t%.6f\t%.6f\t%.6f\t%.6f\t%.6f\n", parts, x, z - x, e1, e2, Math.abs(e2) / x); - assertTrue(String.format("Absolute cdf: parts=%d, x=%.4f, e1=%.5f, e2=%.5f", parts, x, e1, e2), Math.abs(e2) < 0.015); - assertTrue(String.format("Relative cdf: parts=%d, x=%.4f, e1=%.5f, e2=%.5f, rel=%.3f", parts, x, e1, e2, Math.abs(e2) / x), Math.abs(e2) / x < 0.3); + assertTrue(String.format("parts=%d, x=%.4f, e1=%.5f, e2=%.5f", parts, x, e1, e2), Math.abs(e2) < 0.015); + assertTrue(String.format("parts=%d, x=%.4f, e1=%.5f, e2=%.5f", parts, x, e1, e2), Math.abs(e2) / x < 0.1); } out.flush(); } @@ -193,7 +180,7 @@ public class TDigestTest { final double value = RandomUtils.getRandom().nextDouble() * 1000; digest.add(value); final double q = RandomUtils.getRandom().nextDouble(); - for (double qValue : new double[]{0, q, 1}) { + for (double qValue : new double[] {0, q, 1}) { assertEquals(value, digest.quantile(qValue), 0.001f); } } @@ -217,11 +204,9 @@ public class TDigestTest { Collections.sort(values); // for this value of the compression, the tree shouldn't have merged any node - assertEquals(digest.centroids().size(), values.size()); + assertEquals(digest.centroidCount(), values.size()); for (double q : new double [] {0, 1e-10, r.nextDouble(), 0.5, 1-1e-10, 1}) { - double q1 = quantile(q, values); - double q2 = digest.quantile(q); - assertEquals(String.format("At q=%g, expected %.2f vs %.2f", q, q1, q2), q1, q2, 0.03); + assertEquals(quantile(q, values), digest.quantile(q), 0.01); } } @@ -281,7 +266,7 @@ public class TDigestTest { } dist.compress(); System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000); - System.out.printf("# %d centroids\n", dist.centroids().size()); + System.out.printf("# %d centroids\n", dist.centroidCount()); Collections.sort(data); double[] xValues = qValues.clone(); @@ -301,9 +286,9 @@ public class TDigestTest { iz++; } assertEquals(qz, dist.size(), 1e-10); - assertEquals(iz, dist.centroids().size()); + assertEquals(iz, dist.centroidCount()); - assertTrue(String.format("Summary is too large (got %d, wanted < %.1f)", dist.centroids().size(), 11 * sizeGuide), dist.centroids().size() < 11 * sizeGuide); + assertTrue(String.format("Summary is too large (got %d, wanted < %.1f)", dist.centroidCount(), 11 * sizeGuide), dist.centroidCount() < 11 * sizeGuide); int softErrors = 0; for (int i = 0; i < xValues.length; i++) { double x = xValues[i]; @@ -360,7 +345,7 @@ public class TDigestTest { } assertEquals(1000 + 10L * (1 << 28), digest.size()); assertTrue(digest.size() > Integer.MAX_VALUE); - final double[] quantiles = new double[]{0, 0.1, 0.5, 0.9, 1, gen.nextDouble()}; + final double[] quantiles = new double[] {0, 0.1, 0.5, 0.9, 1, gen.nextDouble()}; Arrays.sort(quantiles); double prev = Double.NEGATIVE_INFINITY; for (double q : quantiles) { @@ -370,24 +355,8 @@ public class TDigestTest { } } - @Test - public void testMergeEmpty() { - final Random gen0 = RandomUtils.getRandom(); - List<TDigest> subData = new ArrayList<TDigest>(); - subData.add(new TreeDigest(10)); - TreeDigest foo = new TreeDigest(10); - AbstractTDigest.merge(subData, gen0, foo); - empty(foo); - } - - public static class DigestFactory<T extends TDigest> { - public T create() { - throw new UnsupportedOperationException("Must over-ride"); - } - - public T create(double ignore) { - throw new UnsupportedOperationException("Must over-ride"); - } + public interface DigestFactory<T extends TDigest> { + T create(); } protected void sorted(TDigest digest) { diff --git a/src/test/java/com/tdunning/math/stats/TreeDigestTest.java b/src/test/java/com/tdunning/math/stats/TreeDigestTest.java index 5622900..0a84e94 100644 --- a/src/test/java/com/tdunning/math/stats/TreeDigestTest.java +++ b/src/test/java/com/tdunning/math/stats/TreeDigestTest.java @@ -36,14 +36,10 @@ import static org.junit.Assert.*; import static org.junit.Assume.assumeTrue; public class TreeDigestTest extends TDigestTest { - @BeforeClass - public static void setup() throws IOException { - TDigestTest.setup("tree"); - } - private DigestFactory<TreeDigest> factory = new DigestFactory<TreeDigest>() { + private DigestFactory<TDigest> factory = new DigestFactory<TDigest>() { @Override - public TreeDigest create() { + public TDigest create() { return new TreeDigest(100); } }; @@ -53,6 +49,13 @@ public class TreeDigestTest extends TDigestTest { RandomUtils.useTestSeed(); } + @After + public void flush() { + sizeDump.flush(); + errorDump.flush(); + deviationDump.flush(); + } + @Test public void testUniform() { Random gen = RandomUtils.getRandom(); @@ -130,10 +133,10 @@ public class TreeDigestTest extends TDigestTest { } System.out.printf("# %fus per point\n", (System.nanoTime() - t0) * 1e-3 / 100000); - System.out.printf("# %d centroids\n", dist.centroids().size()); + System.out.printf("# %d centroids\n", dist.centroidCount()); // I would be happier with 5x compression, but repeated values make things kind of weird - assertTrue("Summary is too large", dist.centroids().size() < 10 * (double) 1000); + assertTrue("Summary is too large", dist.centroidCount() < 10 * (double) 1000); // all quantiles should round to nearest actual value for (int i = 0; i < 10; i++) { @@ -191,7 +194,7 @@ public class TreeDigestTest extends TDigestTest { buf.flip(); TreeDigest dist2 = TreeDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); + assertEquals(dist.centroidCount(), dist2.centroidCount()); assertEquals(dist.compression(), dist2.compression(), 0); assertEquals(dist.size(), dist2.size()); @@ -213,7 +216,7 @@ public class TreeDigestTest extends TDigestTest { buf.flip(); dist2 = TreeDigest.fromBytes(buf); - assertEquals(dist.centroids().size(), dist2.centroids().size()); + assertEquals(dist.centroidCount(), dist2.centroidCount()); assertEquals(dist.compression(), dist2.compression(), 0); assertEquals(dist.size(), dist2.size()); @@ -338,7 +341,7 @@ public class TreeDigestTest extends TDigestTest { assumeTrue(Boolean.parseBoolean(System.getProperty("runSlowTests"))); final Random gen0 = RandomUtils.getRandom(); - final PrintWriter out = new PrintWriter(new FileOutputStream("scaling-tree.tsv")); + final PrintWriter out = new PrintWriter(new FileOutputStream("scaling.tsv")); out.printf("k\tsamples\tcompression\tsize1\tsize2\n"); List<Callable<String>> tasks = Lists.newArrayList(); @@ -471,9 +474,17 @@ public class TreeDigestTest extends TDigestTest { // answer to extreme quantiles in that case ('extreme' in the sense that the // quantile is either before the first node or after the last one) TreeDigest digest = new TreeDigest(100); - digest.add(10, 3); - digest.add(20, 1); - digest.add(40, 5); + // we need to create the GroupTree manually + GroupTree tree = (GroupTree) digest.centroids(); + Centroid g = new Centroid(10); + g.add(10, 2); // 10 has a weight of 3 (1+2) + tree.add(g); + g = new Centroid(20); // 20 has a weight of 1 + tree.add(g); + g = new Centroid(40); + g.add(40, 4); // 40 has a weight of 5 (1+4) + tree.add(g); + digest.count = 3 + 1 + 5; // this group tree is roughly equivalent to the following sorted array: // [ ?, 10, ?, 20, ?, ?, 50, ?, ? ] // and we expect it to compute approximate missing values: -- Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-java/t-digest.git _______________________________________________ pkg-java-commits mailing list [email protected] http://lists.alioth.debian.org/cgi-bin/mailman/listinfo/pkg-java-commits

