Author: catholicon
Date: Wed Dec 5 00:21:17 2018
New Revision: 1848181
URL: http://svn.apache.org/viewvc?rev=1848181&view=rev
Log:
OAK-7930: Add tape sampling
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
(with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
(with props)
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java?rev=1848181&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
Wed Dec 5 00:21:17 2018
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.AbstractIterator;
+
+import java.util.Iterator;
+import java.util.Random;
+
+/**
+ * Sampling algorithm that picks 'k' random samples from streaming input.
+ * Each item is picked with probability (k - sampled) / (N - seen), where
+ * 'sampled' is the number of items picked so far and 'seen' is the number of
+ * items consumed so far. Overall this gives every item a 'k/N' chance of being
+ * selected, where 'N' is the total number of items in the input.
+ *
+ * While the input could be streaming, the algorithm requires {@code N} to be
+ * known beforehand.
+ *
+ * The algorithm produces random samples without replacement and needs only
+ * O(1) extra memory.
+ *
+ * Implementation inspired from "JONES,T.G. A note on sampling a tape file"
+ * (https://dl.acm.org/citation.cfm?id=368159)
+ */
+public class TapeSampling<T> {
+    /** Source of randomness for the per-item selection decision. */
+    private final Random rGen;
+    /** Items to sample from; consumed lazily while samples are requested. */
+    private final Iterator<T> input;
+    /** Declared total number of items that {@code input} provides. */
+    private final int N;
+    /** Number of items to sample. */
+    private final int k;
+
+    /**
+     * @param rGen  random number generator driving the selection
+     * @param input items to sample from
+     * @param N     number of items that {@code input} provides
+     * @param k     number of items to sample
+     */
+    public TapeSampling(final Random rGen, final Iterator<T> input, final int N, final int k) {
+        this.rGen = rGen;
+        this.input = input;
+        this.N = N;
+        this.k = k;
+    }
+
+    /**
+     * Returns a lazy iterator over the sampled items, in the order in which
+     * they appear in the input. The iterator ends once {@code k} items have
+     * been returned.
+     *
+     * Advancing the returned iterator throws {@link IllegalArgumentException}
+     * if the input runs out of items before {@code k} items were sampled.
+     *
+     * Every iterator returned by this method keeps its own counters but reads
+     * from the same underlying input iterator.
+     *
+     * @return iterator over the sampled items
+     */
+    public Iterator<T> getSamples() {
+        return new AbstractIterator<T>() {
+            /** Number of items returned so far. */
+            int sampled = 0;
+            /** Number of input items consumed so far. */
+            int seen = 0;
+
+            @Override
+            protected T computeNext() {
+                if (sampled == k) {
+                    return endOfData();
+                }
+
+                while (true) {
+                    // Template overload: the message is only formatted when the
+                    // check fails, not once per consumed item.
+                    Preconditions.checkArgument(input.hasNext(),
+                            "Not enough input items provided. Declared: %s; got %s; sampled: %s",
+                            N, seen, sampled);
+
+                    T i = input.next();
+
+                    // r is uniform in [1, N - seen], i.e. over the items not yet consumed
+                    int r = rGen.nextInt(N - seen) + 1;
+                    seen++;
+
+                    // pick with probability (k - sampled) / (remaining items)
+                    if (r <= k - sampled) {
+                        sampled++;
+                        return i;
+                    }
+                }
+            }
+        };
+    }
+}
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSampling.java
------------------------------------------------------------------------------
svn:eol-style = native
Added:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
URL:
http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java?rev=1848181&view=auto
==============================================================================
---
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
(added)
+++
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
Wed Dec 5 00:21:17 2018
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.plugins.index.lucene.util;
+
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.Iterators;
+import org.junit.Test;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+
+import static com.google.common.collect.Lists.newArrayList;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TapeSamplingTest {
+    /**
+     * With a generator that always returns the highest possible value, an item
+     * is only picked once the number of remaining items equals the number of
+     * samples still needed, so the last {@code k} items must be returned.
+     */
+    @Test
+    public void testWithHighestRandom() {
+        final int start = 10;
+        final int end = 30;
+        final int k = 10;
+        final Random r = new Random() {
+            @Override
+            public int nextInt(int i) {
+                return i - 1;
+            }
+        };
+
+        List<Integer> input = range(start, end);
+        TapeSampling<Integer> res = new TapeSampling<>(r, input.iterator(), input.size(), k);
+
+        List<Integer> samples = newArrayList(res.getSamples());
+        List<Integer> expected = range(end - k + 1, end);
+
+        assertEquals(expected, samples);
+    }
+
+    /**
+     * With a generator that always returns 0, every item is picked until
+     * {@code k} items were sampled, so the first {@code k} items must be
+     * returned.
+     */
+    @Test
+    public void testWithLowestRandom() {
+        final int start = 10;
+        final int end = 30;
+        final int k = 10;
+        final Random r = new Random() {
+            @Override
+            public int nextInt(int i) {
+                return 0;
+            }
+        };
+
+        List<Integer> input = range(start, end);
+        TapeSampling<Integer> res = new TapeSampling<>(r, input.iterator(), input.size(), k);
+
+        List<Integer> samples = newArrayList(res.getSamples());
+        List<Integer> expected = range(start, start + k - 1);
+
+        assertEquals(expected, samples);
+    }
+
+    /**
+     * When {@code k} equals the input size, all items must be returned in
+     * input order, whatever the random generator produces.
+     */
+    @Test
+    public void allItemsWhenKisN() {
+        final int start = 11;
+        final int end = 20;
+        final int k = 10;
+        final Random r = new Random();
+
+        List<Integer> input = range(start, end);
+        TapeSampling<Integer> res = new TapeSampling<>(r, input.iterator(), input.size(), k);
+
+        List<Integer> samples = newArrayList(res.getSamples());
+        List<Integer> expected = input;
+
+        assertEquals(expected, samples);
+    }
+
+    /**
+     * The number of returned samples must be exactly {@code k} for an input
+     * that is much larger than {@code k}.
+     */
+    @Test
+    public void sampleExactlyK() {
+        final int start = 11;
+        final int end = 1000;
+        final int k = 10;
+        final Random r = new Random();
+
+        List<Integer> input = range(start, end);
+        TapeSampling<Integer> res = new TapeSampling<>(r, input.iterator(), input.size(), k);
+
+        assertEquals("Must sample exactly " + k + " items", k, Iterators.size(res.getSamples()));
+    }
+
+    /**
+     * Runs the sampling many times with a fixed seed and verifies that every
+     * item is picked about equally often (within 10% of the expected count).
+     */
+    @Test
+    public void sampleBias() {
+        int size = 200;
+        int k = 20;
+        int[] counts = new int[size];
+        Random r = new Random(42);
+        int testCount = 100 * size;
+        for (int i = 0; i < testCount; i++) {
+            List<Integer> input = range(0, size - 1);
+            TapeSampling<Integer> res = new TapeSampling<>(r, input.iterator(), input.size(), k);
+            Iterator<Integer> it = res.getSamples();
+            while (it.hasNext()) {
+                counts[it.next()]++;
+            }
+        }
+        // Each item is expected in k out of size samples per run. Multiply
+        // before dividing so the value stays right when size is not a
+        // multiple of k.
+        int expectedCount = testCount * k / size;
+        for (int i = 0; i < size; i++) {
+            assertTrue("Item " + i + " was sampled " + counts[i]
+                            + " times; expected about " + expectedCount,
+                    counts[i] > expectedCount * 0.9 && counts[i] < expectedCount * 1.1);
+        }
+    }
+
+    /**
+     * Creates the list of consecutive integers from {@code start} to
+     * {@code end}, both inclusive. The list is empty if {@code start} is
+     * greater than {@code end}.
+     */
+    private static List<Integer> range(final int start, final int end) {
+        Iterator<Integer> iter = new AbstractIterator<Integer>() {
+            int curr = start;
+
+            @Override
+            protected Integer computeNext() {
+                if (curr > end) {
+                    return endOfData();
+                }
+
+                return curr++;
+            }
+        };
+
+        return newArrayList(iter);
+    }
+}
Propchange:
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/util/TapeSamplingTest.java
------------------------------------------------------------------------------
svn:eol-style = native