[ https://issues.apache.org/jira/browse/ACCUMULO-3913?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14732128#comment-14732128 ]
ASF GitHub Bot commented on ACCUMULO-3913: ------------------------------------------ Github user joshelser commented on a diff in the pull request: https://github.com/apache/accumulo/pull/46#discussion_r38815457 --- Diff: test/src/main/java/org/apache/accumulo/test/SampleIT.java --- @@ -0,0 +1,497 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.accumulo.test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; +import java.util.TreeMap; + +import org.apache.accumulo.core.client.AccumuloException; +import org.apache.accumulo.core.client.AccumuloSecurityException; +import org.apache.accumulo.core.client.BatchScanner; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.BatchWriterConfig; +import org.apache.accumulo.core.client.ClientSideIteratorScanner; +import org.apache.accumulo.core.client.Connector; +import org.apache.accumulo.core.client.IsolatedScanner; +import org.apache.accumulo.core.client.IteratorSetting; +import org.apache.accumulo.core.client.MutationsRejectedException; +import org.apache.accumulo.core.client.SampleNotPresentException; +import org.apache.accumulo.core.client.Scanner; +import org.apache.accumulo.core.client.ScannerBase; +import org.apache.accumulo.core.client.TableNotFoundException; +import org.apache.accumulo.core.client.admin.CompactionConfig; +import org.apache.accumulo.core.client.admin.NewTableConfiguration; +import org.apache.accumulo.core.client.admin.SamplerConfiguration; +import org.apache.accumulo.core.client.impl.Credentials; +import org.apache.accumulo.core.client.impl.OfflineScanner; +import org.apache.accumulo.core.data.ByteSequence; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Range; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.core.iterators.IteratorEnvironment; +import org.apache.accumulo.core.iterators.SortedKeyValueIterator; +import org.apache.accumulo.core.iterators.WrappingIterator; +import org.apache.accumulo.core.sample.RowSampler; +import 
org.apache.accumulo.core.security.Authorizations; +import org.apache.accumulo.harness.AccumuloClusterHarness; +import org.junit.Assert; +import org.junit.Test; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Iterables; + +public class SampleIT extends AccumuloClusterHarness { + + private static final Map<String,String> OPTIONS_1 = ImmutableMap.of("hasher", "murmur3_32", "modulus", "1009"); + private static final Map<String,String> OPTIONS_2 = ImmutableMap.of("hasher", "murmur3_32", "modulus", "997"); + + private static final SamplerConfiguration SC1 = new SamplerConfiguration(RowSampler.class.getName()).setOptions(OPTIONS_1); + private static final SamplerConfiguration SC2 = new SamplerConfiguration(RowSampler.class.getName()).setOptions(OPTIONS_2); + + public static class IteratorThatUsesSample extends WrappingIterator { --- End diff -- This is pretty cool, actually. I hadn't considered iterators benefiting as well as normal scans. > Add per table sampling > ---------------------- > > Key: ACCUMULO-3913 > URL: https://issues.apache.org/jira/browse/ACCUMULO-3913 > Project: Accumulo > Issue Type: Improvement > Reporter: Keith Turner > Fix For: 1.8.0 > > > I am working on prototyping adding hash based sampling to Accumulo. I am > trying to accomplish the following goals in the prototype. > # Have each RFile store a sample per locality group. Also store the > configuration used to generate the sample. > # Use sampling functions that ensure the same row columns exist across the > samples in all RFiles. Hash mod is a good candidate that gives a random > sample that's consistent across files. > # Have scanners support scanning RFile's samples sets. Scan should fail if > RFiles have different sample configuration. Different sampling config > implies the RFile's sample sets contain a possibly disjoint set of row > columns. 
> # Support generating sample data for RFiles generated for bulk import > # Support sample data in the memory map > # Support enabling and disabling sampling per table AND configuring a > sample function. > I am currently using the following function in my prototype to determine what > data an RFile stores in its sample set. This code will always select the same > subset of rows for each RFile's sample set. I have not yet made the function > configurable. > {code:java} > public class RowSampler implements Sampler { > private HashFunction hasher = Hashing.murmur3_32(); > @Override > public boolean accept(Key k) { > ByteSequence row = k.getRowData(); > HashCode hc = hasher.hashBytes(row.getBackingArray(), row.offset(), > row.length()); > return hc.asInt() % 1009 == 0; > } > } > {code} > Although not yet implemented, the divisor in this RowSampler could be > configurable. RFiles with sample data would store the fact that a RowSampler > with a divisor of 1009 was used to generate sample data. -- This message was sent by Atlassian JIRA (v6.3.4#6332)