http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/pom.xml ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/pom.xml b/standalone-metastore/metastore-common/pom.xml index d6df15f..b0d9eee 100644 --- a/standalone-metastore/metastore-common/pom.xml +++ b/standalone-metastore/metastore-common/pom.xml @@ -170,42 +170,17 @@ <artifactId>libthrift</artifactId> </dependency> <dependency> - <groupId>org.datanucleus</groupId> - <artifactId>datanucleus-api-jdo</artifactId> - </dependency> - <dependency> - <groupId>org.datanucleus</groupId> - <artifactId>datanucleus-core</artifactId> - </dependency> - <dependency> - <groupId>org.datanucleus</groupId> - <artifactId>datanucleus-rdbms</artifactId> - </dependency> - <dependency> - <groupId>org.datanucleus</groupId> - <artifactId>javax.jdo</artifactId> - </dependency> - <dependency> <groupId>org.skyscreamer</groupId> <artifactId>jsonassert</artifactId> <scope>test</scope> </dependency> <dependency> - <groupId>sqlline</groupId> - <artifactId>sqlline</artifactId> - </dependency> - <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId> </dependency> <!-- test scope dependencies --> <dependency> - <groupId>com.microsoft.sqlserver</groupId> - <artifactId>mssql-jdbc</artifactId> - <scope>test</scope> - </dependency> - <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> @@ -215,18 +190,6 @@ <artifactId>mockito-core</artifactId> <scope>test</scope> </dependency> - <dependency> - <!-- Note, this is LGPL. But we're only using it in a test and not changing it, so I - believe we are fine. --> - <groupId>org.mariadb.jdbc</groupId> - <artifactId>mariadb-java-client</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.postgresql</groupId> - <artifactId>postgresql</artifactId> - <scope>test</scope> - </dependency> </dependencies> <profiles> @@ -360,48 +323,6 @@ </plugins> </reporting> </profile> - <!-- - <profile> - <id>checkin</id> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-surefire-plugin</artifactId> - <version>${maven.surefire.version}</version> - <configuration> - <includes> - <include>**/Test*</include> - </includes> - <redirectTestOutputToFile>true</redirectTestOutputToFile> - <reuseForks>false</reuseForks> - <forkCount>${test.forkcount}</forkCount> - <argLine>-Xmx2048m</argLine> - <failIfNoTests>false</failIfNoTests> - <systemPropertyVariables> - <build.dir>${project.build.directory}</build.dir> - <datanucleus.schema.autoCreateAll>true</datanucleus.schema.autoCreateAll> - <derby.version>${derby.version}</derby.version> - <derby.stream.error.file>${test.tmp.dir}/derby.log</derby.stream.error.file> - <log4j.debug>true</log4j.debug> - <java.io.tmpdir>${test.tmp.dir}</java.io.tmpdir> - <javax.jdo.option.ConnectionURL>jdbc:derby:memory:${test.tmp.dir}/junit_metastore_db;create=true</javax.jdo.option.ConnectionURL> - <metastore.schema.verification>false</metastore.schema.verification> - <test.tmp.dir>${test.tmp.dir}</test.tmp.dir> - <metastore.warehouse.dir>${test.warehouse.scheme}${test.warehouse.dir}</metastore.warehouse.dir> - </systemPropertyVariables> - <additionalClasspathElements> - <additionalClasspathElement>${log4j.conf.dir}</additionalClasspathElement> - </additionalClasspathElements> - </configuration> - - </plugin> - - - </plugins> - </build> - </profile> - --> </profiles> <build> @@ -490,14 +411,6 @@ <goals> <goal>run</goal> </goals> - <configuration> - <target> - <mkdir dir="${test.tmp.dir}/scripts/metastore/upgrade" /> - <copy todir="${test.tmp.dir}/scripts/metastore/upgrade"> - <fileset dir="${basedir}/src/main/sql/"/> - </copy> - </target> - </configuration> </execution> </executions> </plugin> @@ -692,47 +605,6 @@ </executions> </plugin> <plugin> - <groupId>org.codehaus.mojo</groupId> - <artifactId>exec-maven-plugin</artifactId> - <executions> - <execution> - <phase>prepare-package</phase> - <goals> - <goal>exec</goal> - </goals> - <configuration> - <executable>java</executable> - <arguments> - <argument>-classpath</argument> - <classpath/> - <argument>org.apache.hadoop.hive.metastore.conf.ConfTemplatePrinter</argument> - <argument>${project.build.directory}/generated-sources/conf/metastore-site.xml.template</argument> - </arguments> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.datanucleus</groupId> - <artifactId>datanucleus-maven-plugin</artifactId> - <version>4.0.5</version> - <configuration> - <api>JDO</api> - <verbose>false</verbose> - <log4jConfiguration>${basedir}/src/main/resources/datanucleus-log4j.properties</log4jConfiguration> - <metadataIncludes>**/*.jdo</metadataIncludes> - <fork>false</fork> - </configuration> - <executions> - <execution> - <phase>process-classes</phase> - <goals> - <goal>enhance</goal> - </goals> - </execution> - </executions> - </plugin> - <plugin> <groupId>org.antlr</groupId> <artifactId>antlr3-maven-plugin</artifactId> <version>${antlr.version}</version>
http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/assembly/bin.xml ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/assembly/bin.xml b/standalone-metastore/metastore-common/src/assembly/bin.xml index 81912d7..5992a48 100644 --- a/standalone-metastore/metastore-common/src/assembly/bin.xml +++ b/standalone-metastore/metastore-common/src/assembly/bin.xml @@ -77,27 +77,6 @@ </fileSet> <fileSet> - <fileMode>755</fileMode> - <directory>${project.basedir}/src/main/scripts</directory> - <includes> - <include>base</include> - <include>schematool</include> - <include>start-metastore</include> - <include>metastore-config.sh</include> - <include>ext/**/*</include> - </includes> - <outputDirectory>bin</outputDirectory> - </fileSet> - - <fileSet> - <directory>${project.basedir}/src/main/sql</directory> - <includes> - <include>**/*</include> - </includes> - <outputDirectory>scripts/metastore/upgrade</outputDirectory> - </fileSet> - - <fileSet> <directory>${project.basedir}/src/gen/thrift/gen-php</directory> <includes> <include>**/*</include> @@ -118,19 +97,12 @@ <directory>${project.basedir}/src/main/resources/</directory> <fileMode>644</fileMode> <includes> - <include>metastore-site.xml</include> <include>metastore-log4j2.properties</include> </includes> <outputDirectory>conf</outputDirectory> </fileSet> </fileSets> - <files> - <file> - <source>${project.build.directory}/generated-sources/conf/metastore-site.xml.template</source> - <outputDirectory>conf</outputDirectory> - </file> - </files> </assembly> http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java deleted file mode 100644 index a7ca05a..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/StatsSetupConst.java +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.common; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.TreeMap; - -import com.google.common.collect.ImmutableList; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.metastore.conf.MetastoreConf; -import org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.annotation.JsonProperty; -import com.fasterxml.jackson.core.JsonGenerator; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.JsonDeserializer; -import com.fasterxml.jackson.databind.JsonSerializer; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.ObjectReader; -import com.fasterxml.jackson.databind.ObjectWriter; -import com.fasterxml.jackson.databind.SerializerProvider; -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; -import com.fasterxml.jackson.databind.annotation.JsonSerialize; - - -/** - * A class that defines the constant strings used by the statistics implementation. - */ - -public class StatsSetupConst { - - protected static final Logger LOG = LoggerFactory.getLogger(StatsSetupConst.class.getName()); - - public enum StatDB { - fs { - @Override - public String getPublisher(Configuration conf) { - return "org.apache.hadoop.hive.ql.stats.fs.FSStatsPublisher"; - } - - @Override - public String getAggregator(Configuration conf) { - return "org.apache.hadoop.hive.ql.stats.fs.FSStatsAggregator"; - } - }, - custom { - @Override - public String getPublisher(Configuration conf) { - return MetastoreConf.getVar(conf, ConfVars.STATS_DEFAULT_PUBLISHER); } - @Override - public String getAggregator(Configuration conf) { - return MetastoreConf.getVar(conf, ConfVars.STATS_DEFAULT_AGGREGATOR); } - }; - public abstract String getPublisher(Configuration conf); - public abstract String getAggregator(Configuration conf); - } - - // statistics stored in metastore - /** - * The name of the statistic Num Files to be published or gathered. - */ - public static final String NUM_FILES = "numFiles"; - - /** - * The name of the statistic Num Partitions to be published or gathered. - */ - public static final String NUM_PARTITIONS = "numPartitions"; - - /** - * The name of the statistic Total Size to be published or gathered. - */ - public static final String TOTAL_SIZE = "totalSize"; - - /** - * The name of the statistic Row Count to be published or gathered. - */ - public static final String ROW_COUNT = "numRows"; - - public static final String RUN_TIME_ROW_COUNT = "runTimeNumRows"; - - /** - * The name of the statistic Raw Data Size to be published or gathered. - */ - public static final String RAW_DATA_SIZE = "rawDataSize"; - - /** - * The name of the statistic for Number of Erasure Coded Files - to be published or gathered. - */ - public static final String NUM_ERASURE_CODED_FILES = "numFilesErasureCoded"; - - /** - * Temp dir for writing stats from tasks. - */ - public static final String STATS_TMP_LOC = "hive.stats.tmp.loc"; - - public static final String STATS_FILE_PREFIX = "tmpstats-"; - /** - * List of all supported statistics - */ - public static final List<String> SUPPORTED_STATS = ImmutableList.of( - NUM_FILES, ROW_COUNT, TOTAL_SIZE, RAW_DATA_SIZE, NUM_ERASURE_CODED_FILES); - - /** - * List of all statistics that need to be collected during query execution. These are - * statistics that inherently require a scan of the data. - */ - public static final List<String> STATS_REQUIRE_COMPUTE = ImmutableList.of(ROW_COUNT, RAW_DATA_SIZE); - - /** - * List of statistics that can be collected quickly without requiring a scan of the data. - */ - public static final List<String> FAST_STATS = ImmutableList.of( - NUM_FILES, TOTAL_SIZE, NUM_ERASURE_CODED_FILES); - - // This string constant is used to indicate to AlterHandler that - // alterPartition/alterTable is happening via statsTask or via user. - public static final String STATS_GENERATED = "STATS_GENERATED"; - - public static final String TASK = "TASK"; - - public static final String USER = "USER"; - - // This string constant is used by AlterHandler to figure out that it should not attempt to - // update stats. It is set by any client-side task which wishes to signal that no stats - // update should take place, such as with replication. - public static final String DO_NOT_UPDATE_STATS = "DO_NOT_UPDATE_STATS"; - - //This string constant will be persisted in metastore to indicate whether corresponding - //table or partition's statistics and table or partition's column statistics are accurate or not. - public static final String COLUMN_STATS_ACCURATE = "COLUMN_STATS_ACCURATE"; - - public static final String COLUMN_STATS = "COLUMN_STATS"; - - public static final String BASIC_STATS = "BASIC_STATS"; - - public static final String CASCADE = "CASCADE"; - - public static final String TRUE = "true"; - - public static final String FALSE = "false"; - - // The parameter keys for the table statistics. Those keys are excluded from 'show create table' command output. - public static final List<String> TABLE_PARAMS_STATS_KEYS = ImmutableList.of( - COLUMN_STATS_ACCURATE, NUM_FILES, TOTAL_SIZE, ROW_COUNT, RAW_DATA_SIZE, NUM_PARTITIONS, - NUM_ERASURE_CODED_FILES); - - private static class ColumnStatsAccurate { - private static ObjectReader objectReader; - private static ObjectWriter objectWriter; - - static { - ObjectMapper objectMapper = new ObjectMapper(); - objectReader = objectMapper.readerFor(ColumnStatsAccurate.class); - objectWriter = objectMapper.writerFor(ColumnStatsAccurate.class); - } - - static class BooleanSerializer extends JsonSerializer<Boolean> { - - @Override - public void serialize(Boolean value, JsonGenerator jsonGenerator, - SerializerProvider serializerProvider) throws IOException { - jsonGenerator.writeString(value.toString()); - } - } - - static class BooleanDeserializer extends JsonDeserializer<Boolean> { - - public Boolean deserialize(JsonParser jsonParser, - DeserializationContext deserializationContext) - throws IOException { - return Boolean.valueOf(jsonParser.getValueAsString()); - } - } - - @JsonInclude(JsonInclude.Include.NON_DEFAULT) - @JsonSerialize(using = BooleanSerializer.class) - @JsonDeserialize(using = BooleanDeserializer.class) - @JsonProperty(BASIC_STATS) - boolean basicStats; - - @JsonInclude(JsonInclude.Include.NON_EMPTY) - @JsonProperty(COLUMN_STATS) - @JsonSerialize(contentUsing = BooleanSerializer.class) - @JsonDeserialize(contentUsing = BooleanDeserializer.class) - TreeMap<String, Boolean> columnStats = new TreeMap<>(); - - } - - public static boolean areBasicStatsUptoDate(Map<String, String> params) { - if (params == null) { - return false; - } - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - return stats.basicStats; - } - - public static boolean areColumnStatsUptoDate(Map<String, String> params, String colName) { - if (params == null) { - return false; - } - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - return stats.columnStats.containsKey(colName); - } - - // It will only throw JSONException when stats.put(BASIC_STATS, TRUE) - // has duplicate key, which is not possible - // note that set basic stats false will wipe out column stats too. - public static void setBasicStatsState(Map<String, String> params, String setting) { - if (setting.equals(FALSE)) { - if (params!=null && params.containsKey(COLUMN_STATS_ACCURATE)) { - params.remove(COLUMN_STATS_ACCURATE); - } - return; - } - if (params == null) { - throw new RuntimeException("params are null...cant set columnstatstate!"); - } - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - stats.basicStats = true; - try { - params.put(COLUMN_STATS_ACCURATE, ColumnStatsAccurate.objectWriter.writeValueAsString(stats)); - } catch (JsonProcessingException e) { - throw new RuntimeException("can't serialize column stats", e); - } - } - - public static void setColumnStatsState(Map<String, String> params, List<String> colNames) { - if (params == null) { - throw new RuntimeException("params are null...cant set columnstatstate!"); - } - if (colNames == null) { - return; - } - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - - for (String colName : colNames) { - if (!stats.columnStats.containsKey(colName)) { - stats.columnStats.put(colName, true); - } - } - try { - params.put(COLUMN_STATS_ACCURATE, ColumnStatsAccurate.objectWriter.writeValueAsString(stats)); - } catch (JsonProcessingException e) { - LOG.trace(e.getMessage()); - } - } - - public static boolean canColumnStatsMerge(Map<String, String> params, String colName) { - if (params == null) { - return false; - } - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - return stats.columnStats.containsKey(colName); - } - - public static void clearColumnStatsState(Map<String, String> params) { - if (params == null) { - return; - } - - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - stats.columnStats.clear(); - - try { - params.put(COLUMN_STATS_ACCURATE, ColumnStatsAccurate.objectWriter.writeValueAsString(stats)); - } catch (JsonProcessingException e) { - LOG.trace(e.getMessage()); - } - } - - public static void removeColumnStatsState(Map<String, String> params, List<String> colNames) { - if (params == null) { - return; - } - try { - ColumnStatsAccurate stats = parseStatsAcc(params.get(COLUMN_STATS_ACCURATE)); - for (String string : colNames) { - stats.columnStats.remove(string); - } - params.put(COLUMN_STATS_ACCURATE, ColumnStatsAccurate.objectWriter.writeValueAsString(stats)); - } catch (JsonProcessingException e) { - LOG.trace(e.getMessage()); - } - } - - public static void setStatsStateForCreateTable(Map<String, String> params, - List<String> cols, String setting) { - if (TRUE.equals(setting)) { - for (String stat : StatsSetupConst.SUPPORTED_STATS) { - params.put(stat, "0"); - } - } - setBasicStatsState(params, setting); - if (TRUE.equals(setting)) { - setColumnStatsState(params, cols); - } - } - - private static ColumnStatsAccurate parseStatsAcc(String statsAcc) { - if (statsAcc == null) { - return new ColumnStatsAccurate(); - } - try { - return ColumnStatsAccurate.objectReader.readValue(statsAcc); - } catch (Exception e) { - ColumnStatsAccurate ret = new ColumnStatsAccurate(); - if (TRUE.equalsIgnoreCase(statsAcc)) { - ret.basicStats = true; - } - return ret; - } - } -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimator.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimator.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimator.java deleted file mode 100644 index 668db10..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimator.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.common.ndv; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.util.JavaDataModel; - -public interface NumDistinctValueEstimator { - - Logger LOG = LoggerFactory.getLogger(NumDistinctValueEstimator.class.getName()); - - void reset(); - - byte[] serialize(); - - NumDistinctValueEstimator deserialize(byte[] buf); - - void addToEstimator(long v); - - void addToEstimator(double d); - - void addToEstimator(String s); - - void addToEstimator(HiveDecimal decimal); - - void mergeEstimators(NumDistinctValueEstimator o); - - long estimateNumDistinctValues(); - - int lengthFor(JavaDataModel model); - - boolean canMerge(NumDistinctValueEstimator o); - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java deleted file mode 100644 index b630fa3..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.hadoop.hive.common.ndv; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.hadoop.hive.common.ndv.fm.FMSketch; -import org.apache.hadoop.hive.common.ndv.fm.FMSketchUtils; -import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; -import org.apache.hadoop.hive.common.ndv.hll.HyperLogLogUtils; - -public class NumDistinctValueEstimatorFactory { - - private NumDistinctValueEstimatorFactory() { - } - - private static boolean isFMSketch(byte[] buf) throws IOException { - byte[] magic = new byte[2]; - magic[0] = (byte) buf[0]; - magic[1] = (byte) buf[1]; - return Arrays.equals(magic, FMSketchUtils.MAGIC); - } - - public static NumDistinctValueEstimator getNumDistinctValueEstimator(byte[] buf) { - // Right now we assume only FM and HLL are available. - try { - if (isFMSketch(buf)) { - return FMSketchUtils.deserializeFM(buf); - } else { - return HyperLogLogUtils.deserializeHLL(buf); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator( - NumDistinctValueEstimator n) { - if (n instanceof FMSketch) { - return new FMSketch(((FMSketch) n).getNumBitVectors()); - } else { - return HyperLogLog.builder().setSizeOptimized().build(); - } - } - - public static NumDistinctValueEstimator getEmptyNumDistinctValueEstimator(String func, - int numBitVectors) { - if ("fm".equals(func.toLowerCase())) { - return new FMSketch(numBitVectors); - } else if ("hll".equals(func.toLowerCase())) { - return HyperLogLog.builder().setSizeOptimized().build(); - } else { - throw new RuntimeException("Can not recognize " + func); - } - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java deleted file mode 100644 index f6cdc4c..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketch.java +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.common.ndv.fm; - -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Random; - -import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.ql.util.JavaDataModel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javolution.util.FastBitSet; - -public class FMSketch implements NumDistinctValueEstimator { - - static final Logger LOG = LoggerFactory.getLogger(FMSketch.class.getName()); - - /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number. - * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1. - * If a,b,x didn't come from a finite field ax1 + b mod k and ax2 + b mod k will not be pair wise - * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1 - * thus introducing errors in the estimates. - */ - public static final int BIT_VECTOR_SIZE = 31; - - // Refer to Flajolet-Martin'86 for the value of phi - private static final double PHI = 0.77351; - - private final int[] a; - private final int[] b; - private final FastBitSet[] bitVector; - - private final Random aValue; - private final Random bValue; - - private int numBitVectors; - - /* Create a new distinctValueEstimator - */ - public FMSketch(int numBitVectors) { - this.numBitVectors = numBitVectors; - bitVector = new FastBitSet[numBitVectors]; - for (int i=0; i< numBitVectors; i++) { - bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE); - } - - a = new int[numBitVectors]; - b = new int[numBitVectors]; - - /* Use a large prime number as a seed to the random number generator. - * Java's random number generator uses the Linear Congruential Generator to generate random - * numbers using the following recurrence relation, - * - * X(n+1) = (a X(n) + c ) mod m - * - * where X0 is the seed. Java implementation uses m = 2^48. This is problematic because 2^48 - * is not a prime number and hence the set of numbers from 0 to m don't form a finite field. - * If these numbers don't come from a finite field any give X(n) and X(n+1) may not be pair - * wise independent. - * - * However, empirically passing in prime numbers as seeds seems to work better than when passing - * composite numbers as seeds. Ideally Java's Random should pick m such that m is prime. - * - */ - aValue = new Random(99397); - bValue = new Random(9876413); - - for (int i = 0; i < numBitVectors; i++) { - int randVal; - /* a and b shouldn't be even; If a and b are even, then none of the values - * will set bit 0 thus introducing errors in the estimate. Both a and b can be even - * 25% of the times and as a result 25% of the bit vectors could be inaccurate. To avoid this - * always pick odd values for a and b. - */ - do { - randVal = aValue.nextInt(); - } while (randVal % 2 == 0); - - a[i] = randVal; - - do { - randVal = bValue.nextInt(); - } while (randVal % 2 == 0); - - b[i] = randVal; - - if (a[i] < 0) { - a[i] = a[i] + (1 << BIT_VECTOR_SIZE - 1); - } - - if (b[i] < 0) { - b[i] = b[i] + (1 << BIT_VECTOR_SIZE - 1); - } - } - } - - /** - * Resets a distinctValueEstimator object to its original state. - */ - public void reset() { - for (int i=0; i< numBitVectors; i++) { - bitVector[i].clear(); - } - } - - public FastBitSet getBitVector(int index) { - return bitVector[index]; - } - - public FastBitSet setBitVector(FastBitSet fastBitSet, int index) { - return bitVector[index] = fastBitSet; - } - - public int getNumBitVectors() { - return numBitVectors; - } - - public int getBitVectorSize() { - return BIT_VECTOR_SIZE; - } - - public void printNumDistinctValueEstimator() { - String t = new String(); - - LOG.debug("NumDistinctValueEstimator"); - LOG.debug("Number of Vectors: {}", numBitVectors); - LOG.debug("Vector Size: {}", BIT_VECTOR_SIZE); - - for (int i=0; i < numBitVectors; i++) { - t = t + bitVector[i].toString(); - } - - LOG.debug("Serialized Vectors: "); - LOG.debug(t); - } - - @Override - public byte[] serialize() { - ByteArrayOutputStream bos = new ByteArrayOutputStream(); - // write bytes to bos ... - try { - FMSketchUtils.serializeFM(bos, this); - final byte[] result = bos.toByteArray(); - bos.close(); - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Override - public NumDistinctValueEstimator deserialize(byte[] buf) { - InputStream is = new ByteArrayInputStream(buf); - try { - NumDistinctValueEstimator n = FMSketchUtils.deserializeFM(is); - is.close(); - return n; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - private int generateHash(long v, int hashNum) { - int mod = (1<<BIT_VECTOR_SIZE) - 1; - long tempHash = a[hashNum] * v + b[hashNum]; - tempHash %= mod; - int hash = (int) tempHash; - - /* Hash function should map the long value to 0...2^L-1. - * Hence hash value has to be non-negative. - */ - if (hash < 0) { - hash = hash + mod; - } - return hash; - } - - private int generateHashForPCSA(long v) { - int mod = 1 << (BIT_VECTOR_SIZE - 1) - 1; - long tempHash = a[0] * v + b[0]; - tempHash %= mod; - int hash = (int) tempHash; - - /* Hash function should map the long value to 0...2^L-1. - * Hence hash value has to be non-negative. - */ - if (hash < 0) { - hash = hash + mod + 1; - } - return hash; - } - - public void addToEstimator(long v) { - /* Update summary bitVector : - * Generate hash value of the long value and mod it by 2^bitVectorSize-1. - * In this implementation bitVectorSize is 31. - */ - - for (int i = 0; i<numBitVectors; i++) { - int hash = generateHash(v,i); - int index; - - // Find the index of the least significant bit that is 1 - for (index=0; index<BIT_VECTOR_SIZE; index++) { - if (hash % 2 != 0) { - break; - } - hash = hash >> 1; - } - - // Set bitvector[index] := 1 - bitVector[i].set(index); - } - } - - public void addToEstimatorPCSA(long v) { - int hash = generateHashForPCSA(v); - int rho = hash/numBitVectors; - int index; - - // Find the index of the least significant bit that is 1 - for (index=0; index<BIT_VECTOR_SIZE; index++) { - if (rho % 2 != 0) { - break; - } - rho = rho >> 1; - } - - // Set bitvector[index] := 1 - bitVector[hash%numBitVectors].set(index); - } - - public void addToEstimator(double d) { - int v = new Double(d).hashCode(); - addToEstimator(v); - } - - public void addToEstimatorPCSA(double d) { - int v = new Double(d).hashCode(); - addToEstimatorPCSA(v); - } - - public void addToEstimator(HiveDecimal decimal) { - int v = decimal.hashCode(); - addToEstimator(v); - } - - public void addToEstimatorPCSA(HiveDecimal decimal) { - int v = decimal.hashCode(); - addToEstimatorPCSA(v); - } - - public void mergeEstimators(FMSketch o) { - // Bitwise OR the bitvector with the bitvector in the agg buffer - for (int i=0; i<numBitVectors; i++) { - bitVector[i].or(o.getBitVector(i)); - } - } - - public long estimateNumDistinctValuesPCSA() { - double numDistinctValues = 0.0; - long S = 0; - - for (int i=0; i < numBitVectors; i++) { - int index = 0; - while (bitVector[i].get(index) && index < BIT_VECTOR_SIZE) { - index = index + 1; - } - S = S + index; - } - - numDistinctValues = ((numBitVectors/PHI) * Math.pow(2.0, S/numBitVectors)); - return ((long)numDistinctValues); - } - - /* We use the Flajolet-Martin estimator to estimate the number of distinct values.FM uses the - * location of the least significant zero as an estimate of log2(phi*ndvs). - */ - public long estimateNumDistinctValues() { - int sumLeastSigZero = 0; - double avgLeastSigZero; - double numDistinctValues; - - for (int i=0; i< numBitVectors; i++) { - int leastSigZero = bitVector[i].nextClearBit(0); - sumLeastSigZero += leastSigZero; - } - - avgLeastSigZero = - sumLeastSigZero/(numBitVectors * 1.0) - (Math.log(PHI)/Math.log(2.0)); - numDistinctValues = Math.pow(2.0, avgLeastSigZero); - return ((long)(numDistinctValues)); - } - - @InterfaceAudience.LimitedPrivate(value = {"Hive" }) - static int lengthFor(JavaDataModel model, Integer numVector) { - int length = model.object(); - length += model.primitive1() * 2; // two int - length += model.primitive2(); // one double - length += model.lengthForRandom() * 2; // two Random - - if (numVector == null) { - numVector = 16; // HiveConf hive.stats.ndv.error default produces 16 vectors - } - - if (numVector > 0) { - length += model.array() * 3; // three array - length += model.primitive1() * numVector * 2; // two int array - length += (model.object() + model.array() + model.primitive1() + - model.primitive2()) * numVector; // bitset array - } - return length; - } - - public int lengthFor(JavaDataModel model) { - return lengthFor(model, getNumBitVectors()); - } - - // the caller needs to gurrantee that they are the same type based on numBitVectors - @Override - public void mergeEstimators(NumDistinctValueEstimator o) { - // Bitwise OR the bitvector with the bitvector in the agg buffer - for (int i = 0; i < numBitVectors; i++) { - bitVector[i].or(((FMSketch) o).getBitVector(i)); - } - } - - @Override - public void addToEstimator(String s) { - int v = s.hashCode(); - addToEstimator(v); - } - - @Override - public boolean canMerge(NumDistinctValueEstimator o) { - return o instanceof FMSketch && this.numBitVectors == ((FMSketch) o).numBitVectors; - } -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java deleted file mode 100644 index 02c64b8..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.common.ndv.fm; - -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Arrays; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javolution.util.FastBitSet; - -public class FMSketchUtils { - - static final Logger LOG = LoggerFactory.getLogger(FMSketch.class.getName()); - public static final byte[] MAGIC = new byte[] { 'F', 'M' }; - - /* - * Serializes a distinctValueEstimator object to Text for transport. - * - * <b>4 byte header</b> is encoded like below 2 bytes - FM magic string to - * identify serialized stream 2 bytes - numbitvectors because - * BIT_VECTOR_SIZE=31, 4 bytes are enough to hold positions of 0-31 - */ - public static void serializeFM(OutputStream out, FMSketch fm) throws IOException { - out.write(MAGIC); - - // max of numBitVectors = 1024, 2 bytes is enough. - byte[] nbv = new byte[2]; - nbv[0] = (byte) fm.getNumBitVectors(); - nbv[1] = (byte) (fm.getNumBitVectors() >>> 8); - - out.write(nbv); - - // original toString takes too much space - // we compress a fastbitset to 4 bytes - for (int i = 0; i < fm.getNumBitVectors(); i++) { - writeBitVector(out, fm.getBitVector(i)); - } - } - - // BIT_VECTOR_SIZE is 31, we can use 32 bits, i.e., 4 bytes to represent a - // FastBitSet, rather than using 31 integers. - private static void writeBitVector(OutputStream out, FastBitSet bit) throws IOException { - int num = 0; - for (int pos = 0; pos < FMSketch.BIT_VECTOR_SIZE; pos++) { - if (bit.get(pos)) { - num |= 1 << pos; - } - } - byte[] i = new byte[4]; - for (int j = 0; j < 4; j++) { - i[j] = (byte) ((num >>> (8 * j)) & 0xff); - } - out.write(i); - } - - /* - * Deserializes from string to FastBitSet; Creates a NumDistinctValueEstimator - * object and returns it. - */ - public static FMSketch deserializeFM(byte[] buf) throws IOException { - InputStream is = new ByteArrayInputStream(buf); - try { - FMSketch sketch = deserializeFM(is); - is.close(); - return sketch; - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public static FMSketch deserializeFM(InputStream in) throws IOException { - checkMagicString(in); - - byte[] nbv = new byte[2]; - nbv[0] = (byte) in.read(); - nbv[1] = (byte) in.read(); - - int numBitVectors = 0; - numBitVectors |= (nbv[0] & 0xff); - numBitVectors |= ((nbv[1] & 0xff) << 8); - - FMSketch sketch = new FMSketch(numBitVectors); - for (int n = 0; n < numBitVectors; n++) { - sketch.setBitVector(readBitVector(in), n); - } - return sketch; - } - - private static FastBitSet readBitVector(InputStream in) throws IOException { - FastBitSet fastBitSet = new FastBitSet(); - fastBitSet.clear(); - for (int i = 0; i < 4; i++) { - byte b = (byte) in.read(); - for (int j = 0; j < 8; j++) { - if ((b & (1 << j)) != 0) { - fastBitSet.set(j + 8 * i); - } - } - } - return fastBitSet; - } - - private static void checkMagicString(InputStream in) throws IOException { - byte[] magic = new byte[2]; - magic[0] = (byte) in.read(); - magic[1] = (byte) in.read(); - - if (!Arrays.equals(magic, MAGIC)) { - throw new IllegalArgumentException("The input stream is not a FMSketch stream."); - } - } -}