kasakrisz commented on code in PR #4228:
URL: https://github.com/apache/hive/pull/4228#discussion_r1176442212
##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java:
##########
@@ -921,14 +924,23 @@ public Table toTable(HiveConf conf) throws HiveException {
// When replicating the statistics for a table will be obtained from the
source. Do not
// reset it on replica.
if (replicationSpec == null || !replicationSpec.isInReplicationScope()) {
- if (!this.isCTAS && (tbl.getPath() == null || (!isExternal() &&
tbl.isEmpty()))) {
- if (!tbl.isPartitioned() &&
conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
- MetaStoreUtils.getColumnNames(tbl.getCols()),
StatsSetupConst.TRUE);
- }
- } else {
-
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
null,
- StatsSetupConst.FALSE);
+ // Remove COLUMN_STATS_ACCURATE=true from table's parameter, let the HMS
determine if
+ // there is need to add column stats dependent on the table's location
in case the metastore transformer
+ // sets or alters the table's location.
+
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
null,
+ StatsSetupConst.FALSE);
+ if (!this.isCTAS && !tbl.isPartitioned() && !tbl.isTemporary() &&
+ conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+ // Put the flag into the dictionary in order not to pollute the table,
+ // ObjectDictionary is meant to convey repeatitive messages.
+ ObjectDictionary dictionary = tbl.getTTable().isSetDictionary() ?
+ tbl.getTTable().getDictionary() : new ObjectDictionary();
+ String cols =
MetaStoreUtils.getColumnNames(tbl.getCols()).stream().collect(Collectors.joining("\0"));
Review Comment:
Please extract `"\0"` to a constant.
##########
standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/utils/MetaStoreServerUtils.java:
##########
@@ -508,6 +511,48 @@ public static void clearQuickStats(Map<String, String>
params) {
params.remove(StatsSetupConst.NUM_ERASURE_CODED_FILES);
}
+ public static void updateTableStatsForCreateTable(Warehouse wh, Database db,
Table tbl,
+ EnvironmentContext envContext, Configuration conf, Path tblPath, boolean
newDir)
+ throws MetaException {
+ // If the created table is a view, skip generating the stats
+ if (MetaStoreUtils.isView(tbl)) {
+ return;
+ }
+ assert tblPath != null;
+ if (tbl.isSetDictionary() && tbl.getDictionary().getValues() != null) {
+ List<java.nio.ByteBuffer> values = tbl.getDictionary().getValues().
+ remove(StatsSetupConst.STATS_FOR_CREATE_TABLE);
+ java.nio.ByteBuffer buffer;
+ if (values != null && values.size() > 0 && (buffer =
values.get(0)).hasArray()) {
+ String val = new String(buffer.array(), StandardCharsets.UTF_8);
+ if (StatsSetupConst.TRUE.equals(val)) {
+ try {
+ boolean isIcebergTable =
+
HiveMetaHook.ICEBERG.equalsIgnoreCase(tbl.getParameters().get(HiveMetaHook.TABLE_TYPE));
+ PathFilter pathFilter = isIcebergTable ?
+ path -> !"metadata".equals(path.getName()) :
FileUtils.HIDDEN_FILES_PATH_FILTER;
Review Comment:
This part is too Iceberg specific. Is it possible to get this filter via
storage handler api?
##########
ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java:
##########
@@ -921,14 +924,23 @@ public Table toTable(HiveConf conf) throws HiveException {
// When replicating the statistics for a table will be obtained from the
source. Do not
// reset it on replica.
if (replicationSpec == null || !replicationSpec.isInReplicationScope()) {
- if (!this.isCTAS && (tbl.getPath() == null || (!isExternal() &&
tbl.isEmpty()))) {
- if (!tbl.isPartitioned() &&
conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
-
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
- MetaStoreUtils.getColumnNames(tbl.getCols()),
StatsSetupConst.TRUE);
- }
- } else {
-
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
null,
- StatsSetupConst.FALSE);
+ // Remove COLUMN_STATS_ACCURATE=true from table's parameter, let the HMS
determine if
+ // there is need to add column stats dependent on the table's location
in case the metastore transformer
+ // sets or alters the table's location.
+
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(),
null,
+ StatsSetupConst.FALSE);
+ if (!this.isCTAS && !tbl.isPartitioned() && !tbl.isTemporary() &&
+ conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
+ // Put the flag into the dictionary in order not to pollute the table,
+ // ObjectDictionary is meant to convey repeatitive messages.
+ ObjectDictionary dictionary = tbl.getTTable().isSetDictionary() ?
+ tbl.getTTable().getDictionary() : new ObjectDictionary();
+ String cols =
MetaStoreUtils.getColumnNames(tbl.getCols()).stream().collect(Collectors.joining("\0"));
+ List<java.nio.ByteBuffer> buffers = new ArrayList<>();
+
buffers.add(java.nio.ByteBuffer.wrap(StatsSetupConst.TRUE.getBytes(StandardCharsets.UTF_8)));
+
buffers.add(java.nio.ByteBuffer.wrap(cols.getBytes(StandardCharsets.UTF_8)));
+ dictionary.putToValues(StatsSetupConst.STATS_FOR_CREATE_TABLE,
buffers);
Review Comment:
Could you please use some serialization library here like
`ObjectOutputStream` or json? Maybe a new class could be introduced to
represent the data structure and makes easier to extend it in the future.
```
class StatsSetup {
private boolean enabled;
private List<String> columnNames;
}
```
##########
standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/Warehouse.java:
##########
@@ -500,8 +501,16 @@ public void recycleDirToCmPath(Path f, boolean ifPurge)
throws MetaException {
}
public boolean isEmptyDir(Path path) throws IOException, MetaException {
+ return isEmptyDir(path, null);
Review Comment:
Can you pass `p -> true` instead of `null` ?
Also it seems that `FileSystem.listStatus` has a bigger memory footprint
when a filter is passed:
https://github.com/apache/hadoop/blob/a3b9c37a397ad4188041dd80621bdeefc46885f2/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/FileSystem.java#L2011
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]