This is an automated email from the ASF dual-hosted git repository.
abstractdog pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 7fa1f54 HIVE-23712: metadata-only queries return incorrect results
with empty acid partition (#1182) (Laszlo Bodor reviewed by Mustafa Iman,
Ashutosh Chauhan)
7fa1f54 is described below
commit 7fa1f546bc90f196a3b2a6c1d9897a0a0acae686
Author: Bodor Laszlo <[email protected]>
AuthorDate: Wed Oct 14 10:03:51 2020 +0200
HIVE-23712: metadata-only queries return incorrect results with empty acid
partition (#1182) (Laszlo Bodor reviewed by Mustafa Iman, Ashutosh Chauhan)
Change-Id: Ia63c4a502f77f863f1ee20e0a407893689d79afb
---
.../org/apache/hadoop/hive/ql/exec/Utilities.java | 19 +--
.../optimizer/physical/NullScanTaskDispatcher.java | 32 ++++-
.../metadataonly_acid_empty_partition.q | 22 ++++
.../llap/metadataonly_acid_empty_partition.q.out | 136 +++++++++++++++++++++
4 files changed, 198 insertions(+), 11 deletions(-)
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
index b2dde64..9b0f683 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java
@@ -2780,16 +2780,21 @@ public final class Utilities {
}
public static boolean isEmptyPath(Configuration job, Path dirPath) throws
IOException {
+ FileStatus[] fStats = listNonHiddenFileStatus(job, dirPath);
+ if (fStats.length > 0) {
+ return false;
+ }
+ return true;
+ }
+
+ public static FileStatus[] listNonHiddenFileStatus(Configuration job, Path
dirPath)
+ throws IOException {
FileSystem inpFs = dirPath.getFileSystem(job);
try {
- FileStatus[] fStats = inpFs.listStatus(dirPath,
FileUtils.HIDDEN_FILES_PATH_FILTER);
- if (fStats.length > 0) {
- return false;
- }
- } catch(FileNotFoundException fnf) {
- return true;
+ return inpFs.listStatus(dirPath, FileUtils.HIDDEN_FILES_PATH_FILTER);
+ } catch (FileNotFoundException e) {
+ return new FileStatus[] {};
}
- return true;
}
public static List<TezTask> getTezTasks(List<Task<?>> tasks) {
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
index 9b5a099..b46459c 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/NullScanTaskDispatcher.java
@@ -33,12 +33,14 @@ import java.util.Stack;
import java.util.stream.Collectors;
import org.apache.commons.collections.CollectionUtils;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.NullScanFileSystem;
import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
@@ -96,20 +98,42 @@ public class NullScanTaskDispatcher implements
SemanticDispatcher {
if (desc == null) {
return null;
}
- boolean isEmpty = false;
+ FileStatus[] filesFoundInPartitionDir = null;
try {
- isEmpty = Utilities.isEmptyPath(physicalContext.getConf(), path);
+ filesFoundInPartitionDir =
Utilities.listNonHiddenFileStatus(physicalContext.getConf(), path);
} catch (IOException e) {
LOG.error("Cannot determine if the table is empty", e);
}
- desc.setInputFileFormatClass(
- isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class);
+ if (!isMetadataOnlyAllowed(filesFoundInPartitionDir)) {
+ return desc;
+ }
+
+ boolean isEmpty = filesFoundInPartitionDir == null ||
filesFoundInPartitionDir.length == 0;
+ desc.setInputFileFormatClass(isEmpty ? ZeroRowsInputFormat.class :
OneNullRowInputFormat.class);
desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
NullStructSerDe.class.getName());
return desc;
}
+ private boolean isMetadataOnlyAllowed(FileStatus[] filesFoundInPartitionDir)
{
+ if (filesFoundInPartitionDir == null || filesFoundInPartitionDir.length ==
0) {
+ return true; // empty folders are safe to convert to metadata-only
+ }
+ for (FileStatus f : filesFoundInPartitionDir) {
+ if (AcidUtils.isDeleteDelta(f.getPath())) {
+ /*
+ * as described in HIVE-23712, an acid partition is not a safe subject
of metadata-only
+ * optimization, because there is a chance that it contains no data
but contains folders
+ * (e.g: delta_0000002_0000002_0000,
delete_delta_0000003_0000003_0000), without scanning
+ * the underlying file contents, we cannot tell whether this partition
contains data or not
+ */
+ return false;
+ }
+ }
+ return true;
+ }
+
private void processAlias(MapWork work, Path path,
Collection<String> aliasesAffected, Set<String> aliases) {
// the aliases that are allowed to map to a null scan.
diff --git
a/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q
b/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q
new file mode 100644
index 0000000..f97814a
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/metadataonly_acid_empty_partition.q
@@ -0,0 +1,22 @@
+set hive.support.concurrency=true;
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+set hive.optimize.metadataonly=true;
+
+create table test1 (id int, val string) partitioned by (val2 string) STORED AS
ORC TBLPROPERTIES ('transactional'='true');
+describe formatted test1;
+
+alter table test1 add partition (val2='foo');
+alter table test1 add partition (val2='bar');
+insert into test1 partition (val2='foo') values (1, 'abc');
+insert into test1 partition (val2='bar') values (1, 'def');
+delete from test1 where val2 = 'bar';
+
+select '--> hive.optimize.metadataonly=true';
+select distinct val2 from test1;
+
+
+set hive.optimize.metadataonly=false;
+select '--> hive.optimize.metadataonly=false';
+select distinct val2 from test1;
diff --git
a/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out
b/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out
new file mode 100644
index 0000000..4c2c9a5
--- /dev/null
+++
b/ql/src/test/results/clientpositive/llap/metadataonly_acid_empty_partition.q.out
@@ -0,0 +1,136 @@
+PREHOOK: query: create table test1 (id int, val string) partitioned by (val2
string) STORED AS ORC TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@test1
+POSTHOOK: query: create table test1 (id int, val string) partitioned by (val2
string) STORED AS ORC TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@test1
+PREHOOK: query: describe formatted test1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@test1
+POSTHOOK: query: describe formatted test1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@test1
+# col_name data_type comment
+id int
+val string
+
+# Partition Information
+# col_name data_type comment
+val2 string
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE {\"BASIC_STATS\":\"true\"}
+ bucketing_version 2
+ numFiles 0
+ numPartitions 0
+ numRows 0
+ rawDataSize 0
+ totalSize 0
+ transactional true
+ transactional_properties default
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+InputFormat: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+OutputFormat: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+Compressed: No
+Num Buckets: -1
+Bucket Columns: []
+Sort Columns: []
+PREHOOK: query: alter table test1 add partition (val2='foo')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@test1
+POSTHOOK: query: alter table test1 add partition (val2='foo')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@test1
+POSTHOOK: Output: default@test1@val2=foo
+PREHOOK: query: alter table test1 add partition (val2='bar')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@test1
+POSTHOOK: query: alter table test1 add partition (val2='bar')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@test1
+POSTHOOK: Output: default@test1@val2=bar
+PREHOOK: query: insert into test1 partition (val2='foo') values (1, 'abc')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test1@val2=foo
+POSTHOOK: query: insert into test1 partition (val2='foo') values (1, 'abc')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test1@val2=foo
+POSTHOOK: Lineage: test1 PARTITION(val2=foo).id SCRIPT []
+POSTHOOK: Lineage: test1 PARTITION(val2=foo).val SCRIPT []
+PREHOOK: query: insert into test1 partition (val2='bar') values (1, 'def')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@test1@val2=bar
+POSTHOOK: query: insert into test1 partition (val2='bar') values (1, 'def')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@test1@val2=bar
+POSTHOOK: Lineage: test1 PARTITION(val2=bar).id SCRIPT []
+POSTHOOK: Lineage: test1 PARTITION(val2=bar).val SCRIPT []
+PREHOOK: query: delete from test1 where val2 = 'bar'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test1
+PREHOOK: Input: default@test1@val2=bar
+PREHOOK: Output: default@test1@val2=bar
+POSTHOOK: query: delete from test1 where val2 = 'bar'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test1
+POSTHOOK: Input: default@test1@val2=bar
+POSTHOOK: Output: default@test1@val2=bar
+PREHOOK: query: select '--> hive.optimize.metadataonly=true'
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select '--> hive.optimize.metadataonly=true'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+--> hive.optimize.metadataonly=true
+PREHOOK: query: select distinct val2 from test1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test1
+PREHOOK: Input: default@test1@val2=bar
+PREHOOK: Input: default@test1@val2=foo
+#### A masked pattern was here ####
+POSTHOOK: query: select distinct val2 from test1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test1
+POSTHOOK: Input: default@test1@val2=bar
+POSTHOOK: Input: default@test1@val2=foo
+#### A masked pattern was here ####
+foo
+PREHOOK: query: select '--> hive.optimize.metadataonly=false'
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+POSTHOOK: query: select '--> hive.optimize.metadataonly=false'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+#### A masked pattern was here ####
+--> hive.optimize.metadataonly=false
+PREHOOK: query: select distinct val2 from test1
+PREHOOK: type: QUERY
+PREHOOK: Input: default@test1
+PREHOOK: Input: default@test1@val2=bar
+PREHOOK: Input: default@test1@val2=foo
+#### A masked pattern was here ####
+POSTHOOK: query: select distinct val2 from test1
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@test1
+POSTHOOK: Input: default@test1@val2=bar
+POSTHOOK: Input: default@test1@val2=foo
+#### A masked pattern was here ####
+foo