hive git commit: HIVE-14448: Queries with predicate fail when ETL split strategy is chosen for ACID tables (Matt McCline, reviewed by Sergey Shelukhin)

mmccline Fri, 12 Aug 2016 23:24:06 -0700

Repository: hive
Updated Branches:
  refs/heads/branch-2.1 24ffc98f5 -> b1188b45d



HIVE-14448: Queries with predicate fail when ETL split strategy is chosen for 
ACID tables (Matt McCline, reviewed by Sergey Shelukhin)

Conflicts:
        ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b1188b45
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b1188b45
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b1188b45

Branch: refs/heads/branch-2.1
Commit: b1188b45d02c3badc61cf06b879c5f883a18aa42
Parents: 24ffc98
Author: Matt McCline <[email protected]>
Authored: Fri Aug 12 23:00:42 2016 -0700
Committer: Matt McCline <[email protected]>
Committed: Fri Aug 12 23:22:55 2016 -0700

----------------------------------------------------------------------
 .../hadoop/hive/ql/io/orc/OrcInputFormat.java   | 22 +++--
 .../apache/hadoop/hive/ql/TestTxnCommands2.java | 96 ++++++++++++++++++++
 2 files changed, 111 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/b1188b45/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java 
b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
index 717d28c..9973dd6 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcInputFormat.java
@@ -1069,7 +1069,7 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
     private final long blockSize;
     private final TreeMap<Long, BlockLocation> locations;
     private OrcTail orcTail;
-    private final List<OrcProto.Type> readerTypes;
+    private List<OrcProto.Type> readerTypes;
     private List<StripeInformation> stripes;
     private List<StripeStatistics> stripeStats;
     private List<OrcProto.Type> fileTypes;
@@ -1402,6 +1402,11 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
         }
         TypeDescription readerSchema = 
OrcUtils.convertTypeFromProtobuf(readerTypes, 0);
         evolution = new SchemaEvolution(fileSchema, readerSchema, 
readerIncluded);
+        if (!isOriginal) {
+          // The SchemaEvolution class has added the ACID metadata columns.  
Let's update our
+          // readerTypes so PPD code will work correctly.
+          readerTypes = OrcUtils.getOrcTypes(evolution.getReaderSchema());
+        }
       }
       writerVersion = orcTail.getWriterVersion();
       List<OrcProto.ColumnStatistics> fileColStats = 
orcTail.getFooter().getStatisticsList();
@@ -1418,21 +1423,24 @@ public class OrcInputFormat implements 
InputFormat<NullWritable, OrcStruct>,
           }
         }
       }
-      projColsUncompressedSize = computeProjectionSize(fileTypes, 
fileColStats, fileIncluded,
-          isOriginal);
+      projColsUncompressedSize = computeProjectionSize(fileTypes, 
fileColStats, fileIncluded);
       if (!context.footerInSplits) {
         orcTail = null;
       }
     }
 
     private long computeProjectionSize(List<OrcProto.Type> fileTypes,
-        List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded, boolean 
isOriginal) {
-      final int rootIdx = getRootColumn(isOriginal);
+        List<OrcProto.ColumnStatistics> stats, boolean[] fileIncluded) {
       List<Integer> internalColIds = Lists.newArrayList();
-      if (fileIncluded != null) {
+      if (fileIncluded == null) {
+        // Add all.
+        for (int i = 0; i < fileTypes.size(); i++) {
+          internalColIds.add(i);
+        }
+      } else {
         for (int i = 0; i < fileIncluded.length; i++) {
           if (fileIncluded[i]) {
-            internalColIds.add(rootIdx + i);
+            internalColIds.add(i);
           }
         }
       }

http://git-wip-us.apache.org/repos/asf/hive/blob/b1188b45/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java 
b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java
index ed55979..45acd8c 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/TestTxnCommands2.java
@@ -1223,6 +1223,102 @@ public class TestTxnCommands2 {
     runCleaner(hiveConf);
     runStatementOnDriver("select count(*) from " + Table.ACIDTBL);
   }
+
+  @Test
+  public void testACIDwithSchemaEvolutionAndCompaction() throws Exception {
+    
testACIDwithSchemaEvolutionForVariousTblProperties("'transactional'='true'");
+  }
+
+  protected void testACIDwithSchemaEvolutionForVariousTblProperties(String 
tblProperties) throws Exception {
+    String tblName = "acidWithSchemaEvol";
+    int numBuckets = 1;
+    runStatementOnDriver("drop table if exists " + tblName);
+    runStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
+      " CLUSTERED BY(a) INTO " + numBuckets +" BUCKETS" + //currently ACID 
requires table to be bucketed
+      " STORED AS ORC  TBLPROPERTIES ( " + tblProperties + " )");
+
+    // create some data
+    runStatementOnDriver("insert into " + tblName + " values(1, 'foo'),(2, 
'bar'),(3, 'baz')");
+    runStatementOnDriver("update " + tblName + " set b = 'blah' where a = 3");
+
+    // apply schema evolution by adding some columns
+    runStatementOnDriver("alter table " + tblName + " add columns(c int, d 
string)");
+
+    // insert some data in new schema
+    runStatementOnDriver("insert into " + tblName + " values(4, 'acid', 100, 
'orc'),"
+        + "(5, 'llap', 200, 'tez')");
+
+    // update old data with values for the new schema columns
+    runStatementOnDriver("update " + tblName + " set d = 'hive' where a <= 3");
+    runStatementOnDriver("update " + tblName + " set c = 999 where a <= 3");
+
+    // read the entire data back and see if did everything right
+    List<String> rs = runStatementOnDriver("select * from " + tblName + " 
order by a");
+    String[] expectedResult = { "1\tfoo\t999\thive", "2\tbar\t999\thive", 
"3\tblah\t999\thive", "4\tacid\t100\torc", "5\tllap\t200\ttez" };
+    Assert.assertEquals(Arrays.asList(expectedResult), rs);
+
+    // now compact and see if compaction still preserves the data correctness
+    runStatementOnDriver("alter table "+ tblName + " compact 'MAJOR'");
+    runWorker(hiveConf);
+    runCleaner(hiveConf); // Cleaner would remove the obsolete files.
+
+    // Verify that there is now only 1 new directory: base_xxxxxxx and the 
rest have have been cleaned.
+    FileSystem fs = FileSystem.get(hiveConf);
+    FileStatus[] status;
+    status = fs.listStatus(new Path(TEST_WAREHOUSE_DIR + "/" + 
tblName.toString().toLowerCase()),
+        FileUtils.STAGING_DIR_PATH_FILTER);
+    Assert.assertEquals(1, status.length);
+    boolean sawNewBase = false;
+    for (int i = 0; i < status.length; i++) {
+      if (status[i].getPath().getName().matches("base_.*")) {
+        sawNewBase = true;
+        FileStatus[] buckets = fs.listStatus(status[i].getPath(), 
FileUtils.STAGING_DIR_PATH_FILTER);
+        Assert.assertEquals(numBuckets, buckets.length);
+        
Assert.assertTrue(buckets[0].getPath().getName().matches("bucket_00000"));
+      }
+    }
+    Assert.assertTrue(sawNewBase);
+
+    rs = runStatementOnDriver("select * from " + tblName + " order by a");
+    Assert.assertEquals(Arrays.asList(expectedResult), rs);
+  }
+
+  @Test
+  public void testETLSplitStrategyForACID() throws Exception {
+    hiveConf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, "ETL");
+    hiveConf.setBoolVar(HiveConf.ConfVars.HIVEOPTINDEXFILTER, true);
+    runStatementOnDriver("insert into " + Table.ACIDTBL + " values(1,2)");
+    runStatementOnDriver("alter table " + Table.ACIDTBL + " compact 'MAJOR'");
+    runWorker(hiveConf);
+    List<String> rs = runStatementOnDriver("select * from " +  Table.ACIDTBL  
+ " where a = 1");
+    int[][] resultData = new int[][] {{1,2}};
+    Assert.assertEquals(stringifyValues(resultData), rs);
+  }
+
+  @Test
+  public void testAcidWithSchemaEvolution() throws Exception {
+    hiveConf.setVar(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY, "ETL");
+    String tblName = "acidTblWithSchemaEvol";
+    runStatementOnDriver("drop table if exists " + tblName);
+    runStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " +
+      " CLUSTERED BY(a) INTO 2 BUCKETS" + //currently ACID requires table to 
be bucketed
+      " STORED AS ORC TBLPROPERTIES ('transactional'='true')");
+
+    runStatementOnDriver("INSERT INTO " + tblName + " VALUES (1, 'foo'), (2, 
'bar')");
+
+    // Major compact to create a base that has ACID schema.
+    runStatementOnDriver("ALTER TABLE " + tblName + " COMPACT 'MAJOR'");
+    runWorker(hiveConf);
+
+    // Alter table for perform schema evolution.
+    runStatementOnDriver("ALTER TABLE " + tblName + " ADD COLUMNS(c int)");
+
+    // Validate there is an added NULL for column c.
+    List<String> rs = runStatementOnDriver("SELECT * FROM " + tblName + " 
ORDER BY a");
+    String[] expectedResult = { "1\tfoo\tNULL", "2\tbar\tNULL" };
+    Assert.assertEquals(Arrays.asList(expectedResult), rs);
+  }
+
   /**
    * takes raw data and turns it into a string as if from Driver.getResults()
    * sorts rows in dictionary order

hive git commit: HIVE-14448: Queries with predicate fail when ETL split strategy is chosen for ACID tables (Matt McCline, reviewed by Sergey Shelukhin)

Reply via email to