[ 
https://issues.apache.org/jira/browse/DRILL-5660?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16127167#comment-16127167
 ] 

ASF GitHub Bot commented on DRILL-5660:
---------------------------------------

Github user arina-ielchiieva commented on a diff in the pull request:

    https://github.com/apache/drill/pull/877#discussion_r133177759
  
    --- Diff: 
exec/java-exec/src/main/java/org/apache/drill/exec/store/parquet/Metadata.java 
---
    @@ -543,78 +584,72 @@ private void readBlockMeta(String path,
         mapper.registerModule(serialModule);
         mapper.registerModule(module);
         mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false);
    -    FSDataInputStream is = fs.open(p);
    -
    -    boolean alreadyCheckedModification = false;
    -    boolean newMetadata = false;
    -
    -    if (metaContext != null) {
    -      alreadyCheckedModification = metaContext.getStatus(parentDirString);
    -    }
    -
    -    if (dirsOnly) {
    -      parquetTableMetadataDirs = mapper.readValue(is, 
ParquetTableMetadataDirs.class);
    -      logger.info("Took {} ms to read directories from directory cache 
file", timer.elapsed(TimeUnit.MILLISECONDS));
    -      timer.stop();
    -      parquetTableMetadataDirs.updateRelativePaths(parentDirString);
    -      if (!alreadyCheckedModification && 
tableModified(parquetTableMetadataDirs.getDirectories(), p, parentDir, 
metaContext)) {
    -        parquetTableMetadataDirs =
    -            
(createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getRight();
    -        newMetadata = true;
    -      }
    -    } else {
    -      parquetTableMetadata = mapper.readValue(is, 
ParquetTableMetadataBase.class);
    -      logger.info("Took {} ms to read metadata from cache file", 
timer.elapsed(TimeUnit.MILLISECONDS));
    -      timer.stop();
    -      if (parquetTableMetadata instanceof ParquetTableMetadata_v3) {
    -        ((ParquetTableMetadata_v3) 
parquetTableMetadata).updateRelativePaths(parentDirString);
    -      }
    -      if (!alreadyCheckedModification && 
tableModified(parquetTableMetadata.getDirectories(), p, parentDir, 
metaContext)) {
    -        parquetTableMetadata =
    -            
(createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(p.getParent()).toString())).getLeft();
    -        newMetadata = true;
    -      }
    +    try (FSDataInputStream is = fs.open(path)) {
    +      boolean alreadyCheckedModification = false;
    +      boolean newMetadata = false;
    +        alreadyCheckedModification = 
metaContext.getStatus(metadataParentDirPath);
    +
    +      if (dirsOnly) {
    +        parquetTableMetadataDirs = mapper.readValue(is, 
ParquetTableMetadataDirs.class);
    +        logger.info("Took {} ms to read directories from directory cache 
file", timer.elapsed(TimeUnit.MILLISECONDS));
    +        timer.stop();
    +        
parquetTableMetadataDirs.updateRelativePaths(metadataParentDirPath);
    +        if (!alreadyCheckedModification && 
tableModified(parquetTableMetadataDirs.getDirectories(), path, 
metadataParentDir, metaContext)) {
    +          parquetTableMetadataDirs =
    +              
(createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(path.getParent()).toString())).getRight();
    +          newMetadata = true;
    +        }
    +      } else {
    +        parquetTableMetadata = mapper.readValue(is, 
ParquetTableMetadataBase.class);
    +        logger.info("Took {} ms to read metadata from cache file", 
timer.elapsed(TimeUnit.MILLISECONDS));
    +        timer.stop();
    +        if (new 
MetadataVersion(parquetTableMetadata.getMetadataVersion()).compareTo(new 
MetadataVersion(3, 0)) >= 0) {
    +          ((ParquetTableMetadata_v3) 
parquetTableMetadata).updateRelativePaths(metadataParentDirPath);
    +        }
    +        if (!alreadyCheckedModification && 
tableModified(parquetTableMetadata.getDirectories(), path, metadataParentDir, 
metaContext)) {
    +          parquetTableMetadata =
    +              
(createMetaFilesRecursively(Path.getPathWithoutSchemeAndAuthority(path.getParent()).toString())).getLeft();
    +          newMetadata = true;
    +        }
     
    -      // DRILL-5009: Remove the RowGroup if it is empty
    -      List<? extends ParquetFileMetadata> files = 
parquetTableMetadata.getFiles();
    -      for (ParquetFileMetadata file : files) {
    -        List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
    -        for (Iterator<? extends RowGroupMetadata> iter = 
rowGroups.iterator(); iter.hasNext(); ) {
    -          RowGroupMetadata r = iter.next();
    -          if (r.getRowCount() == 0) {
    -            iter.remove();
    +        // DRILL-5009: Remove the RowGroup if it is empty
    +        List<? extends ParquetFileMetadata> files = 
parquetTableMetadata.getFiles();
    +        for (ParquetFileMetadata file : files) {
    +          List<? extends RowGroupMetadata> rowGroups = file.getRowGroups();
    +          for (Iterator<? extends RowGroupMetadata> iter = 
rowGroups.iterator(); iter.hasNext(); ) {
    +            RowGroupMetadata r = iter.next();
    +            if (r.getRowCount() == 0) {
    +              iter.remove();
    +            }
               }
             }
    -      }
    -
    -    }
     
    -    if (newMetadata && metaContext != null) {
    -      // if new metadata files were created, invalidate the existing 
metadata context
    -      metaContext.clear();
    +      }
    +      if (newMetadata) {
    +        // if new metadata files were created, invalidate the existing 
metadata context
    +        metaContext.clear();
    +      }
    +    } catch (IOException e) {
    +      logger.error("Failed to read '{}' metadata file", path, e);
    +      metaContext.setMetadataCacheCorrupted(true);
         }
    -
       }
     
       /**
        * Check if the parquet metadata needs to be updated by comparing the 
modification time of the directories with
        * the modification time of the metadata file
        *
    -   * @param directories
    -   * @param metaFilePath
    -   * @return
    -   * @throws IOException
    +   * @param directories List of directories
    +   * @param metaFilePath path of parquet metadata cache file
    +   * @return true if metadata needs to be updated, false otherwise
    +   * @throws IOException if some resources are not accessible
        */
    -  private boolean tableModified(List<String> directories, Path 
metaFilePath,
    -      Path parentDir,
    -      MetadataContext metaContext)
    +  private boolean tableModified(List<String> directories, Path 
metaFilePath, Path parentDir, MetadataContext metaContext)
           throws IOException {
     
         Stopwatch timer = Stopwatch.createStarted();
     
    -    if (metaContext != null) {
    -      metaContext.setStatus(parentDir.toString());
    -    }
    +    metaContext.setStatus(parentDir.toString());
    --- End diff --
    
    `parentDir.toUri().getPath()`?


> Drill 1.10 queries fail due to Parquet Metadata "corruption" from DRILL-3867
> ----------------------------------------------------------------------------
>
>                 Key: DRILL-5660
>                 URL: https://issues.apache.org/jira/browse/DRILL-5660
>             Project: Apache Drill
>          Issue Type: Bug
>    Affects Versions: 1.11.0
>            Reporter: Paul Rogers
>            Assignee: Vitalii Diravka
>              Labels: doc-impacting
>             Fix For: 1.12.0
>
>
> Drill recently accepted a PR for the following bug:
> DRILL-3867: Store relative paths in metadata file
> This PR turned out to have a flaw which affects version compatibility.
> The DRILL-3867 PR changed the format of Parquet metadata files to store 
> relative paths. All Drill servers after that PR create files with relative 
> paths. But, the version number of the file is unchanged, so that older 
> Drillbits don't know that they can't understand the file.
> Instead, if an older Drill tries to read the file, queries fail left and 
> right. Drill will resolve the paths, but does so relative to the user's HDFS 
> home directory, which is wrong.
> What should have happened is that we should have bumped the parquet metadata 
> file version number so that older Drillbits can’t read the file. This ticket 
> requests that we do that.
> Now, one could argue that the lack of version number change is fine. Once a 
> user upgrades Drill, they won't use an old Drillbit. But, things are not that 
> simple:
> * A developer tests a branch based on a pre-DRILL-3867 build on a cluster in 
> which metadata files have been created by a post-DRILL-3867 build. (This has 
> already occurred multiple times in our shop.)
> * A user tries to upgrade to Drill 1.11, finds an issue, and needs to roll 
> back to Drill 1.10. Doing so will cause queries to fail due to 
> seemingly-corrupt metadata files.
> * A user tries to do a rolling upgrade: running 1.11 on some servers, 1.10 on 
> others. Once a 1.11 server is installed, the metadata is updated ("corrupted" 
> from the perspective of 1.10) and queries fail.
> Standard practice in this scenario is to:
> * Bump the file version number when the file format changes, and
> * Software refuses to read files with a version newer than the software was 
> designed for.
> Of course, it is highly desirable that newer servers read old files, but that 
> is not the issue here.
> *Main technical points of working of parquet metadata caching for now.*
> Only process of reading the parquet metadata is changed (the process of 
> writing isn't changed):
> +1. Metadata files are valid:+
> Metadata objects are created by deserialization of parquet metadata files in 
> the process of creating ParquetGroupScan physical operator. 
> All supported versions are stored in the "MetadataVersion.Constants" class 
> and in the Jackson annotations for Metadata.ParquetTableMetadataBase class.
> +2. Metadata files version isn't supported (created by newer Drill version). 
> Drill table has at least one metadata file of unsupported version:+
> JsonMappingException is obtained and swallowed without creating metadata 
> object. Error message is logged. The state is stored in MetadataContext, 
> therefore further there will be no attempt to deserialize metadata file again 
> in context of performing current query. The physical plan will be created 
> without using parquet metadata caching. Warning message is logged for every 
> further check "is metadata corrupted".
> +3. Drill table has at least one corrupted metadata file, which can't be 
> deserialized:+
> JsonParseException is obtained. Then the same behaviour as for the 
> unsupported version files.
> +4. The metadata file was removed by other process:+
> FileNotFound is obtained. Then the same behaviour as for the unsupported 
> version files.
> The new versions of metadata should be added in such manner:
> 1. Increasing of the metadata major version if metadata structure is changed.
> 2. Increasing of the metadata minor version if only metadata content is 
> changed, but metadata structure is the same.
> For the first case a new metadata structure (class) should be created 
> (possible an improvement of deserializing metadata files of any version into 
> one strucure by using special converting)
> For the second case only annotation for the last metadata structure can be 
> updated.
> *Summary*
> 1. Drill will read and use metadata files if files are valid, all present and 
> supported. Under supported we mean that files were created before and under 
> current Drill version.
> 2. Drill will ignore reading metadata files if at least one file is missing, 
> empty, corrupted or unsupported. Under unsupported we mean files created 
> after current Drill version. When metadata files are not used, warning 
> message will be written to the logs.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to