[ https://issues.apache.org/jira/browse/ORC-54?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15423575#comment-15423575 ]
ASF GitHub Bot commented on ORC-54: ----------------------------------- Github user prasanthj commented on a diff in the pull request: https://github.com/apache/orc/pull/55#discussion_r75038675 --- Diff: java/core/src/java/org/apache/orc/impl/SchemaEvolution.java --- @@ -20,59 +20,132 @@ import java.util.ArrayList; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; +import org.apache.orc.Reader; import org.apache.orc.TypeDescription; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * Take the file types and the (optional) configuration column names/types and - * see if there has been schema evolution. + * Infer and track the evolution between the schema as stored in the file and + * the schema that has been requested by the reader. */ public class SchemaEvolution { // indexed by reader column id private final TypeDescription[] readerFileTypes; // indexed by reader column id - private final boolean[] included; + private final boolean[] readerIncluded; + // indexed by file column id + private final boolean[] fileIncluded; private final TypeDescription fileSchema; private final TypeDescription readerSchema; private boolean hasConversion = false; + private final boolean isAcid; + // indexed by reader column id private final boolean[] ppdSafeConversion; - public SchemaEvolution(TypeDescription fileSchema, boolean[] includedCols) { - this(fileSchema, null, includedCols); + private static final Logger LOG = + LoggerFactory.getLogger(SchemaEvolution.class); + private static final Pattern missingMetadataPattern = + Pattern.compile("_col\\d+"); + + public static class IllegalEvolutionException extends RuntimeException { + public IllegalEvolutionException(String msg) { + super(msg); + } + } + + public SchemaEvolution(TypeDescription fileSchema, + Reader.Options options) { + this(fileSchema, null, options); } public SchemaEvolution(TypeDescription fileSchema, TypeDescription 
readerSchema, - boolean[] includedCols) { - this.included = includedCols == null ? null : + Reader.Options options) { + boolean allowMissingMetadata = options.getTolerateMissingSchema(); + boolean[] includedCols = options.getInclude(); + this.readerIncluded = includedCols == null ? null : Arrays.copyOf(includedCols, includedCols.length); + this.fileIncluded = new boolean[fileSchema.getMaximumId() + 1]; this.hasConversion = false; this.fileSchema = fileSchema; + isAcid = checkAcidSchema(fileSchema); if (readerSchema != null) { - if (checkAcidSchema(fileSchema)) { + if (isAcid) { this.readerSchema = createEventSchema(readerSchema); } else { this.readerSchema = readerSchema; } - this.readerFileTypes = new TypeDescription[this.readerSchema.getMaximumId() + 1]; - buildConversionFileTypesArray(fileSchema, this.readerSchema); + this.readerFileTypes = + new TypeDescription[this.readerSchema.getMaximumId() + 1]; + int positionalLevels = 0; + if (!hasColumnNames(isAcid? getBaseRow(fileSchema) : fileSchema)){ + if (!this.fileSchema.equals(this.readerSchema)) { + if (!allowMissingMetadata) { + throw new RuntimeException("Found that schema metadata is missing" + + " from file. This is likely caused by" + + " a writer earlier than HIVE-4243. Will" + + " not try to reconcile schemas"); + } else { + LOG.warn("Column names are missing from this file. This is" + + " caused by a writer earlier than HIVE-4243. The reader will" + + " reconcile schemas based on index. File type: " + + this.fileSchema + ", reader type: " + this.readerSchema); + positionalLevels = isAcid ? 2 : 1; --- End diff -- What does positional level mean? Is it the real row level? Does the ACID file schema look like this: struct<struct<[acid_cols]>,struct<[real_cols]>>? If so, can you leave a comment about it? 
> Evolve schemas based on field name rather than index > ---------------------------------------------------- > > Key: ORC-54 > URL: https://issues.apache.org/jira/browse/ORC-54 > Project: Orc > Issue Type: Improvement > Reporter: Mark Wagner > Assignee: Mark Wagner > > Schema evolution as it stands today allows adding fields to the end of > schemas or removing them from the end. However, because it is based on the > index of the column, you can only ever add or remove -- not both. > ORC files have the full schema information of their contents, so there's > actually enough metadata to support changing columns anywhere in the schema. -- This message was sent by Atlassian JIRA (v6.3.4#6332)