sudhar91 commented on code in PR #592: URL: https://github.com/apache/incubator-xtable/pull/592#discussion_r1885712084
########## xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java: ########## @@ -0,0 +1,209 @@ +package org.apache.xtable.parquet; + +import java.io.IOException; +import java.time.Instant; +import java.util.*; +import java.util.stream.Collectors; +import lombok.Builder; +import lombok.NonNull; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.xtable.avro.AvroSchemaConverter; +import org.apache.xtable.model.*; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.schema.InternalSchema; +import org.apache.xtable.model.storage.*; +import org.apache.xtable.spi.extractor.ConversionSource; + +@Builder +public class ParquetConversionSource implements ConversionSource<Long> { + + private final String tableName; + private final String basePath; + @NonNull private final Configuration hadoopConf; + + @Builder.Default + private static final AvroSchemaConverter schemaExtractor = AvroSchemaConverter.getInstance(); + + @Builder.Default private static final FileSystemHelper fsHelper = FileSystemHelper.getInstance(); + + @Builder.Default + private static final ParquetMetadataExtractor parquetMetadataExtractor = + ParquetMetadataExtractor.getInstance(); + + @Builder.Default + private static final ParquetPartitionHelper parquetPartitionHelper = + ParquetPartitionHelper.getInstance(); + + private Map<String, List<String>> initPartitionInfo() { + return fsHelper.getPartitionFromDirectoryStructure( + hadoopConf, basePath, Collections.emptyMap()); + } + + /** + * To infer schema getting the latest file assumption is that latest file will have new fields + * + * @param modificationTime the commit to consider for reading the table state + * @return + */ + @Override + public InternalTable getTable(Long modificationTime) { + + 
Optional<LocatedFileStatus> latestFile = + fsHelper + .getParquetFiles(hadoopConf, basePath) + .max(Comparator.comparing(FileStatus::getModificationTime)); Review Comment: I don't think we can push down the filter with this API; even to filter files newer than the modification time we have to first list everything and then filter them out. Do you have any other idea in mind for this? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@xtable.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org