TheR1sing3un commented on code in PR #14205:
URL: https://github.com/apache/hudi/pull/14205#discussion_r2516655681


##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/HoodieSparkParquetReader.java:
##########
@@ -121,20 +130,61 @@ public ClosableIterator<String> getRecordKeyIterator() 
throws IOException {
   }
 
   public ClosableIterator<UnsafeRow> getUnsafeRowIterator(Schema 
requestedSchema) throws IOException {
+    return getUnsafeRowIterator(requestedSchema, Collections.emptyList());
+  }
+
+  /**
+   * Read parquet with requested schema and filters.
+   * WARN:
+   * Currently, the filter must only contain field references related to the 
primary key, as the primary key does not involve schema evolution.
+   * If it is necessary to expand to push down more fields in the future, 
please consider the issue of schema evolution carefully
+   */
+  public ClosableIterator<UnsafeRow> getUnsafeRowIterator(Schema 
requestedSchema, List<Filter> readFilters) throws IOException {
     Schema nonNullSchema = 
AvroSchemaUtils.getNonNullTypeFromUnion(requestedSchema);
     StructType structSchema = 
HoodieInternalRowUtils.getCachedSchema(nonNullSchema);
     Option<MessageType> messageSchema = 
Option.of(getAvroSchemaConverter(storage.getConf().unwrapAs(Configuration.class)).convert(nonNullSchema));
     boolean enableTimestampFieldRepair = 
storage.getConf().getBoolean(ENABLE_LOGICAL_TIMESTAMP_REPAIR, true);
     StructType dataStructType = convertToStruct(enableTimestampFieldRepair ? 
SchemaRepair.repairLogicalTypes(getFileSchema(), messageSchema) : 
getFileSchema());
     SparkBasicSchemaEvolution evolution = new 
SparkBasicSchemaEvolution(dataStructType, structSchema, 
SQLConf.get().sessionLocalTimeZone());
     String readSchemaJson = evolution.getRequestSchema().json();
+    SQLConf sqlConf = SQLConf.get();
     storage.getConf().set(ParquetReadSupport.PARQUET_READ_SCHEMA, 
readSchemaJson);
     storage.getConf().set(ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA(), 
readSchemaJson);
-    storage.getConf().set(SQLConf.PARQUET_BINARY_AS_STRING().key(), 
SQLConf.get().getConf(SQLConf.PARQUET_BINARY_AS_STRING()).toString());
-    storage.getConf().set(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), 
SQLConf.get().getConf(SQLConf.PARQUET_INT96_AS_TIMESTAMP()).toString());
+    storage.getConf().set(SQLConf.PARQUET_BINARY_AS_STRING().key(), 
sqlConf.getConf(SQLConf.PARQUET_BINARY_AS_STRING()).toString());
+    storage.getConf().set(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), 
sqlConf.getConf(SQLConf.PARQUET_INT96_AS_TIMESTAMP()).toString());
+    RebaseDateTime.RebaseSpec rebaseDateSpec = 
SparkAdapterSupport$.MODULE$.sparkAdapter().getRebaseSpec("CORRECTED");
+    boolean parquetFilterPushDown = 
storage.getConf().getBoolean(SQLConf.PARQUET_RECORD_FILTER_ENABLED().key(), 
sqlConf.parquetRecordFilterEnabled());
+    if (parquetFilterPushDown && readFilters != null && 
!readFilters.isEmpty()) {
+      ParquetMetadata parquetMetadataWithoutRowGroup = 
getParquetMetadataWithoutRowGroup();
+      ParquetFilters parquetFilters = new ParquetFilters(
+          parquetMetadataWithoutRowGroup.getFileMetaData().getSchema(),
+          
storage.getConf().getBoolean(SQLConf.PARQUET_FILTER_PUSHDOWN_DATE_ENABLED().key(),
 sqlConf.parquetFilterPushDownDate()),
+          
storage.getConf().getBoolean(SQLConf.PARQUET_FILTER_PUSHDOWN_TIMESTAMP_ENABLED().key(),
 sqlConf.parquetFilterPushDownTimestamp()),
+          
storage.getConf().getBoolean(SQLConf.PARQUET_FILTER_PUSHDOWN_DECIMAL_ENABLED().key(),
 sqlConf.parquetFilterPushDownDecimal()),
+          
storage.getConf().getBoolean(SQLConf.PARQUET_FILTER_PUSHDOWN_STRING_STARTSWITH_ENABLED().key(),

Review Comment:
   > is the string predicate push down a superset for startswith predicate?
   
   Right, 
   <img width="926" height="208" alt="image" 
src="https://github.com/user-attachments/assets/b0d242bb-8c06-4d4a-9a93-aaf1e28dc6ca"
 />
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to