soumilshah1995 commented on issue #10110:
URL: https://github.com/apache/hudi/issues/10110#issuecomment-2275701103

   ```
   import os
   import random
   import sys
   from datetime import datetime, timedelta

   from pyspark.sql import SparkSession
   from pyspark.sql.functions import hour, col
   from pyspark.sql.types import StructType, StructField, StringType, TimestampType, FloatType
   
   # Configuration: Hudi/Spark versions used to resolve the Hudi Spark bundle.
   HUDI_VERSION = '1.0.0-beta2'
   SPARK_VERSION = '3.4'

   # Point the JVM at a local JDK 11 install (macOS / Homebrew path).
   os.environ["JAVA_HOME"] = "/opt/homebrew/opt/openjdk@11"
   # Pull the Hudi Spark bundle at session start-up. This must be one line:
   # the original statement was corrupted by e-mail hard-wrapping.
   SUBMIT_ARGS = f"--packages org.apache.hudi:hudi-spark{SPARK_VERSION}-bundle_2.12:{HUDI_VERSION} pyspark-shell"
   os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
   # Run PySpark workers with the same interpreter as this driver script.
   os.environ['PYSPARK_PYTHON'] = sys.executable
   
   # Initialize Spark session with Hudi support.
   # The .config(...) calls were broken across lines by e-mail hard-wrapping;
   # each key/value pair is rejoined onto a single line here.
   spark = SparkSession.builder \
       .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer') \
       .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension') \
       .config('spark.sql.hive.convertMetastoreParquet', 'false') \
       .getOrCreate()
   
   # Generate mock event data
   def generate_event_data(num_events):
       """Return a list of ``num_events`` synthetic event dicts.

       Each event carries a sequential ``event_id`` (1-based), a random
       ``user_id`` in [1, 100], a random ``event_type``, and a ``timestamp``
       string formatted '%Y-%m-%d %H:%M:%S', 0-5000 hours after 2023-01-01.
       Returns an empty list when ``num_events`` is 0.
       """
       event_types = ["click", "view", "purchase", "signup"]
       start_time = datetime(2023, 1, 1)
       data = []

       for i in range(num_events):
           event = {
               "event_id": i + 1,
               "user_id": random.randint(1, 100),
               "event_type": random.choice(event_types),
               # Rejoined onto one line: the original was split by e-mail wrapping.
               "timestamp": (start_time + timedelta(hours=random.randint(0, 5000))).strftime("%Y-%m-%d %H:%M:%S"),
           }
           data.append(event)

       return data
   
   # Build the source DataFrame from 10,000 generated mock events and preview it.
   events_data = generate_event_data(10000)
   df = spark.createDataFrame(events_data)
   df.show()
   
   # Write DataFrame to Hudi table
   table_name = "web_events"
   path = f'file:///Users/soumilshah/Desktop/{table_name}/'

   # FIX: the metadata-table switch is 'hoodie.metadata.enable';
   # 'hoodie.table.metadata.enable' is not a recognized Hudi config key, so the
   # metadata table (where the column-stats index lives) was never enabled.
   # Parenthesized chain instead of backslashes so per-option comments are legal.
   (
       df.write.format("hudi")
       .option("hoodie.table.name", table_name)
       .option("hoodie.datasource.write.recordkey.field", "event_id")
       # Empty partition-path field => single, non-partitioned dataset.
       .option("hoodie.datasource.write.partitionpath.field", "")
       .option("hoodie.datasource.write.precombine.field", "timestamp")
       .option("hoodie.metadata.enable", "true")
       .option("hoodie.metadata.index.column.stats.enable", "true")
       .option("path", path)
       .mode("overwrite")
       .saveAsTable(table_name)
   )
   
   # Create functional index on timestamp column
   # NOTE(review): func='from_unixtime' expects an epoch-seconds input, but the
   # 'timestamp' column written above is already a formatted string — confirm
   # against the Hudi functional-index docs that this indexed expression can
   # ever match the date_format(...) predicate used by the queries below.
   query_create_ts_datestr = """
   CREATE INDEX IF NOT EXISTS ts_datestr ON web_events 
   USING column_stats(timestamp) 
   OPTIONS(func='from_unixtime', format='yyyy-MM-dd')
   """
   spark.sql(query_create_ts_datestr).show()
   
   # Query data for a specific date
   # NOTE(review): for file pruning, the predicate expression
   # date_format(timestamp, 'yyyy-MM-dd') presumably must match the expression
   # the index was built with (func='from_unixtime' above) — these look like
   # different expressions; verify against the Hudi functional-index docs.
   spark.sql("""
   SELECT event_type, user_id, event_id
   FROM web_events
   WHERE date_format(timestamp, 'yyyy-MM-dd') = '2023-06-17'
   """).show()

   # Explain query plan for date-based query
   # EXPLAIN exposes the physical plan, so you can check whether the scan is
   # actually reduced by the index (fewer files/candidates) or is a full scan.
   spark.sql("""
   EXPLAIN
   SELECT event_type, user_id, event_id
   FROM web_events
   WHERE date_format(timestamp, 'yyyy-MM-dd') = '2023-06-17'
   """).show(truncate=False)
   
   # Create functional index on hour of timestamp
   # FIX: added IF NOT EXISTS — the first index (ts_datestr) already guards
   # this way, and without it a re-run of the script fails on the existing index.
   query_create_ts_hour = """
   CREATE INDEX IF NOT EXISTS ts_hour ON web_events 
   USING column_stats(timestamp) 
   OPTIONS(func='hour')
   """
   spark.sql(query_create_ts_hour)

   # Query data aggregated by hour
   spark.sql("""
   SELECT hour(timestamp) AS hour_of_day, COUNT(*) AS event_count
   FROM web_events
   GROUP BY hour(timestamp)
   """).show()
   
   ```
   
   
   Is this a good example for observing a reduction in the number of files read 
after creating a functional index, compared with before? If not, what changes 
should I make to the dataset or to my query approach so that the decrease in the 
number of files read is clearly visible? I look forward to your guidance.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to