This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.8
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.8 by this push:
new 3d40cb608 ORC-1191: Updated TLC Taxi Benchmark Dataset
3d40cb608 is described below
commit 3d40cb608c8eb5cc472603e1cbf6bb354b5bd6c9
Author: mwlon <[email protected]>
AuthorDate: Sun May 29 16:12:09 2022 -0700
ORC-1191: Updated TLC Taxi Benchmark Dataset
### What changes were proposed in this pull request?
Update to the new NYC TLC Yellow Taxi dataset
*
https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
* replace non-existent CSV URLs with new Parquet URLs
* update file format in referencing benchmarks
* change the util that converted CSV strings to decimals so it now converts
Parquet doubles to decimals
### Why are the changes needed?
As of 2022-05-13, NYC TLC switched from CSVs to Parquet files. They also
changed their schema slightly. Now the CSVs are no longer publicly available,
so to have a replicable benchmark suite, we should refer to the new version.
### How was this patch tested?
This does not change functionality or tests (except for the updated taxi
dataset).
```
java -jar hive/target/orc-benchmarks-hive-1.9.0-SNAPSHOT-uber.jar decimal
data
...
# Run progress: 50.00% complete, ETA 00:00:02
# Fork: 1 of 1
# Warmup Iteration 1: [WARN ] Unable to load native-hadoop library for
your platform... using builtin-java classes where applicable
122173.181 us/op
# Warmup Iteration 2: 116629.135 us/op
Iteration 1: 118590.860 us/op
Iteration 2: 116885.746 us/op
Iteration 3: 117235.593 us/op
...
```
Closes #1141 from mwlon/benchmark-updates.
Authored-by: mwlon <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 4373102c33cad900e52bb3e2f9c684f90b68375f)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/bench/README.md | 2 +-
.../orc/bench/core/convert/GenerateVariants.java | 4 ++--
.../orc/bench/core/convert/avro/AvroReader.java | 12 +++--------
java/bench/core/src/resources/taxi.schema | 24 +++++++++++-----------
java/bench/fetch-data.sh | 4 ++--
.../org/apache/orc/bench/hive/DecimalBench.java | 8 ++++----
6 files changed, 24 insertions(+), 30 deletions(-)
diff --git a/java/bench/README.md b/java/bench/README.md
index 3c9493410..d1ede6ff5 100644
--- a/java/bench/README.md
+++ b/java/bench/README.md
@@ -24,7 +24,7 @@ To fetch the source data:
```% ./fetch-data.sh```
-> :warning: Script will fetch 7GB of data
+> :warning: Script will fetch 4GB of data
To generate the derived data:
diff --git
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
index f06ec9f51..0450088d5 100644
---
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
+++
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
@@ -221,8 +221,8 @@ public class GenerateVariants implements OrcBenchmark {
long salesRecords) throws IOException
{
switch (dataName) {
case "taxi":
- return new RecursiveReader(new Path(root, "sources/" + dataName),
"csv",
- schema, conf, CompressionKind.ZLIB);
+ return new RecursiveReader(new Path(root, "sources/" + dataName),
"parquet",
+ schema, conf, CompressionKind.NONE);
case "sales":
return new SalesGenerator(salesRecords);
case "github":
diff --git
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
index 0db7746f5..97b58a8fe 100644
---
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
+++
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
@@ -41,7 +41,6 @@ import org.apache.orc.TypeDescription;
import org.apache.orc.bench.core.convert.BatchReader;
import java.io.IOException;
-import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.List;
@@ -191,8 +190,10 @@ public class AvroReader implements BatchReader {
private static class DecimalConverter implements AvroConverter {
final int scale;
+ final double multiplier;
DecimalConverter(int scale) {
this.scale = scale;
+ this.multiplier = Math.pow(10.0, this.scale);
}
public void convert(ColumnVector cv, int row, Object value) {
if (value == null) {
@@ -200,7 +201,7 @@ public class AvroReader implements BatchReader {
cv.isNull[row] = true;
} else {
DecimalColumnVector tc = (DecimalColumnVector) cv;
- tc.vector[row].set(getHiveDecimalFromByteBuffer((ByteBuffer) value,
scale));
+ tc.vector[row].set(HiveDecimal.create(Math.round((double) value *
multiplier)));
}
}
}
@@ -294,11 +295,4 @@ public class AvroReader implements BatchReader {
byteBuffer.get(result);
return result;
}
-
- static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer,
- int scale) {
- byte[] result = getBytesFromByteBuffer(byteBuffer);
- HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale);
- return dec;
- }
}
diff --git a/java/bench/core/src/resources/taxi.schema
b/java/bench/core/src/resources/taxi.schema
index 3ccfa93d3..720848faa 100644
--- a/java/bench/core/src/resources/taxi.schema
+++ b/java/bench/core/src/resources/taxi.schema
@@ -1,21 +1,21 @@
struct<
- vendor_id:int,
- pickup_time: timestamp,
- dropoff_time: timestamp,
- passenger_count: int,
+ VendorID: bigint,
+ tpep_pickup_datetime: timestamp,
+ tpep_dropoff_datetime: timestamp,
+ passenger_count: bigint,
trip_distance: double,
- pickup_longitude: double,
- pickup_latitude: double,
- ratecode_id: int,
+ RatecodeID: bigint,
store_and_fwd_flag: string,
- dropoff_longitude: double,
- dropoff_latitude: double,
- payment_type: int,
+ PULocationID: bigint,
+ DOLocationID: bigint,
+ payment_type: bigint,
fare_amount: decimal(8,2),
extra: decimal(8,2),
mta_tax: decimal(8,2),
tip_amount: decimal(8,2),
tolls_amount: decimal(8,2),
- improvement_surcharge : decimal(8,2),
- total_amount: decimal(8,2)
+ improvement_surcharge: decimal(8,2),
+ total_amount: decimal(8,2),
+ congestion_surcharge: int,
+ airport_fee: int
>
diff --git a/java/bench/fetch-data.sh b/java/bench/fetch-data.sh
index 068810cfe..27c21bf80 100755
--- a/java/bench/fetch-data.sh
+++ b/java/bench/fetch-data.sh
@@ -15,8 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p data/sources/taxi
-(cd data/sources/taxi; wget -O -
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.csv | gzip >
yellow_tripdata_2015-11.csv.gz )
-(cd data/sources/taxi; wget -O -
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv | gzip >
yellow_tripdata_2015-12.csv.gz )
+(cd data/sources/taxi; wget
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.parquet )
+(cd data/sources/taxi; wget
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.parquet )
mkdir -p data/sources/github
(cd data/sources/github; wget
http://data.gharchive.org/2015-11-{01..15}-{0..23}.json.gz)
diff --git
a/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
b/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
index 7fae5b10d..ea2ecf2eb 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
@@ -157,7 +157,7 @@ public class DecimalBench implements OrcBenchmark {
schema = TypeDescription.createDecimal()
.withScale(2)
.withPrecision(precision);
- readCsvData(total_amount, root, "total_amount", conf);
+ readRawData(total_amount, root, "total_amount", conf);
batch = schema.createRowBatchV2();
}
}
@@ -180,7 +180,7 @@ public class DecimalBench implements OrcBenchmark {
writer.close();
}
- static void readCsvData(long[] data,
+ static void readRawData(long[] data,
Path root,
String column,
Configuration conf) throws IOException {
@@ -188,8 +188,8 @@ public class DecimalBench implements OrcBenchmark {
int row = 0;
int batchPosn = 0;
BatchReader reader =
- new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"),
"csv",
- schema, conf, org.apache.orc.bench.core.CompressionKind.ZLIB);
+ new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"),
"parquet",
+ schema, conf, org.apache.orc.bench.core.CompressionKind.NONE);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 0;
TypeDescription columnSchema = schema.findSubtype(column);