This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.8
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.8 by this push:
new 3d40cb608 ORC-1191: Updated TLC Taxi Benchmark Dataset
3d40cb608 is described below
commit 3d40cb608c8eb5cc472603e1cbf6bb354b5bd6c9
Author: mwlon <[email protected]>
AuthorDate: Sun May 29 16:12:09 2022 -0700
ORC-1191: Updated TLC Taxi Benchmark Dataset
### What changes were proposed in this pull request?
Update to the new NYC TLC Yellow Taxi dataset
*
https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf
* replace non-existent CSV URLs with new Parquet URLs
* update file format in referencing benchmarks
* change the util that converted CSV strings to decimals so it now converts
Parquet doubles to decimals
### Why are the changes needed?
As of 2022-05-13, NYC TLC switched from CSVs to Parquet files. They also
changed their schema slightly. Now the CSVs are no longer publicly available,
so to have a replicable benchmark suite, we should refer to the new version.
### How was this patch tested?
This does not change functionality or tests (except for the updated taxi
dataset).
```
java -jar hive/target/orc-benchmarks-hive-1.9.0-SNAPSHOT-uber.jar decimal
data
...
# Run progress: 50.00% complete, ETA 00:00:02
# Fork: 1 of 1
# Warmup Iteration 1: [WARN ] Unable to load native-hadoop library for
your platform... using builtin-java classes where applicable
122173.181 us/op
# Warmup Iteration 2: 116629.135 us/op
Iteration 1: 118590.860 us/op
Iteration 2: 116885.746 us/op
Iteration 3: 117235.593 us/op
...
```
Closes #1141 from mwlon/benchmark-updates.
Authored-by: mwlon <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
(cherry picked from commit 4373102c33cad900e52bb3e2f9c684f90b68375f)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
java/bench/README.md | 2 +-
.../orc/bench/core/convert/GenerateVariants.java | 4 ++--
.../orc/bench/core/convert/avro/AvroReader.java | 12 +++--------
java/bench/core/src/resources/taxi.schema | 24 +++++++++++-----------
java/bench/fetch-data.sh | 4 ++--
.../org/apache/orc/bench/hive/DecimalBench.java | 8 ++++----
6 files changed, 24 insertions(+), 30 deletions(-)
diff --git a/java/bench/README.md b/java/bench/README.md
index 3c9493410..d1ede6ff5 100644
--- a/java/bench/README.md
+++ b/java/bench/README.md
@@ -24,7 +24,7 @@ To fetch the source data:
```% ./fetch-data.sh```
-> :warning: Script will fetch 7GB of data
+> :warning: Script will fetch 4GB of data
To generate the derived data:
diff --git
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
index f06ec9f51..0450088d5 100644
---
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
+++
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/GenerateVariants.java
@@ -221,8 +221,8 @@ public class GenerateVariants implements OrcBenchmark {
long salesRecords) throws IOException
{
switch (dataName) {
case "taxi":
- return new RecursiveReader(new Path(root, "sources/" + dataName),
"csv",
- schema, conf, CompressionKind.ZLIB);
+ return new RecursiveReader(new Path(root, "sources/" + dataName),
"parquet",
+ schema, conf, CompressionKind.NONE);
case "sales":
return new SalesGenerator(salesRecords);
case "github":
diff --git
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
index 0db7746f5..97b58a8fe 100644
---
a/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
+++
b/java/bench/core/src/java/org/apache/orc/bench/core/convert/avro/AvroReader.java
@@ -41,7 +41,6 @@ import org.apache.orc.TypeDescription;
import org.apache.orc.bench.core.convert.BatchReader;
import java.io.IOException;
-import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.util.List;
@@ -191,8 +190,10 @@ public class AvroReader implements BatchReader {
private static class DecimalConverter implements AvroConverter {
final int scale;
+ final double multiplier;
DecimalConverter(int scale) {
this.scale = scale;
+ this.multiplier = Math.pow(10.0, this.scale);
}
public void convert(ColumnVector cv, int row, Object value) {
if (value == null) {
@@ -200,7 +201,7 @@ public class AvroReader implements BatchReader {
cv.isNull[row] = true;
} else {
DecimalColumnVector tc = (DecimalColumnVector) cv;
- tc.vector[row].set(getHiveDecimalFromByteBuffer((ByteBuffer) value,
scale));
+ tc.vector[row].set(HiveDecimal.create(Math.round((double) value *
multiplier)));
}
}
}
@@ -294,11 +295,4 @@ public class AvroReader implements BatchReader {
byteBuffer.get(result);
return result;
}
-
- static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer,
- int scale) {
- byte[] result = getBytesFromByteBuffer(byteBuffer);
- HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale);
- return dec;
- }
}
diff --git a/java/bench/core/src/resources/taxi.schema
b/java/bench/core/src/resources/taxi.schema
index 3ccfa93d3..720848faa 100644
--- a/java/bench/core/src/resources/taxi.schema
+++ b/java/bench/core/src/resources/taxi.schema
@@ -1,21 +1,21 @@
struct<
- vendor_id:int,
- pickup_time: timestamp,
- dropoff_time: timestamp,
- passenger_count: int,
+ VendorID: bigint,
+ tpep_pickup_datetime: timestamp,
+ tpep_dropoff_datetime: timestamp,
+ passenger_count: bigint,
trip_distance: double,
- pickup_longitude: double,
- pickup_latitude: double,
- ratecode_id: int,
+ RatecodeID: bigint,
store_and_fwd_flag: string,
- dropoff_longitude: double,
- dropoff_latitude: double,
- payment_type: int,
+ PULocationID: bigint,
+ DOLocationID: bigint,
+ payment_type: bigint,
fare_amount: decimal(8,2),
extra: decimal(8,2),
mta_tax: decimal(8,2),
tip_amount: decimal(8,2),
tolls_amount: decimal(8,2),
- improvement_surcharge : decimal(8,2),
- total_amount: decimal(8,2)
+ improvement_surcharge: decimal(8,2),
+ total_amount: decimal(8,2),
+ congestion_surcharge: int,
+ airport_fee: int
>
diff --git a/java/bench/fetch-data.sh b/java/bench/fetch-data.sh
index 068810cfe..27c21bf80 100755
--- a/java/bench/fetch-data.sh
+++ b/java/bench/fetch-data.sh
@@ -15,8 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p data/sources/taxi
-(cd data/sources/taxi; wget -O -
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.csv | gzip >
yellow_tripdata_2015-11.csv.gz )
-(cd data/sources/taxi; wget -O -
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv | gzip >
yellow_tripdata_2015-12.csv.gz )
+(cd data/sources/taxi; wget
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-11.parquet )
+(cd data/sources/taxi; wget
https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.parquet )
mkdir -p data/sources/github
(cd data/sources/github; wget
http://data.gharchive.org/2015-11-{01..15}-{0..23}.json.gz)
diff --git
a/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
b/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
index 7fae5b10d..ea2ecf2eb 100644
--- a/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
+++ b/java/bench/hive/src/java/org/apache/orc/bench/hive/DecimalBench.java
@@ -157,7 +157,7 @@ public class DecimalBench implements OrcBenchmark {
schema = TypeDescription.createDecimal()
.withScale(2)
.withPrecision(precision);
- readCsvData(total_amount, root, "total_amount", conf);
+ readRawData(total_amount, root, "total_amount", conf);
batch = schema.createRowBatchV2();
}
}
@@ -180,7 +180,7 @@ public class DecimalBench implements OrcBenchmark {
writer.close();
}
- static void readCsvData(long[] data,
+ static void readRawData(long[] data,
Path root,
String column,
Configuration conf) throws IOException {
@@ -188,8 +188,8 @@ public class DecimalBench implements OrcBenchmark {
int row = 0;
int batchPosn = 0;
BatchReader reader =
- new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"),
"csv",
- schema, conf, org.apache.orc.bench.core.CompressionKind.ZLIB);
+ new GenerateVariants.RecursiveReader(new Path(root, "sources/taxi"),
"parquet",
+ schema, conf, org.apache.orc.bench.core.CompressionKind.NONE);
VectorizedRowBatch batch = schema.createRowBatch();
batch.size = 0;
TypeDescription columnSchema = schema.findSubtype(column);