Xikui Wang has submitted this change and it was merged. Change subject: [NO ISSUE] Avoid overwriting files in SparkSQL perf test ......................................................................
[NO ISSUE] Avoid overwriting files in SparkSQL perf test - user model changes: no - storage format changes: no - interface changes: no Details: 1. Overwritting file on HDFS sometimes causes uploading issue. Now we generate different scripts for cc and ncs. 2. Fix query issues in SparkSQL test. 3. Add commented disk option. Change-Id: Ia0e04b3c80cc83322def2a949fc0ddf01fd8e7a8 Reviewed-on: https://asterix-gerrit.ics.uci.edu/2011 Sonar-Qube: Jenkins <[email protected]> Tested-by: Jenkins <[email protected]> Contrib: Jenkins <[email protected]> Integration-Tests: Jenkins <[email protected]> Reviewed-by: Till Westmann <[email protected]> --- M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql M asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql 12 files changed, 57 insertions(+), 31 deletions(-) Approvals: Till Westmann: Looks good to me, approved Jenkins: Verified; No violations found; ; Verified Objections: Anon. E. Moose #1000171: diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml index 4529e8b..9fc4c9a 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/create_aws_cluster.yml @@ -50,6 +50,12 @@ Name: "{{ spark_instance_name }}" aws_access_key: "{{ access_key_id }}" aws_secret_key: "{{ secret_access_key }}" + # Enable the volume section if you need extra disk space + #volumes: + # - device_name: /dev/xvda + # volume_type: gp2 + # volume_size: 50 + # delete_on_termination: true register: ec2 diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml index 267ab2a..155515f 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/install_hdfs.yml @@ -23,7 +23,7 @@ - include_vars: spark_sql_settings.yml - name: Download Hadoop get_url: - url: http://apache.mirrors.hoobly.com/hadoop/common/hadoop-2.8.0/hadoop-2.8.0.tar.gz + url: https://archive.apache.org/dist/hadoop/core/hadoop-2.8.0/hadoop-2.8.0.tar.gz dest: "{{ home_dir }}/hadoop.tar.gz" - name: Unzip Hadoop unarchive: diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml index 5b07eed..651d166 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/load_tpch.yml @@ -24,10 +24,24 @@ - name: Create TPCH data dir on HDFS shell: 'bash {{ hadoop_base }}/bin/hdfs dfs -mkdir -p {{ data_dir }}' -- hosts: ncs +- hosts: ncs[0] tasks: - include_vars: spark_sql_settings.yml - include_vars: "{{ playbook_dir }}/../conf/private_ip.yml" - include_vars: ../../../benchmarks/tpch/gen/settings.yml - - name: Put data on to HDFS - shell: for i in `ls -rS {{ data_dir }}/`; do {{ hadoop_base }}/bin/hdfs dfs -put -f {{ data_dir }}/$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; rm {{ data_dir }}/$i; done \ No newline at end of file + - name: Populate data uploading script for 1st node + shell: echo "for i in \`ls -rS {{ data_dir }}/*.tbl*\`; do {{ hadoop_base }}/bin/hdfs dfs -put -f \$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; done" > "{{ upload_script }}" + + +- hosts: ncs[1:] + tasks: + - include_vars: spark_sql_settings.yml + - include_vars: "{{ playbook_dir }}/../conf/private_ip.yml" + - include_vars: ../../../benchmarks/tpch/gen/settings.yml + - name: Put data uploading generation script for the rest + shell: echo "for i in \`ls -rS {{ data_dir }}/*.tbl.*\`; do {{ hadoop_base }}/bin/hdfs dfs -put -f \$i hdfs://{{ cc_ip }}:9000/{{ data_dir }}; done" > "{{ upload_script }}" + +- hosts: ncs + tasks: + - name: Upload data + shell: sh {{ upload_script }} \ No newline at end of file diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml index 1ff3f67..da13d3d 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/ansible/spark_sql_settings.yml @@ -42,3 +42,5 @@ binary_dir: "{{ home_dir }}/{{ generator }}" local_result: "/tmp/sparkSQL_Result.txt" + +upload_script: "{{ home_dir }}/upload.sh" diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql index e45fea1..f40cd2c 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q20.sql @@ -20,7 +20,7 @@ WITH Q20_TMP1 AS (SELECT DISTINCT P_PARTKEY FROM PART - WHERE P_NAME LIKE "FOREST%"), + WHERE P_NAME LIKE "forest%"), Q20_TMP2 AS (SELECT L_PARTKEY, L_SUPPKEY, diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql index db73e62..49cb859 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/backup_queries/q9.sql @@ -56,7 +56,7 @@ FROM NATION N JOIN SUPPLIER S ON N.N_NATIONKEY = S.S_NATIONKEY) S1 JOIN LINEITEM L ON S1.S_SUPPKEY = L.L_SUPPKEY) L1 ON PS.PS_SUPPKEY = L1.L_SUPPKEY - AND PS.S_PARTKEY = L1.L_PARTKEY) L2 ON P.P_NAME LIKE "%GREEN%" + AND PS.S_PARTKEY = L1.L_PARTKEY) L2 ON P.P_NAME LIKE "%green%" AND P.P_PARTKEY = L2.L_PARTKEY) L3 ON O.O_ORDERKEY = L3.L_ORDERKEY) PROFIT GROUP BY NATION, O_YEAR diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl index cf5bc36..4762135 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/conf/execute-query.tmpl @@ -72,6 +72,7 @@ spark.sqlContext.cacheTable("LINEITEM") // Execute Query +val writer0 = new PrintWriter(new File("{{home_dir}}/detail.txt")) val queries_dir = new File(queries_root) val etime = collection.mutable.Map[String, Float]() for (i <- 0 to round) { @@ -82,7 +83,7 @@ val t0 = System.nanoTime() var query = "" queries.getLines.foreach { line => query += (line + "\n")} - spark.sql(query).count() + spark.sql(query).collect().foreach(println) val t1 = System.nanoTime() val elapsed = (t1 - t0) / 1000000000.0f if (i > 0) { @@ -91,8 +92,12 @@ } etime(file_name) += elapsed } + writer0.print(file_name + " " + elapsed + " ") + writer0.flush() } + writer0.print("\n") } +writer0.close() // Write result val writer = new PrintWriter(new File("{{ result_file }}")) diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql index 4590598..c90dd7c 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q13.sql @@ -23,7 +23,7 @@ FROM ( SELECT C.C_CUSTKEY, COUNT(O.O_ORDERKEY) AS O_ORDERKEY_COUNT FROM (CUSTOMER C LEFT OUTER JOIN ORDERS O) - WHERE C.C_CUSTKEY = O.O_CUSTKEY AND O.O_COMMENT NOT LIKE "%SPECIAL%REQUESTS%" + WHERE C.C_CUSTKEY = O.O_CUSTKEY AND O.O_COMMENT NOT LIKE "%special%requests%" GROUP BY C.C_CUSTKEY ) CO GROUP BY C_CUSTKEY diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql index aea9188..b0686a4 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q16.sql @@ -30,11 +30,11 @@ FROM PARTSUPP PS, PART P WHERE P.P_PARTKEY = PS.S_PARTKEY - AND P.P_BRAND != "BRAND#45" + AND P.P_BRAND != "Brand#45" AND P.P_TYPE NOT LIKE "MEDIUM POLISHED%") AS PSP, SUPPLIER S WHERE PSP.PS_SUPPKEY = S.S_SUPPKEY - AND S.S_COMMENT NOT LIKE "%CUSTOMER%COMPLAINTS%") + AND S.S_COMMENT NOT LIKE "%Customer%Complaints%") SELECT P_BRAND, P_TYPE, P_SIZE, diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql index c3894fe..b51e530 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q17.sql @@ -22,12 +22,12 @@ 0.2 * AVG(L_QUANTITY) T_AVG_QUANTITY FROM LINEITEM GROUP BY L_PARTKEY) -SELECT * +SELECT SUM(L.L_EXTENDEDPRICE) / 7.0 FROM tmp T, LINEITEM L, PART P WHERE P.P_PARTKEY = L.L_PARTKEY AND P.P_CONTAINER = "MED BOX" - AND P.P_BRAND = "BRAND#23" + AND P.P_BRAND = "Brand#23" AND L.L_PARTKEY = T.T_PARTKEY AND L.L_QUANTITY < T.T_AVG_QUANTITY \ No newline at end of file diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql index 5cacbdb..01f8692 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q19.sql @@ -29,19 +29,19 @@ SELECT SUM(L.EXTNDPRICE * (1 - L.DISCOUNT)) FROM tmp L JOIN PART P ON P.P_PARTKEY = L.LPKEY -WHERE (P.P_BRAND = "BRAND#12" +WHERE (P.P_BRAND = "Brand#12" AND P.P_CONTAINER REGEXP "SM CASE|SM BOX|SM PACK|SM PKG" AND L.QUANTITY >= 1 AND L.QUANTITY <= 11 AND P.P_SIZE >= 1 AND P.P_SIZE <= 5) - OR (P.P_BRAND = "BRAND#23" + OR (P.P_BRAND = "Brand#23" AND P.P_CONTAINER REGEXP "MED BAG|MED BOX|MED PKG|MED PACK" AND L.QUANTITY >= 10 AND L.QUANTITY <= 20 AND P.P_SIZE >= 1 AND P.P_SIZE <= 10) - OR (P.P_BRAND = "BRAND#34" + OR (P.P_BRAND = "Brand#34" AND P.P_CONTAINER REGEXP "LG CASE|LG BOX|LG PACK|LG PKG" AND L.QUANTITY >= 20 AND L.QUANTITY <= 30 diff --git a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql index aadbb55..a932192 100644 --- a/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql +++ b/asterixdb/asterix-benchmark/src/main/resources/others/SparkSQL/queries/q3.sql @@ -17,19 +17,18 @@ -- under the License. -- ------------------------------------------------------------ -SELECT l.L_RETURNFLAG, - l.L_LINESTATUS, - sum(l.L_QUANTITY) AS sum_qty, - sum(l.L_EXTENDEDPRICE) AS sum_base_price, - sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT)) AS sum_disc_price, - sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT) * (1 + l.L_TAX)) AS sum_charge, - avg(l.l_quantity) AS ave_qty, - avg(l.L_EXTENDEDPRICE) AS ave_price, - avg(l.L_DISCOUNT) AS ave_disc, - count(*) AS count_order -FROM LINEITEM AS l -WHERE l.L_SHIPDATE <= "1998-09-02" -GROUP BY l.L_RETURNFLAG, - l.L_LINESTATUS -ORDER BY l.L_RETURNFLAG, - l.L_LINESTATUS \ No newline at end of file +SELECT l.L_ORDERKEY, + sum(l.L_EXTENDEDPRICE * (1 - l.L_DISCOUNT)) AS REVENUE, + o.O_ORDERDATE, + o.O_SHIPPRIORITY +FROM CUSTOMER AS c, + ORDERS AS o, + LINEITEM AS l +where c.C_MKTSEGMENT = 'BUILDING' + AND c.C_CUSTKEY = o.O_CUSTKEY + AND l.L_ORDERKEY = o.O_ORDERKEY + AND o.O_ORDERDATE < '1995-03-15' + AND l.L_SHIPDATE > '1995-03-15' +GROUP BY l.L_ORDERKEY, o.O_ORDERDATE, o.O_SHIPPRIORITY +ORDER BY REVENUE DESC,O_ORDERDATE +LIMIT 10 \ No newline at end of file -- To view, visit https://asterix-gerrit.ics.uci.edu/2011 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia0e04b3c80cc83322def2a949fc0ddf01fd8e7a8 Gerrit-PatchSet: 2 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Xikui Wang <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Till Westmann <[email protected]> Gerrit-Reviewer: Xikui Wang <[email protected]>
