zhztheplayer commented on code in PR #5278:
URL: https://github.com/apache/incubator-gluten/pull/5278#discussion_r1549007953
##########
docs/get-started/Work-with-pyspark.ipynb:
##########
@@ -0,0 +1,145 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "509d5a1e-10f2-4294-a104-9fdb85a29b0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "SLF4J: Class path contains multiple SLF4J bindings.\n",
+ "SLF4J: Found binding in [jar:file:/root/spark-3.3.1-bin-hadoop2/jars/log4j-slf4j-impl-2.17.2.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n",
+ "SLF4J: Found binding in [jar:file:/root/hadoop-2.7.7/share/hadoop/common/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n",
+ "SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n",
+ "SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n",
+ "24/04/03 22:04:56 WARN Utils: Your hostname, vsr542 resolves to a loopback address: 127.0.1.1; using 10.0.2.142 instead (on interface eno1)\n",
+ "24/04/03 22:04:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
+ "Setting default log level to \"WARN\".\n",
+ "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+ "24/04/03 22:04:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+ "24/04/03 22:05:00 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import findspark\n",
+ "findspark.init()\n",
+ "\n",
+ "from pyspark import SparkConf, SparkContext\n",
+ "nativesql_jars = \"/path/to/gluten-XXXX.jar\"\n",
+ "conf = SparkConf().setAppName(\"PySpark Gluten\").setMaster(\"yarn\")\n",
+ "conf.set(\"spark.executor.instances\", \"2\")\n",
+ "conf.set(\"spark.executor.memory\", \"6g\")\n",
+ "conf.set(\"spark.executor.cores\", \"2\")\n",
+ "conf.set(\"spark.driver.memory\", \"2g\")\n",
+ "conf.set(\"spark.memory.offHeap.enabled\", \"true\")\n",
+ "conf.set(\"spark.memory.offHeap.size\", \"2g\")\n",
+ "conf.set(\"spark.executor.memoryOverhead\", \"384M\")\n",
+ "conf.set(\"spark.driver.extraClassPath\", nativesql_jars)\n",
+ "conf.set(\"spark.executor.extraClassPath\", nativesql_jars)\n",
+ "conf.set(\"spark.plugins\", \"org.apache.gluten.GlutenPlugin\")\n",
+ "conf.set(\"spark.gluten.sql.columnar.backend.lib\", \"velox\")\n",
+ "conf.set(\"spark.gluten.loadLibFromJar\", \"false\")\n",
+ "conf.set(\"spark.shuffle.manager\", \"org.apache.spark.shuffle.sort.ColumnarShuffleManager\")\n",
+ "sc = SparkContext(conf=conf)\n",
+ "from pyspark.sql import SparkSession\n",
+ "spark_session = SparkSession(sc)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "6c4ec66a-3b8c-4a65-b948-8420b492333d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "== Physical Plan ==\n",
+ "VeloxColumnarToRowExec\n",
+ "+- ^(1) FilterExecTransformer (isnotnull(category#1) AND (category#1 = Cellphone))\n",
+ " +- ^(1) InputIteratorTransformer[product#0, category#1, revenue#2L]\n",
+ " +- ^(1) InputAdapter\n",
+ " +- ^(1) RowToVeloxColumnar\n",
+ " +- *(1) Scan ExistingRDD[product#0,category#1,revenue#2L]\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = spark_session.createDataFrame(\n",
+ " [\n",
+ " (\"Normal\", \"Cellphone\", 6000),\n",
+ " (\"Normal\", \"Tablet\", 1500),\n",
+ " (\"Mini\", \"Tablet\", 5500),\n",
+ " (\"Mini\", \"Cellphone\", 5000),\n",
+ " (\"Foldable\", \"Cellphone\", 6500),\n",
+ " (\"Foldable\", \"Tablet\", 2500),\n",
+ " (\"Pro\", \"Cellphone\", 3000),\n",
+ " (\"Pro\", \"Tablet\", 4000),\n",
+ " (\"Pro Max\", \"Cellphone\", 4500)\n",
+ " ],\n",
+ " [\"product\", \"category\", \"revenue\"]\n",
+ ")\n",
+ "df.filter(\"category = 'Cellphone'\").explain()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "47e7593d-aaf7-4608-b13a-f0d948f39a55",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
Review Comment:
This last cell (empty `source` and `outputs`) is rendered as an empty code block. Is that intended?
https://github.com/apache/incubator-gluten/blob/36bff654a017a19c2a3acd9223f71d0b330bdd41/docs/get-started/Work-with-pyspark.ipynb
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]