[
https://issues.apache.org/jira/browse/SPARK-25733?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Bihui Jin updated SPARK-25733:
------------------------------
Attachment: report_dataset.zip.001
> The method toLocalIterator() with dataframe doesn't work
> --------------------------------------------------------
>
> Key: SPARK-25733
> URL: https://issues.apache.org/jira/browse/SPARK-25733
> Project: Spark
> Issue Type: Bug
> Components: PySpark
> Affects Versions: 2.3.1
> Environment: Spark in standalone mode, with 48 cores available.
> Reporter: Bihui Jin
> Priority: Major
> Attachments: report_dataset.zip.001, report_dataset.zip.002
>
>
> {color:#FF0000}The dataset that I used is attached.{color}
>
> First I loaded a dataframe from local disk:
> df = spark.read.load('report_dataset')
> There are about 200 partitions stored in S3, and the maximum partition size
> is 28.37MB.
>
> After the data was loaded, I executed "df.take(1)" to test the dataframe, and
> the expected output was printed:
> "[Row(s3_link='https://dcm-ul-phy.s3-china-1.eecloud.nsn-net.net/normal/run2/pool1/Tests.NbIot.NBCellSetupDelete.LTE3374_CellSetup_4x5M_2RX_3CELevel_Loop100.html',
> sequences=[364, 15, 184, 34, 524, 49, 30, 527, 44, 366, 125, 85, 69, 524,
> 49, 389, 575, 29, 179, 447, 168, 3, 223, 116, 573, 524, 49, 30, 527, 56, 366,
> 125, 85, 524, 118, 295, 440, 123, 389, 32, 575, 529, 192, 524, 49, 389, 575,
> 29, 179, 29, 140, 268, 96, 508, 389, 32, 575, 529, 192, 524, 49, 389, 575,
> 29, 179, 180, 451, 69, 286, 524, 49, 389, 575, 29, 42, 553, 451, 37, 125,
> 524, 49, 389, 575, 29, 42, 553, 451, 37, 125, 524, 49, 389, 575, 29, 42, 553,
> 451, 368, 125, 88, 588, 524, 49, 389, 575, 29, 42, 553, 451, 368, 125, 88,
> 588, 524, 49, 389, 575, 29, 42, 553, 451, 368, 125, 88, 588, 524, 49, 389],
> next_word=575, line_num=12)]"
>
> Then I tried to convert the dataframe to a local iterator and print one
> row of the dataframe for testing, using the code below:
> for row in df.toLocalIterator():
> print(row)
> break
> {color:#ff0000}*But no output was printed after that code was
> executed.*{color}
>
> Then I executed "df.take(1)" again, and the error below was reported:
> ERROR:root:Exception while sending command.
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1159, in send_command
> raise Py4JNetworkError("Answer from Java side is empty")
> py4j.protocol.Py4JNetworkError: Answer from Java side is empty
> During handling of the above exception, another exception occurred:
> ERROR:root:Exception while sending command.
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1159, in send_command
> raise Py4JNetworkError("Answer from Java side is empty")
> py4j.protocol.Py4JNetworkError: Answer from Java side is empty
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 985, in send_command
> response = connection.send_command(command)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1164, in send_command
> "Error while receiving", e, proto.ERROR_ON_RECEIVE)
> py4j.protocol.Py4JNetworkError: Error while receiving
> ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java
> server (127.0.0.1:37735)
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 2963, in run_code
> exec(code_obj, self.user_global_ns, self.user_ns)
> File "<ipython-input-7-3959105b378f>", line 1, in <module>
> df.take(1)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 504, in take
> return self.limit(num).collect()
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 493, in limit
> jdf = self._jdf.limit(num)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1257, in __call__
> answer, self.gateway_client, self.target_id, self.name)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63,
> in deco
> return f(*a, **kw)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/protocol.py", line 336, in
> get_return_value
> format(target_id, ".", name))
> py4j.protocol.Py4JError: An error occurred while calling o29.limit
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 1863, in showtraceback
> stb = value._render_traceback_()
> AttributeError: 'Py4JError' object has no attribute '_render_traceback_'
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 929, in _get_connection
> connection = self.deque.pop()
> IndexError: pop from an empty deque
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1067, in start
> self.socket.connect((self.address, self.port))
> ConnectionRefusedError: [Errno 111] Connection refused
> ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java
> server (127.0.0.1:37735)
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 2963, in run_code
> exec(code_obj, self.user_global_ns, self.user_ns)
> File "<ipython-input-7-3959105b378f>", line 1, in <module>
> df.take(1)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 504, in take
> return self.limit(num).collect()
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 493, in limit
> jdf = self._jdf.limit(num)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1257, in __call__
> answer, self.gateway_client, self.target_id, self.name)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63,
> in deco
> return f(*a, **kw)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/protocol.py", line 336, in
> get_return_value
> format(target_id, ".", name))
> py4j.protocol.Py4JError: An error occurred while calling o29.limit
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 1863, in showtraceback
> stb = value._render_traceback_()
> AttributeError: 'Py4JError' object has no attribute '_render_traceback_'
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 929, in _get_connection
> connection = self.deque.pop()
> IndexError: pop from an empty deque
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1067, in start
> self.socket.connect((self.address, self.port))
> ConnectionRefusedError: [Errno 111] Connection refused
> ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java
> server (127.0.0.1:37735)
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 2963, in run_code
> exec(code_obj, self.user_global_ns, self.user_ns)
> File "<ipython-input-7-3959105b378f>", line 1, in <module>
> df.take(1)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 504, in take
> return self.limit(num).collect()
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py", line
> 493, in limit
> jdf = self._jdf.limit(num)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1257, in __call__
> answer, self.gateway_client, self.target_id, self.name)
> File "/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/utils.py", line 63,
> in deco
> return f(*a, **kw)
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/protocol.py", line 336, in
> get_return_value
> format(target_id, ".", name))
> py4j.protocol.Py4JError: An error occurred while calling o29.limit
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File
> "/opt/k2-v02/lib/python3.6/site-packages/IPython/core/interactiveshell.py",
> line 1863, in showtraceback
> stb = value._render_traceback_()
> AttributeError: 'Py4JError' object has no attribute '_render_traceback_'
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 929, in _get_connection
> connection = self.deque.pop()
> IndexError: pop from an empty deque
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
> File "/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py", line
> 1067, in start
> self.socket.connect((self.address, self.port))
> ConnectionRefusedError: [Errno 111] Connection refused
>
>
> {color:#e75c58}---------------------------------------------------------------------------{color}{color:#e75c58}Py4JError{color}
> Traceback (most recent call
> last){color:#00a250}<ipython-input-7-3959105b378f>{color} in
> {color:#60c6c8}<module>{color}{color:#208ffb}(){color}{color:#00a250}---->
> 1{color}
> df{color:#208ffb}.{color}take{color:#208ffb}({color}{color:#60c6c8}1{color}{color:#208ffb}){color}{color:#00a250}/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py{color}
> in {color:#60c6c8}take{color}{color:#208ffb}(self,
> num){color}{color:#00a250} 502{color}
> {color:#208ffb}[{color}Row{color:#208ffb}({color}age{color:#208ffb}={color}{color:#60c6c8}2{color}{color:#208ffb},{color}
>
> name{color:#208ffb}={color}{color:#208ffb}u'Alice'{color}{color:#208ffb}){color}{color:#208ffb},{color}
>
> Row{color:#208ffb}({color}age{color:#208ffb}={color}{color:#60c6c8}5{color}{color:#208ffb},{color}
>
> name{color:#208ffb}={color}{color:#208ffb}u'Bob'{color}{color:#208ffb}){color}{color:#208ffb}]{color}{color:#00a250}
> 503{color} """{color:#00a250}--> 504{color} {color:#00a250}return{color}
> self{color:#208ffb}.{color}limit{color:#208ffb}({color}num{color:#208ffb}){color}{color:#208ffb}.{color}collect{color:#208ffb}({color}{color:#208ffb}){color}{color:#00a250}
> 505{color}{color:#00a250} 506{color}
> {color:#208ffb}@{color}since{color:#208ffb}({color}{color:#60c6c8}1.3{color}{color:#208ffb}){color}{color:#00a250}/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/dataframe.py{color}
> in {color:#60c6c8}limit{color}{color:#208ffb}(self,
> num){color}{color:#00a250} 491{color}
> {color:#208ffb}[{color}{color:#208ffb}]{color}{color:#00a250} 492{color}
> """{color:#00a250}--> 493{color} jdf {color:#208ffb}={color}
> self{color:#208ffb}.{color}_jdf{color:#208ffb}.{color}limit{color:#208ffb}({color}num{color:#208ffb}){color}{color:#00a250}
> 494{color} {color:#00a250}return{color}
> DataFrame{color:#208ffb}({color}jdf{color:#208ffb},{color}
> self{color:#208ffb}.{color}sql_ctx{color:#208ffb}){color}{color:#00a250}
> 495{color}{color:#00a250}/opt/k2-v02/lib/python3.6/site-packages/py4j/java_gateway.py{color}
> in {color:#60c6c8}__call__{color}{color:#208ffb}(self,
> *args){color}{color:#00a250} 1255{color} answer {color:#208ffb}={color}
> self{color:#208ffb}.{color}gateway_client{color:#208ffb}.{color}send_command{color:#208ffb}({color}command{color:#208ffb}){color}{color:#00a250}
> 1256{color} return_value = get_return_value({color:#00a250}->
> 1257{color}{color:#e75c58} answer, self.gateway_client, self.target_id,
> self.name){color}{color:#00a250} 1258{color}{color:#00a250} 1259{color}
> {color:#00a250}for{color} temp_arg {color:#00a250}in{color}
> temp_args{color:#208ffb}:{color}{color:#00a250}/opt/k2-v02/lib/python3.6/site-packages/pyspark/sql/utils.py{color}
> in {color:#60c6c8}deco{color}{color:#208ffb}(*a, **kw){color}{color:#00a250}
> 61{color} {color:#00a250}def{color}
> deco{color:#208ffb}({color}{color:#208ffb}*{color}a{color:#208ffb},{color}
> {color:#208ffb}**{color}kw{color:#208ffb}){color}{color:#208ffb}:{color}{color:#00a250}
> 62{color}
> {color:#00a250}try{color}{color:#208ffb}:{color}{color:#00a250}---> 63{color}
> {color:#00a250}return{color}
> f{color:#208ffb}({color}{color:#208ffb}*{color}a{color:#208ffb},{color}
> {color:#208ffb}**{color}kw{color:#208ffb}){color}{color:#00a250} 64{color}
> {color:#00a250}except{color}
> py4j{color:#208ffb}.{color}protocol{color:#208ffb}.{color}Py4JJavaError
> {color:#00a250}as{color} e{color:#208ffb}:{color}{color:#00a250} 65{color} s
> {color:#208ffb}={color}
> e{color:#208ffb}.{color}java_exception{color:#208ffb}.{color}toString{color:#208ffb}({color}{color:#208ffb}){color}{color:#00a250}/opt/k2-v02/lib/python3.6/site-packages/py4j/protocol.py{color}
> in {color:#60c6c8}get_return_value{color}{color:#208ffb}(answer,
> gateway_client, target_id, name){color}{color:#00a250} 334{color} raise
> Py4JError({color:#00a250} 335{color} {color:#208ffb}"An error occurred while
> calling \{0}{1}\{2}"{color}{color:#208ffb}.{color}{color:#00a250}-->
> 336{color}{color:#e75c58} format(target_id, ".", name)){color}{color:#00a250}
> 337{color} {color:#00a250}else{color}{color:#208ffb}:{color}{color:#00a250}
> 338{color} type {color:#208ffb}={color}
> answer{color:#208ffb}[{color}{color:#60c6c8}1{color}{color:#208ffb}]{color}{color:#e75c58}Py4JError{color}:
> An error occurred while calling o29.limit
>
>
>
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]