It's a bug, could you file a JIRA for this? Thanks! Davies
On Thu, Oct 16, 2014 at 8:28 AM, Griffiths, Michael (NYC-RPM) <michael.griffi...@reprisemedia.com> wrote: > Hi, > > > > I’m running into an error on Windows (x64, 8.1) running Spark 1.1.0 > (pre-built for Hadoop 2.4: spark-1.1.0-bin-hadoop2.4.tgz) with Java SE > Version 8 Update 20 (build 1.8.0_20-b26); just getting started with Spark. > > > > When running sc.wholeTextFiles() on a directory, I can run the command but > not do anything with the resulting RDD – specifically, I get an error in > py4j.protocol.Py4JJavaError; the error is unspecified, though the location > is included. I’ve attached the traceback below. > > > > In this situation, I’m trying to load all files from a folder on the local > filesystem, located at D:\testdata. The folder contains one file, which can > be loaded successfully with sc.textFile("d:/testdata/filename") – no > problems at all – so I do not believe the file is throwing the error. > > > > Is there any advice on what I should look at further to isolate or fix the > error? Am I doing something obviously wrong? > > > > Thanks, > > Michael > > > > > > Welcome to > > ____ __ > > / __/__ ___ _____/ /__ > > _\ \/ _ \/ _ `/ __/ '_/ > > /__ / .__/\_,_/_/ /_/\_\ version 1.1.0 > > /_/ > > > > Using Python version 2.7.7 (default, Jun 11 2014 10:40:02) > > SparkContext available as sc. 
> >>>> file = >>>> sc.textFile("d:/testdata/0000cbcc5b470ec06f212990c68c8f76e887b884") > >>>> file.count() > > 732 > >>>> file.first() > > u'<!DOCTYPE html>' > >>>> data = sc.wholeTextFiles('d:/testdata') > >>>> data.first() > > Traceback (most recent call last): > > File "<stdin>", line 1, in <module> > > File "D:\spark\python\pyspark\rdd.py", line 1167, in first > > return self.take(1)[0] > > File "D:\spark\python\pyspark\rdd.py", line 1126, in take > > totalParts = self._jrdd.partitions().size() > > File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line > 538, in __call__ > > File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line > 300, in get_return_value > > py4j.protocol.Py4JJavaError: An error occurred while calling o21.partitions. > > : java.lang.NullPointerException > > at java.lang.ProcessBuilder.start(Unknown Source) > > at org.apache.hadoop.util.Shell.runCommand(Shell.java:445) > > at org.apache.hadoop.util.Shell.run(Shell.java:418) > > at > org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:739) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:722) > > at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534) > > at > org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263) > > at > 
org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54) > > at > org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202) > > at scala.Option.getOrElse(Option.scala:120) > > at org.apache.spark.rdd.RDD.partitions(RDD.scala:202) > > at > org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:50) > > at > org.apache.spark.api.java.JavaPairRDD.partitions(JavaPairRDD.scala:44) > > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > > at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) > > at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) > > at java.lang.reflect.Method.invoke(Unknown Source) > > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) > > at > py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) > > at py4j.Gateway.invoke(Gateway.java:259) > > at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) > > at py4j.commands.CallCommand.execute(CallCommand.java:79) > > at py4j.GatewayConnection.run(GatewayConnection.java:207) > > at java.lang.Thread.run(Unknown Source) > > > >>>> data.count() > > Traceback (most recent call last): > > File "<stdin>", line 1, in <module> > > File "D:\spark\python\pyspark\rdd.py", line 847, in count > > return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() > > File "D:\spark\python\pyspark\rdd.py", line 838, in sum > > return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) > > File "D:\spark\python\pyspark\rdd.py", line 759, in reduce > > vals = self.mapPartitions(func).collect() > > File "D:\spark\python\pyspark\rdd.py", line 723, in collect > > bytesInJava = self._jrdd.collect().iterator() > > File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line > 538, in __call__ > > File 
"D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line > 300, in get_return_value > > py4j.protocol.Py4JJavaError: An error occurred while calling o28.collect. > > : java.lang.NullPointerException > > at java.lang.ProcessBuilder.start(Unknown Source) > > at org.apache.hadoop.util.Shell.runCommand(Shell.java:445) > > at org.apache.hadoop.util.Shell.run(Shell.java:418) > > at > org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:739) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:722) > > at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534) > > at > org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263) > > at > org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54) > > at > org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202) > > at scala.Option.getOrElse(Option.scala:120) > > at org.apache.spark.rdd.RDD.partitions(RDD.scala:202) > > at > org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:56) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204) > > at > 
org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202) > > at scala.Option.getOrElse(Option.scala:120) > > at org.apache.spark.rdd.RDD.partitions(RDD.scala:202) > > at org.apache.spark.SparkContext.runJob(SparkContext.scala:1135) > > at org.apache.spark.rdd.RDD.collect(RDD.scala:774) > > at > org.apache.spark.api.java.JavaRDDLike$class.collect(JavaRDDLike.scala:305) > > at org.apache.spark.api.java.JavaRDD.collect(JavaRDD.scala:32) > > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > > at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) > > at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) > > at java.lang.reflect.Method.invoke(Unknown Source) > > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) > > at > py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) > > at py4j.Gateway.invoke(Gateway.java:259) > > at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) > > at py4j.commands.CallCommand.execute(CallCommand.java:79) > > at py4j.GatewayConnection.run(GatewayConnection.java:207) > > at java.lang.Thread.run(Unknown Source) > >>>> data.map(lambda x: len(x)).take(1) > > Traceback (most recent call last): > > File "<stdin>", line 1, in <module> > > File "D:\spark\python\pyspark\rdd.py", line 1126, in take > > totalParts = self._jrdd.partitions().size() > > File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line > 538, in __call__ > > File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line > 300, in get_return_value > > py4j.protocol.Py4JJavaError: An error occurred while calling o61.partitions. 
> > : java.lang.NullPointerException > > at java.lang.ProcessBuilder.start(Unknown Source) > > at org.apache.hadoop.util.Shell.runCommand(Shell.java:445) > > at org.apache.hadoop.util.Shell.run(Shell.java:418) > > at > org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:739) > > at org.apache.hadoop.util.Shell.execCommand(Shell.java:722) > > at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559) > > at > org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534) > > at > org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697) > > at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302) > > at > org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263) > > at > org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54) > > at > org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202) > > at scala.Option.getOrElse(Option.scala:120) > > at org.apache.spark.rdd.RDD.partitions(RDD.scala:202) > > at > org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:56) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204) > > at > org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202) > > at scala.Option.getOrElse(Option.scala:120) > > at org.apache.spark.rdd.RDD.partitions(RDD.scala:202) > > at > 
org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:50) > > at org.apache.spark.api.java.JavaRDD.partitions(JavaRDD.scala:32) > > at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) > > at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) > > at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) > > at java.lang.reflect.Method.invoke(Unknown Source) > > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) > > at > py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) > > at py4j.Gateway.invoke(Gateway.java:259) > > at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) > > at py4j.commands.CallCommand.execute(CallCommand.java:79) > > at py4j.GatewayConnection.run(GatewayConnection.java:207) > > at java.lang.Thread.run(Unknown Source) --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org