This code is in Python. Also, I tried with a forward slash at the end, with the same result. On 26 Apr 2015 01:36, "Jeetendra Gangele" <gangele...@gmail.com> wrote:
> also if this code is in scala why not val in newsY? is this define above? > loc = "D:\\Project\\Spark\\code\\news\\jsonfeeds" > newsY = sc.textFile(loc) > print newsY.count() > > On 25 April 2015 at 20:08, ayan guha <guha.a...@gmail.com> wrote: > >> Hi >> >> I am facing this weird issue..... >> >> I am on Windows, and I am trying to load all files within a folder. Here >> is my code - >> >> loc = "D:\\Project\\Spark\\code\\news\\jsonfeeds" >> newsY = sc.textFile(loc) >> print newsY.count() >> >> Even this simple code fails. I have tried with giving exact file names, >> everything works. >> >> Am I missing something stupid here? Anyone facing this (anyone still use >> windows?:)) >> >> Here is error trace: >> >> D:\Project\Spark\code\news\jsonfeeds >> >> Traceback (most recent call last): >> File "D:/Project/Spark/code/newsfeeder.py", line 28, in <module> >> print newsY.count() >> File >> "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", >> line 932, in count >> return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum() >> File >> "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", >> line 923, in sum >> return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add) >> File >> "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", >> line 739, in reduce >> vals = self.mapPartitions(func).collect() >> File >> "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", >> line 713, in collect >> port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd()) >> File "C:\Python27\lib\site-packages\py4j\java_gateway.py", line 537, in >> __call__ >> self.target_id, self.name) >> File "C:\Python27\lib\site-packages\py4j\protocol.py", line 300, in >> get_return_value >> format(target_id, '.', name), value) >> Py4JJavaError: An error 
occurred while calling >> z:org.apache.spark.api.python.PythonRDD.collectAndServe. >> : java.lang.NullPointerException >> >> at java.lang.ProcessBuilder.start(Unknown Source) >> >> at org.apache.hadoop.util.Shell.runCommand(Shell.java:482) >> >> at org.apache.hadoop.util.Shell.run(Shell.java:455) >> >> at >> org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:715) >> >> at org.apache.hadoop.util.Shell.execCommand(Shell.java:808) >> >> at org.apache.hadoop.util.Shell.execCommand(Shell.java:791) >> >> at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097) >> >> at >> org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:582) >> >> at >> org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:557) >> >> at >> org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42) >> >> at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1699) >> >> at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1681) >> >> at >> org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:268) >> >> at >> org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228) >> >> at >> org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313) >> >> at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:203) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217) >> >> at scala.Option.getOrElse(Option.scala:120) >> >> at org.apache.spark.rdd.RDD.partitions(RDD.scala:217) >> >> at >> org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217) >> >> at scala.Option.getOrElse(Option.scala:120) >> >> at 
org.apache.spark.rdd.RDD.partitions(RDD.scala:217) >> >> at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:57) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219) >> >> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217) >> >> at scala.Option.getOrElse(Option.scala:120) >> >> at org.apache.spark.rdd.RDD.partitions(RDD.scala:217) >> >> at org.apache.spark.SparkContext.runJob(SparkContext.scala:1512) >> >> at org.apache.spark.rdd.RDD.collect(RDD.scala:813) >> >> at >> org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:374) >> >> at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala) >> >> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) >> >> at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) >> >> at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) >> >> at java.lang.reflect.Method.invoke(Unknown Source) >> >> at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) >> >> at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) >> >> at py4j.Gateway.invoke(Gateway.java:259) >> >> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) >> >> at py4j.commands.CallCommand.execute(CallCommand.java:79) >> >> at py4j.GatewayConnection.run(GatewayConnection.java:207) >> >> at java.lang.Thread.run(Unknown Source) >> >> -- >> Best Regards, >> Ayan Guha >> > > > > >