On 26 Apr 2016, at 18:49, Ted Yu <yuzhih...@gmail.com> wrote:

Looking at the cause of the error, it seems hadoop-aws-xx.jar (matching the version of Hadoop you use) was missing from the classpath.

Yes: s3n was moved from hadoop-common into the new hadoop-aws module, and that move broke a lot of things before anyone realised it.

You'll need hadoop-aws and jets3t on the classpath for s3n to work.

If you are using Hadoop 2.7, I'd recommend s3a instead: that means hadoop-aws plus the exact same AWS SDK JAR that is bundled with the Hadoop binaries your version of Spark was built against (the SDK is a bit brittle API-wise).
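
In practice that can look something like the sketch below. It is purely illustrative and untested against your setup: the bucket/key come from your own code, the credential values are placeholders, and it assumes hadoop-aws plus the matching AWS SDK jar are already on the driver and executor classpath (jets3t would be the extra dependency only if you stay on s3n).

    import org.apache.spark.SparkConf;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.api.java.JavaSparkContext;

    public class S3AReadSketch {
        public static void main(String[] args) {
            SparkConf conf = new SparkConf().setAppName("s3a-read").setMaster("local[*]");
            JavaSparkContext sc = new JavaSparkContext(conf);

            // Map the s3a:// scheme to the S3A FileSystem
            // (usually redundant on Hadoop 2.7+, where core-default.xml already declares it).
            sc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");
            // Placeholder credentials: substitute your own keys, or rely on env vars / IAM roles.
            sc.hadoopConfiguration().set("fs.s3a.access.key", "YOUR_ACCESS_KEY");
            sc.hadoopConfiguration().set("fs.s3a.secret.key", "YOUR_SECRET_KEY");

            // Same object as in the code below, read through s3a:// instead of s3n://.
            JavaRDD<String> lines = sc.textFile(
                "s3a://google-books-ngram/1gram/googlebooks-eng-all-1gram-20120701-x.gz");
            System.out.println("lines read: " + lines.count());

            sc.stop();
        }
    }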


FYI

On Tue, Apr 26, 2016 at 9:06 AM, Jinan Alhajjaj <j.r.alhaj...@hotmail.com> wrote:
Hi All,
I am trying to read a file stored in Amazon S3.
I wrote this code:

import java.util.List;
import java.util.Scanner;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

public class WordAnalysis {

    public static void main(String[] args) {
        int startYear = 0;
        int endYear = 0;
        Scanner input = new Scanner(System.in);
        System.out.println("Please enter 1 if you want 1599-2008, or enter 2 for a specific range: ");
        int choice = input.nextInt();

        if (choice == 1) {
            startYear = 1500;
            endYear = 2008;
        }
        if (choice == 2) {
            System.out.print("Please enter the start year: ");
            startYear = input.nextInt();
            System.out.print("Please enter the end year: ");
            endYear = input.nextInt();
        }

        SparkConf conf = new SparkConf().setAppName("jinantry").setMaster("local");
        JavaSparkContext spark = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(spark);

        // Parse each tab-separated 1-gram record into an Items bean.
        JavaRDD<Items> ngram = spark
            .textFile("s3n://google-books-ngram/1gram/googlebooks-eng-all-1gram-20120701-x.gz")
            .map(new Function<String, Items>() {
                public Items call(String line) throws Exception {
                    String[] parts = line.split("\t");
                    Items item = new Items();
                    if (parts.length == 4) {
                        item.setWord(parts[0]);
                        item.setYear(Integer.parseInt(parts[1]));
                        item.setCount(Integer.parseInt(parts[2]));
                        item.setVolume(Integer.parseInt(parts[3]));
                    } else {
                        // Malformed line: fall back to empty/zero values
                        // (Integer.parseInt(" ") would throw NumberFormatException).
                        item.setWord(" ");
                        item.setYear(0);
                        item.setCount(0);
                        item.setVolume(0);
                    }
                    return item;
                }
            });

        DataFrame schemangram = sqlContext.createDataFrame(ngram, Items.class);
        schemangram.registerTempTable("ngram");

        String sql = "SELECT word, SUM(count) FROM ngram WHERE year >= " + startYear
            + " AND year <= " + endYear
            + " AND word LIKE '%_NOUN' GROUP BY word ORDER BY SUM(count) DESC";
        DataFrame matchyear = sqlContext.sql(sql);

        // Print the top 10 rows.
        List<Row> words = matchyear.collectAsList();
        int i = 1;
        for (Row scholar : words) {
            System.out.println(scholar);
            if (i == 10)
                break;
            i++;
        }
    }
}


When I run it, this error appears:

Exception in thread "main" org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange rangepartitioning(aggOrder#5L DESC,200), None
+- ConvertToSafe
   +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
      +- TungstenExchange hashpartitioning(word#2,200), None
         +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
            +- Project [word#2,count#0]
               +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
                  +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]

at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.ConvertToUnsafe.doExecute(rowFormatConverters.scala:38)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.Sort.doExecute(Sort.scala:64)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.Project.doExecute(basicOperators.scala:46)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:55)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:55)
at org.apache.spark.sql.DataFrame.rdd$lzycompute(DataFrame.scala:1637)
at org.apache.spark.sql.DataFrame.rdd(DataFrame.scala:1634)
at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1$$anonfun$apply$12.apply(DataFrame.scala:1493)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:56)
at org.apache.spark.sql.DataFrame.withNewExecutionId(DataFrame.scala:2086)
at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1492)
at org.apache.spark.sql.DataFrame$$anonfun$collectAsList$1.apply(DataFrame.scala:1491)
at org.apache.spark.sql.DataFrame.withCallback(DataFrame.scala:2099)
at org.apache.spark.sql.DataFrame.collectAsList(DataFrame.scala:1491)
at WordAnalysis.main(WordAnalysis.java:60)

Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Final,isDistinct=false)], output=[word#2,_c1#4L,aggOrder#5L])
+- TungstenExchange hashpartitioning(word#2,200), None
   +- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
      +- Project [word#2,count#0]
         +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
            +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]

at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
at org.apache.spark.sql.execution.aggregate.TungstenAggregate.doExecute(TungstenAggregate.scala:80)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.ConvertToSafe.doExecute(rowFormatConverters.scala:56)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:164)
at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
... 33 more

Caused by: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
TungstenExchange hashpartitioning(word#2,200), None
+- TungstenAggregate(key=[word#2], functions=[(sum(cast(count#0 as bigint)),mode=Partial,isDistinct=false)], output=[word#2,sum#8L])
   +- Project [word#2,count#0]
      +- Filter (((year#3 >= 1500) && (year#3 <= 1600)) && word#2 LIKE %_NOUN)
         +- Scan ExistingRDD[count#0,volume#1,word#2,year#3]

at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:49)
at org.apache.spark.sql.execution.Exchange.doExecute(Exchange.scala:247)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$5.apply(SparkPlan.scala:130)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:130)
at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:86)
at org.apache.spark.sql.execution.aggregate.TungstenAggregate$$anonfun$doExecute$1.apply(TungstenAggregate.scala:80)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
... 47 more

Caused by: java.io.IOException: No FileSystem for scheme: s3n
at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2584)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2591)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296)
at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256)
at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91)
at org.apache.spark.sql.execution.Exchange.prepareShuffleDependency(Exchange.scala:220)
at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:254)
at org.apache.spark.sql.execution.Exchange$$anonfun$doExecute$1.apply(Exchange.scala:248)
at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:48)
... 55 more

Could anyone help me with this?

Thank you

