full code example: def main(args: Array[String]) { val conf = new SparkConf().setAppName("ErrorExample").setMaster("local[8]") .set("spark.serializer", classOf[KryoSerializer].getName) val sc = new SparkContext(conf)
val rdd = sc.hadoopFile( "hdfs://...../user.avro", classOf[org.apache.avro.mapred.AvroInputFormat[User]], classOf[org.apache.avro.mapred.AvroWrapper[User]], classOf[org.apache.hadoop.io.NullWritable], 1) val usersRDD = rdd.map({ case (u, _) => u.datum()}) usersRDD.foreach(println) println("-----------------") val collected = usersRDD.collect() collected.foreach(println) } output (without info logging etc.): {"id": "1", "name": "a"} {"id": "2", "name": "b"} {"id": "3", "name": "c"} {"id": "4", "name": "d"} {"id": "5", "name": "e"} {"id": "6", "name": "f"} ----------------- {"id": "6", "name": "f"} {"id": "6", "name": "f"} {"id": "6", "name": "f"} {"id": "6", "name": "f"} {"id": "6", "name": "f"} {"id": "6", "name": "f"} -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/collect-on-hadoopFile-RDD-returns-wrong-results-tp14368p14428.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org