[
https://issues.apache.org/jira/browse/SPARK-23814?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
bharath kumar avusherla updated SPARK-23814:
--------------------------------------------
Description:
When a file name contains a colon and the data contains a newline character,
reading it with spark.read.option("multiLine","true").csv("s3n://Directory/")
throws the error *"java.lang.IllegalArgumentException:
java.net.URISyntaxException: Relative path in absolute URI:
2017-08-01T00:00:00Z.csv.gz"*. If we remove
option("multiLine","true"), the read works fine even though the file name has a
colon in it. It also works fine if I apply
*option("multiLine","true")* to any other file whose name doesn't contain a colon.
But when both are present (a colon in the file name and a newline in the data),
it does not work.
{quote}java.lang.IllegalArgumentException: java.net.URISyntaxException:
Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz
at org.apache.hadoop.fs.Path.initialize(Path.java:205)
at org.apache.hadoop.fs.Path.<init>(Path.java:171)
at org.apache.hadoop.fs.Path.<init>(Path.java:93)
at org.apache.hadoop.fs.Globber.glob(Globber.java:253)
at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676)
at
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294)
at
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265)
at
org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51)
at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
at
org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224)
at
org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62)
at
org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
at scala.Option.orElse(Option.scala:289)
at
org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176)
at
org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)
... 48 elided
Caused by: java.net.URISyntaxException: Relative path in absolute URI:
2017-08-01T00:00:00Z.csv.gz
at java.net.URI.checkPath(URI.java:1823)
at java.net.URI.<init>(URI.java:745)
at org.apache.hadoop.fs.Path.initialize(Path.java:202)
... 86 more
{quote}
was:
When the file name has colon and new line character in data, while reading
using spark.read.option("multiLine","true").csv("sn://Directory/") function. It
is throwing *"**java.lang.IllegalArgumentException:
java.net.URISyntaxException: Relative path in absolute URI:
2017-08-01T00:00:00Z.csv.gz"* error. If we remove the
option("multiLine","true"), it is working just fine though the file name has
colon in it. It is working fine, If i apply this option
*option("multiLine","true")* on any other file which doesn't have colon in it.
But when both are present (colon in file name and new line in the data), it's
not working.
{quote}java.lang.IllegalArgumentException: java.net.URISyntaxException:
Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz
at org.apache.hadoop.fs.Path.initialize(Path.java:205)
at org.apache.hadoop.fs.Path.<init>(Path.java:171)
at org.apache.hadoop.fs.Path.<init>(Path.java:93)
at org.apache.hadoop.fs.Globber.glob(Globber.java:253)
at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676)
at
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294)
at
org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265)
at
org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51)
at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at
org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
at scala.Option.getOrElse(Option.scala:121)
at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at
org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
at
org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224)
at
org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62)
at
org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
at
org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
at scala.Option.orElse(Option.scala:289)
at
org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176)
at
org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)
... 48 elided
Caused by: java.net.URISyntaxException: Relative path in absolute URI:
2017-08-01T00:00:00Z.csv.gz
at java.net.URI.checkPath(URI.java:1823)
at java.net.URI.<init>(URI.java:745)
at org.apache.hadoop.fs.Path.initialize(Path.java:202)
... 86 more
{quote}
> Couldn't read file with a colon in its name and a newline character in one of
> the fields.
> ---------------------------------------------------------------------------------
>
> Key: SPARK-23814
> URL: https://issues.apache.org/jira/browse/SPARK-23814
> Project: Spark
> Issue Type: Bug
> Components: Spark Core, Spark Shell
> Affects Versions: 2.2.0
> Reporter: bharath kumar avusherla
> Priority: Major
>
> When a file name contains a colon and the data contains a newline character,
> reading it with spark.read.option("multiLine","true").csv("s3n://Directory/")
> throws the error *"java.lang.IllegalArgumentException:
> java.net.URISyntaxException: Relative path in absolute URI:
> 2017-08-01T00:00:00Z.csv.gz"*. If we remove
> option("multiLine","true"), the read works fine even though the file name has a
> colon in it. It also works fine if I apply
> *option("multiLine","true")* to any other file whose name doesn't contain a
> colon. But when both are present (a colon in the file name and a newline in the
> data), it does not work.
> {quote}java.lang.IllegalArgumentException: java.net.URISyntaxException:
> Relative path in absolute URI: 2017-08-01T00:00:00Z.csv.gz
> at org.apache.hadoop.fs.Path.initialize(Path.java:205)
> at org.apache.hadoop.fs.Path.<init>(Path.java:171)
> at org.apache.hadoop.fs.Path.<init>(Path.java:93)
> at org.apache.hadoop.fs.Globber.glob(Globber.java:253)
> at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:1676)
> at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:294)
> at
> org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:265)
> at
> org.apache.spark.input.StreamFileInputFormat.setMinPartitions(PortableDataStream.scala:51)
> at org.apache.spark.rdd.BinaryFileRDD.getPartitions(BinaryFileRDD.scala:46)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
> at scala.Option.getOrElse(Option.scala:121)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
> at
> org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
> at scala.Option.getOrElse(Option.scala:121)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
> at
> org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:252)
> at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:250)
> at scala.Option.getOrElse(Option.scala:121)
> at org.apache.spark.rdd.RDD.partitions(RDD.scala:250)
> at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1333)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
> at org.apache.spark.rdd.RDD.take(RDD.scala:1327)
> at
> org.apache.spark.sql.execution.datasources.csv.MultiLineCSVDataSource$.infer(CSVDataSource.scala:224)
> at
> org.apache.spark.sql.execution.datasources.csv.CSVDataSource.inferSchema(CSVDataSource.scala:62)
> at
> org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.inferSchema(CSVFileFormat.scala:57)
> at
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
> at
> org.apache.spark.sql.execution.datasources.DataSource$$anonfun$7.apply(DataSource.scala:177)
> at scala.Option.orElse(Option.scala:289)
> at
> org.apache.spark.sql.execution.datasources.DataSource.getOrInferFileFormatSchema(DataSource.scala:176)
> at
> org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366)
> at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
> at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:533)
> at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:412)
> ... 48 elided
> Caused by: java.net.URISyntaxException: Relative path in absolute URI:
> 2017-08-01T00:00:00Z.csv.gz
> at java.net.URI.checkPath(URI.java:1823)
> at java.net.URI.<init>(URI.java:745)
> at org.apache.hadoop.fs.Path.initialize(Path.java:202)
> ... 86 more
> {quote}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]