[ https://issues.apache.org/jira/browse/SPARK-42013?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17676570#comment-17676570 ]
Apache Spark commented on SPARK-42013: -------------------------------------- User 'zhengruifeng' has created a pull request for this issue: https://github.com/apache/spark/pull/39553 > Implement DataFrameReader.text to take multiple paths > ----------------------------------------------------- > > Key: SPARK-42013 > URL: https://issues.apache.org/jira/browse/SPARK-42013 > Project: Spark > Issue Type: Sub-task > Components: Connect > Affects Versions: 3.4.0 > Reporter: Hyukjin Kwon > Priority: Major > > {code} > java.io.IOException: Illegal file pattern: error parsing regexp: Unclosed > character class at pos 8: `['python` > at org.apache.hadoop.fs.GlobFilter.init(GlobFilter.java:71) > at org.apache.hadoop.fs.GlobFilter.<init>(GlobFilter.java:50) > at org.apache.hadoop.fs.Globber.doGlob(Globber.java:265) > at org.apache.hadoop.fs.Globber.glob(Globber.java:202) > at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2124) > at > org.apache.spark.deploy.SparkHadoopUtil.globPath(SparkHadoopUtil.scala:254) > at > org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:736) > at > org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:393) > at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659) > at scala.util.Success.$anonfun$map$1(Try.scala:255) > at scala.util.Success.map(Try.scala:213) > at scala.concurrent.Future.$anonfun$map$1(Future.scala:292) > at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33) > at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33) > at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64) > at > java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(ForkJoinTask.java:1402) > at java.util.concurrent.ForkJoinTask.doExec(ForkJoinTask.java:289) > at > java.util.concurrent.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1067) > at java.util.concurrent.ForkJoinPool.runWorker(ForkJoinPool.java:1703) > at > 
java.util.concurrent.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:172) > Caused by: org.apache.hadoop.shaded.com.google.re2j.PatternSyntaxException: > error parsing regexp: Unclosed character class at pos 8: `['python` > at org.apache.hadoop.fs.GlobPattern.error(GlobPattern.java:168) > at org.apache.hadoop.fs.GlobPattern.set(GlobPattern.java:151) > at org.apache.hadoop.fs.GlobPattern.<init>(GlobPattern.java:42) > at org.apache.hadoop.fs.GlobFilter.init(GlobFilter.java:67) > ... 19 more > pyspark/sql/tests/test_datasources.py:123 > (DataSourcesParityTests.test_read_text_file_list) > self = > <pyspark.sql.tests.connect.test_parity_datasources.DataSourcesParityTests > testMethod=test_read_text_file_list> > def test_read_text_file_list(self): > df = self.spark.read.text( > ["python/test_support/sql/text-test.txt", > "python/test_support/sql/text-test.txt"] > ) > > count = df.count() > ../test_datasources.py:128: > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > _ > ../../connect/dataframe.py:177: in count > pdd = self.agg(_invoke_function("count", lit(1))).toPandas() > ../../connect/dataframe.py:1297: in toPandas > return self._session.client.to_pandas(query) > ../../connect/client.py:422: in to_pandas > table, metrics = self._execute_and_fetch(req) > ../../connect/client.py:593: in _execute_and_fetch > self._handle_error(rpc_error) > _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ > _ > self = <pyspark.sql.connect.client.SparkConnectClient object at > 0x7fb160b85580> > rpc_error = <_MultiThreadedRendezvous of RPC that terminated with: > status = StatusCode.INTERNAL > details = "Illegal file pattern:...tatus:13, grpc_message:"Illegal file > pattern: error parsing regexp: Unclosed character class at pos 8: > `[\'python`"}" > > > def _handle_error(self, rpc_error: grpc.RpcError) -> NoReturn: > """ > Error handling helper for dealing with GRPC Errors. 
On the server > side, certain > exceptions are enriched with additional RPC Status information. These > are > unpacked in this function and put into the exception. > > To avoid overloading the user with GRPC errors, this message > explicitly > swallows the error context from the call. This GRPC Error is logged > however, > and can be enabled. > > Parameters > ---------- > rpc_error : grpc.RpcError > RPC Error containing the details of the exception. > > Returns > ------- > Throws the appropriate internal Python exception. > """ > logger.exception("GRPC Error received") > # We have to cast the value here because, a RpcError is a Call as > well. > # > https://grpc.github.io/grpc/python/grpc.html#grpc.UnaryUnaryMultiCallable.__call__ > status = rpc_status.from_call(cast(grpc.Call, rpc_error)) > if status: > for d in status.details: > if d.Is(error_details_pb2.ErrorInfo.DESCRIPTOR): > info = error_details_pb2.ErrorInfo() > d.Unpack(info) > if info.reason == > "org.apache.spark.sql.AnalysisException": > raise SparkConnectAnalysisException( > info.reason, info.metadata["message"], > info.metadata["plan"] > ) from None > else: > raise SparkConnectException(status.message, > info.reason) from None > > > raise SparkConnectException(status.message) from None > E pyspark.sql.connect.client.SparkConnectException: Illegal file > pattern: error parsing regexp: Unclosed character class at pos 8: `['python` > ../../connect/client.py:638: SparkConnectException > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org