Yes, it's a bug, please file a JIRA. On Sun, May 3, 2015 at 10:36 AM, Ali Bajwa <ali.ba...@gmail.com> wrote: > Friendly reminder on this one. Just wanted to get a confirmation that this > is not by design before I logged a JIRA > > Thanks! > Ali > > > On Tue, Apr 28, 2015 at 9:53 AM, Ali Bajwa <ali.ba...@gmail.com> wrote: >> >> Hi experts, >> >> Trying to use the "slicing" functionality in strings as part of a Spark >> program (PySpark) I get this error: >> >> **** Code **** >> >> import pandas as pd >> from pyspark.sql import SQLContext >> hc = SQLContext(sc) >> A = pd.DataFrame({'Firstname': ['James', 'Ali', 'Daniel'], 'Lastname': >> ['Jones', 'Bajwa', 'Day']}) >> a = hc.createDataFrame(A) >> print A >> >> b = a.select(a.Firstname[:2]) >> print b.toPandas() >> c = a.select(a.Lastname[2:]) >> print c.toPandas() >> >> Output: >> >> Firstname Lastname >> 0 James Jones >> 1 Ali Bajwa >> 2 Daniel Day >> SUBSTR(Firstname, 0, 2) >> 0 Ja >> 1 Al >> 2 Da >> >> >> --------------------------------------------------------------------------- >> Py4JError Traceback (most recent call >> last) >> <ipython-input-17-6ee5d7d069ce> in <module>() >> 10 b = a.select(a.Firstname[:2]) >> 11 print b.toPandas() >> ---> 12 c = a.select(a.Lastname[2:]) >> 13 print c.toPandas() >> >> /home/jupyter/spark-1.3.1/python/pyspark/sql/dataframe.pyc in substr(self, >> startPos, length) >> 1089 raise TypeError("Can not mix the type") >> 1090 if isinstance(startPos, (int, long)): >> -> 1091 jc = self._jc.substr(startPos, length) >> 1092 elif isinstance(startPos, Column): >> 1093 jc = self._jc.substr(startPos._jc, length._jc) >> >> >> /home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py >> in __call__(self, *args) >> 536 answer = self.gateway_client.send_command(command) >> 537 return_value = get_return_value(answer, >> self.gateway_client, >> --> 538 self.target_id, self.name) >> 539 >> 540 for temp_arg in temp_args: >> >> 
/home/jupyter/spark-1.3.1/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py >> in get_return_value(answer, gateway_client, target_id, name) >> 302 raise Py4JError( >> 303 'An error occurred while calling {0}{1}{2}. >> Trace:\n{3}\n'. >> --> 304 format(target_id, '.', name, value)) >> 305 else: >> 306 raise Py4JError( >> >> Py4JError: An error occurred while calling o1887.substr. Trace: >> py4j.Py4JException: Method substr([class java.lang.Integer, class >> java.lang.Long]) does not exist >> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:333) >> at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:342) >> at py4j.Gateway.invoke(Gateway.java:252) >> at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133) >> at py4j.commands.CallCommand.execute(CallCommand.java:79) >> at py4j.GatewayConnection.run(GatewayConnection.java:207) >> at java.lang.Thread.run(Thread.java:745) >> >> Looks like X[:2] works but X[2:] fails with the error above. >> Anyone else have this issue? >> >> Clearly I can use substr() to work around this, but if this is a confirmed >> bug we should open a JIRA. >> >> Thanks, >> Ali > >
--------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org