Chen Luo created ASTERIXDB-1874:
-----------------------------------
Summary: ArrayIndexOutOfBoundsException when joining a dataset
after groupby
Key: ASTERIXDB-1874
URL: https://issues.apache.org/jira/browse/ASTERIXDB-1874
Project: Apache AsterixDB
Issue Type: Bug
Components: Hyracks
Reporter: Chen Luo
Priority: Minor
Basically, I have two datasets, ds_tweet and US_population, and I performed a
left outer join after a group by using SQL++. Executing the query throws an
ArrayIndexOutOfBoundsException.
The detailed stacktrace is as follows:
{code}
java.lang.ArrayIndexOutOfBoundsException: 2
org.apache.hyracks.api.exceptions.HyracksDataException:
java.lang.ArrayIndexOutOfBoundsException: 2
at
org.apache.hyracks.api.exceptions.HyracksDataException.create(HyracksDataException.java:50)
at
org.apache.hyracks.control.common.utils.ExceptionUtils.setNodeIds(ExceptionUtils.java:62)
at org.apache.hyracks.control.nc.Task.run(Task.java:330)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ArrayIndexOutOfBoundsException: 2
at
org.apache.asterix.builders.RecordBuilder.addField(RecordBuilder.java:166)
at
org.apache.asterix.runtime.evaluators.constructors.OpenRecordConstructorDescriptor$2$1.evaluate(OpenRecordConstructorDescriptor.java:103)
at
org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory$1.produceTuple(AssignRuntimeFactory.java:168)
at
org.apache.hyracks.algebricks.runtime.operators.std.AssignRuntimeFactory$1.nextFrame(AssignRuntimeFactory.java:137)
at
org.apache.hyracks.algebricks.runtime.operators.meta.AlgebricksMetaOperatorDescriptor$2.nextFrame(AlgebricksMetaOperatorDescriptor.java:134)
at
org.apache.hyracks.dataflow.common.comm.io.AbstractFrameAppender.write(AbstractFrameAppender.java:92)
at
org.apache.hyracks.dataflow.std.join.InMemoryHashJoin.completeJoin(InMemoryHashJoin.java:200)
at
org.apache.hyracks.dataflow.std.join.OptimizedHybridHashJoin.completeProbe(OptimizedHybridHashJoin.java:551)
at
org.apache.hyracks.dataflow.std.join.OptimizedHybridHashJoinOperatorDescriptor$ProbeAndJoinActivityNode$1.close(OptimizedHybridHashJoinOperatorDescriptor.java:429)
at org.apache.hyracks.control.nc.Task.pushFrames(Task.java:367)
at org.apache.hyracks.control.nc.Task.run(Task.java:308)
... 3 more
{code}
Steps to reproduce:
1. Prepare the datasets using the following statements (SQL++):
{code}
create type typeTweet if not exists as open{
create_at : datetime,
id: int64,
`text`: string,
in_reply_to_status : int64,
in_reply_to_user : int64,
favorite_count : int64,
coordinate: point?,
retweet_count : int64,
lang : string,
is_retweet: boolean,
hashtags : {{ string }} ?,
user_mentions : {{ int64 }} ?
}
create dataset ds_tweet(typeTweet) if not exists primary key id ;
insert into ds_tweet(
{ "create_at": datetime("2016-10-05T10:00:00.000Z"), "id":
783713358558081024, "text": "@Nycee_Otf oh so you're saying you're a lakers fan
right?", "in_reply_to_status": 783713068996067328, "in_reply_to_user":
560583246, "favorite_count": 0, "retweet_count": 0, "lang": "en", "is_retweet":
false, "user_mentions": {{ 560583246 }}, "user": { "id": 4004894956, "name":
"Dalton Sterling", "screen_name": "DNS_2232", "lang": "en", "location":
"Vineland, NJ", "create_at": date("2015-10-20"), "description": "Rowan
University CO 2016, Law and Justice Major Soc Minor , #CowboysNation 3x Vba
champ, all green lights on this road to success.", "followers_count": 311,
"friends_count": 285, "statues_count": 10980 }, "place": { "country": "United
States", "country_code": "United States", "full_name": "Vineland, NJ", "id":
"ecc2e1285c7d074f", "name": "Vineland", "place_type": "city", "bounding_box":
rectangle("-75.076284,39.401507 -74.945245,39.568715") }, "geo_tag": {
"stateID": 34, "stateName": "New Jersey", "countyID": 34011, "countyName":
"Cumberland", "cityID": 3476070, "cityName": "Vineland" } }
)
create type typePopulation if not exists as open {
id: int64,
create_at: date,
stateID:int64,
population:int64
}
create dataset US_population(typePopulation) if not exists primary key id;
{code}
2. Execute the following query (SQL++):
{code}
select t1.state, t1.count, l0.state
from (select state, coll_count(g) as `count`
from twitter.ds_tweet t
group by t.geo_tag.stateID as `state` group as g) t1
left outer join twitter.US_population l0 on t1.state = l0. state;
{code}
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)