[
https://issues.apache.org/jira/browse/ASTERIXDB-1879?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Jianfeng Jia updated ASTERIXDB-1879:
------------------------------------
Description:
It's difficult for me to find the exact code, but here is the symptom.
Previously, if we ran a query with a highly selective time range (on the
filter field) like the following:
{code}
count(for $d in dataset twitter.ds_tweet
where $d.'create_at' >= datetime('2017-04-02T16:23:13.333Z') and
$d.'create_at' < datetime('2017-04-03T16:23:13.333Z')
return $d)
{code}
The running time was fast. Now the running time is the same as a scan query.
Here is the DDL:
{code}
drop dataverse twitter if exists;
create dataverse twitter if not exists;
use dataverse twitter
create type typeUser if not exists as open {
id: int64,
name: string,
screen_name : string,
lang : string,
location: string,
create_at: date,
description: string,
followers_count: int32,
friends_count: int32,
statues_count: int64
}
create type typePlace if not exists as open{
country : string,
country_code : string,
full_name : string,
id : string,
name : string,
place_type : string,
bounding_box : rectangle
}
create type typeGeoTag if not exists as open {
stateID: int32,
stateName: string,
countyID: int32,
countyName: string,
cityID: int32?,
cityName: string?
}
create type typeTweet if not exists as open{
create_at : datetime,
id: int64,
"text": string,
in_reply_to_status : int64,
in_reply_to_user : int64,
favorite_count : int64,
coordinate: point?,
retweet_count : int64,
lang : string,
is_retweet: boolean,
hashtags : {{ string }} ?,
user_mentions : {{ int64 }} ? ,
user : typeUser,
place : typePlace?,
geo_tag: typeGeoTag
}
create dataset ds_tweet(typeTweet) if not exists primary key id using
compaction policy prefix
(("max-mergable-component-size"="134217728"),("max-tolerance-component-count"="10")
) with filter on create_at ;
create index text_idx if not exists on ds_tweet("text") type keyword;
{code}
The optimized logical plan is exactly the same as before. I wonder whether
the problem lies in the implementation.
was:
It's difficult for me to find the exact code, but here is the symptom.
Previously, if we ran a query with a highly selective time range (on the
filter field) like the following:
{code}
count(for $d in dataset twitter.ds_tweet
where $d.'create_at' >= datetime('2017-04-02T16:23:13.333Z') and
$d.'create_at' < datetime('2017-04-03T16:23:13.333Z')
return $d)
{code}
The running time was fast. Now the running time is the same as a scan query.
Here is the DDL:
{code}
drop dataverse twitter if exists;
create dataverse twitter if not exists;
use dataverse twitter
create type typeUser if not exists as open {
id: int64,
name: string,
screen_name : string,
lang : string,
location: string,
create_at: date,
description: string,
followers_count: int32,
friends_count: int32,
statues_count: int64
}
create type typePlace if not exists as open{
country : string,
country_code : string,
full_name : string,
id : string,
name : string,
place_type : string,
bounding_box : rectangle
}
create type typeGeoTag if not exists as open {
stateID: int32,
stateName: string,
countyID: int32,
countyName: string,
cityID: int32?,
cityName: string?
}
create type typeTweet if not exists as open{
create_at : datetime,
id: int64,
"text": string,
in_reply_to_status : int64,
in_reply_to_user : int64,
favorite_count : int64,
coordinate: point?,
retweet_count : int64,
lang : string,
is_retweet: boolean,
hashtags : {{ string }} ?,
user_mentions : {{ int64 }} ? ,
user : typeUser,
place : typePlace?,
geo_tag: typeGeoTag
}
create dataset ds_tweet(typeTweet) if not exists primary key id using
compaction policy prefix
(("max-mergable-component-size"="134217728"),("max-tolerance-component-count"="10")
) with filter on create_at ;
create index text_idx if not exists on ds_tweet("text") type keyword;
{code}
> Filter doesn't filter out components
> ------------------------------------
>
> Key: ASTERIXDB-1879
> URL: https://issues.apache.org/jira/browse/ASTERIXDB-1879
> Project: Apache AsterixDB
> Issue Type: Bug
> Components: Storage
> Environment: master
> "git.commit.id": "342444fcfed850b4078c9cd46e9a12d7f875867d"
> Reporter: Jianfeng Jia
> Assignee: Abdullah Alamoudi
>
> It's difficult for me to find the exact code, but here is the symptom.
> Previously, if we ran a query with a highly selective time range (on the
> filter field) like the following:
> {code}
> count(for $d in dataset twitter.ds_tweet
> where $d.'create_at' >= datetime('2017-04-02T16:23:13.333Z') and
> $d.'create_at' < datetime('2017-04-03T16:23:13.333Z')
> return $d)
> {code}
> The running time was fast. Now the running time is the same as a scan query.
> Here is the DDL:
> {code}
> drop dataverse twitter if exists;
> create dataverse twitter if not exists;
> use dataverse twitter
> create type typeUser if not exists as open {
> id: int64,
> name: string,
> screen_name : string,
> lang : string,
> location: string,
> create_at: date,
> description: string,
> followers_count: int32,
> friends_count: int32,
> statues_count: int64
> }
> create type typePlace if not exists as open{
> country : string,
> country_code : string,
> full_name : string,
> id : string,
> name : string,
> place_type : string,
> bounding_box : rectangle
> }
> create type typeGeoTag if not exists as open {
> stateID: int32,
> stateName: string,
> countyID: int32,
> countyName: string,
> cityID: int32?,
> cityName: string?
> }
> create type typeTweet if not exists as open{
> create_at : datetime,
> id: int64,
> "text": string,
> in_reply_to_status : int64,
> in_reply_to_user : int64,
> favorite_count : int64,
> coordinate: point?,
> retweet_count : int64,
> lang : string,
> is_retweet: boolean,
> hashtags : {{ string }} ?,
> user_mentions : {{ int64 }} ? ,
> user : typeUser,
> place : typePlace?,
> geo_tag: typeGeoTag
> }
> create dataset ds_tweet(typeTweet) if not exists primary key id using
> compaction policy prefix
> (("max-mergable-component-size"="134217728"),("max-tolerance-component-count"="10")
> ) with filter on create_at ;
> create index text_idx if not exists on ds_tweet("text") type keyword;
> {code}
> The optimized logical plan is exactly the same as before. I wonder whether
> the problem lies in the implementation.
--
This message was sent by Atlassian JIRA
(v6.3.15#6346)