Hi everyone !
I'm currently working on a tool with *ES and Twitter Streaming API*, in
which I try to find interesting profiles on Twitter, based on what they
tweet, RT and which of their interactions are shared/RT.
Anyway, I use ES to index and search among tweets. To do that, I get
Twitter stream data and put *users & tweets (2 types) in a single index*,
linked by the user id via a parent-child relation. I thought a lot about
my indexing scheme, and I believe this is the best way to do it.
- I need to update users very often (because I score them and because they
update their profiles quite often), so nesting the user inside the tweet is
not an option (too much data duplication)
- I could put user's tweets directly in the user object but I would have
huge objects and I don't really want that.
I work on a SoYouStart server: 4c/4t 3.2GHz, 32GB RAM, 4TB HDD.
My settings for the index are :
settings = {
>
> "index" : {
>
> "number_of_replicas" : 0,
>
> "refresh_interval" : '10s',
>
> "routing.allocation.disable_allocation": False
>
> },
>
> "analysis": {
>
> "analyzer": {
>
> "snowFrench":{
>
> "type": "snowball",
>
> "language": "French"
>
> },
>
> "snowEnglish":{
>
> "type": "snowball",
>
> "language": "English"
>
> },
>
> "snowGerman":{
>
> "type": "snowball",
>
> "language": "German"
>
> },
>
> "snowRussian":{
>
> "type": "snowball",
>
> "language": "Russian"
>
> },
>
> "snowSpanish":{
>
> "type": "snowball",
>
> "language": "Spanish"
>
> },
>
> "snowJapanese":{
>
> "type": "snowball",
>
> "language": "Japanese"
>
> },
>
> "edgeNGramAnalyzer":{
>
> "tokenizer": "myEdgeNGram"
>
> },
>
> "name_analyzer": {
>
> "tokenizer": "whitespace",
>
> "type": "custom",
>
> "filter": ["lowercase", "multi_words", "name_filter"]
>
> },
>
> "city_analyzer" : {
>
> "type" : "snowball",
>
> "language" : "English"
>
> }
>
> },
>
> "tokenizer" : {
>
> "myEdgeNGram" : {
>
> "type" : "edgeNGram",
>
> "min_gram" : 2,
>
> "max_gram" : 5
>
> },
>
> "name_tokenizer": {
>
> "type": "edgeNGram",
>
> "max_gram": 100,
>
> "min_gram": 4
>
> }
>
> },
>
> "filter": {
>
> "multi_words": {
>
> "type": "shingle",
>
> "min_shingle_size": 2,
>
> "max_shingle_size": 10
>
> },
>
> "name_filter": {
>
> "type": "edgeNGram",
>
> "max_gram": 100,
>
> "min_gram": 4
>
> }
>
> }
>
> }
>
> }
>
>
And my mappings are :
> tweet_mapping = {
>
> "_all" : {
> "enabled" : False
> },
> "_ttl" : {
> "enabled" : True,
> "default" : "400d"
> },
> "_parent" : {
> "type" : 'user'
> },
> "properties": {
> "textfr": {
> 'type': 'string',
> '_analyzer': 'snowFrench',
> 'copy_to': 'text'
> },
> "texten": {
> 'type': 'string',
> '_analyzer': 'snowEnglish',
> 'copy_to': 'text'
> },
> "textde": {
> 'type': 'string',
> '_analyzer': 'snowGerman',
> 'copy_to': 'text'
> },
> "textja": {
> 'type': 'string',
> '_analyzer': 'snowJapanese',
> 'copy_to': 'text'
> },
> "textru": {
> 'type': 'string',
> '_analyzer': 'snowRussian',
> 'copy_to': 'text'
> },
> "textes": {
> 'type': 'string',
> '_analyzer': 'snowSpanish',
> 'copy_to': 'text'
> },
> "text": {
> 'type': 'string',
> 'null_value': '',
> 'index': 'analyzed',
> 'store': 'yes'
> },
> "entities": {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> 'properties': {
> "hashtags": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "mentions": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'long',
> 'precision_step': 64
> }
> }
> },
> "lang": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "created_at": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'date',
> 'format' : 'dd-MM-YYYY HH:mm:ss'
> }
> }
> }
> user_mapping = {
> "_all" : {
> "enabled" : False
> },
> "_ttl" : {
> "enabled" : True,
> "default" : "600d"
> },
> "properties": {
> "lang": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "name": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "screen_name": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "descfr": {
> 'type': 'string',
> '_analyzer': 'snowFrench',
> 'copy_to': 'description'
> },
> "descen": {
> 'type': 'string',
> '_analyzer': 'snowEnglish',
> 'copy_to': 'description'
> },
> "descde": {
> 'type': 'string',
> '_analyzer': 'snowGerman',
> 'copy_to': 'description'
> },
> "descja": {
> 'type': 'string',
> '_analyzer': 'snowJapanese',
> 'copy_to': 'description'
> },
> "descru": {
> 'type': 'string',
> '_analyzer': 'snowRussian',
> 'copy_to': 'description'
> },
> "desces": {
> 'type': 'string',
> '_analyzer': 'snowSpanish',
> 'copy_to': 'description'
> },
> "description": {
> 'type': 'string',
> 'null_value': '',
> 'index': 'analyzed',
> 'store': 'yes'
> },
> "created_at": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'date',
> 'format' : 'dd-MM-YYYY HH:mm:ss'
> },
> "profile_image_url": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "analysis": {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> 'properties': {
> "hashtags": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> },
> "relations": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> },
> "score": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> }
> }
> },
> "location" : {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> "properties" : {
> "search_field": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'analyzer': 'city_analyzer',
> 'null_value': ''
> },
> "name": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'null_value': ''
> },
> "city": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 3.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 3.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> },
> 'alternate_names': {
> 'boost': 2.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> }
> }
> },
> "admin2": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 1.5,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 1.5,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> }
> }
> },
> "admin1": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 1.2,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 1.2,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> }
> }
> },
> "country": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> },
> 'fips': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'capital': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string'
> }
> }
> },
> "location": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'geo_point'
> },
> "population": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'long'
> },
> 'capital': {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'boolean'
> }
> }
> }
> }
> }
>
>
Currently my cluster contains 60M docs (40M tweets, 20M users). I have only
one node, no replicas, because if I create another node, no data will go in
there... :/
When I index users, I geolocate them using the location string they put in
their profile (geolocation is actually present in only ~5% of tweets, so it's
not very useful). To do that, I indexed (in a separate index) the biggest
cities in the world, and I assign a city to each user.
Something else you should know: *I use Python and the PyES lib*.
*SO. Let's talk about the problem :*
My goal is to sort users by the relevance of their tweets. To do that, I
analyze each user's profile, their timeline, and the tweets in which they are
mentioned (RTs, messages).
So what happens in my script ?
I have a REST API based on Django REST Framework and a frontend with
AngularJS
1/ I type a keyword (for ex : java, python, nodejs) and a location (not
required, for ex: paris)
2/ I use count API to find every user that speaks about "java" in "paris"
3/ Then I get the 20 first results of this query.
4/ I do a multi_search query to get users' timeline and mentions
5/ I score them
6/ When they're scored, AngularJS displays the sorted results and sends
another request to the API to score page 2, and so on until there are no
more pages available.
The queries I do are :
1/ To get users :
> {
>
> 'query': {
>
> 'bool': {
>
> 'should': [
>
> {
>
> 'multi_match': {
>
> 'use_dis_max': True,
>
> 'query': 'java',
>
> 'type': 'boolean',
>
> 'operator': 'or',
>
> 'fields': [
>
> 'name',
>
> 'screen_name',
>
> 'description'
>
> ]
>
> }
>
> },
>
> {
>
> 'has_child': {
>
> 'query': {
>
> 'match': {
>
> 'text': {
>
> 'operator': 'or',
>
> 'query': 'java',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'type': 'tweet'
>
> }
>
> }
>
> ],
>
> 'minimum_number_should_match': 1,
>
> 'must': [
>
> {
>
> 'function_score': {
>
> 'query': {
>
> 'match': {
>
> 'location.search_field': {
>
> 'operator': 'or',
>
> 'query': 'paris',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'functions': [
>
> {
>
> 'script_score': {
>
> 'script': "_score *
>> (doc['capital'].value == 'T' ? 2 : 1)"
>
> }
>
> },
>
> {
>
> 'script_score': {
>
> 'script': "_score *
>> doc['search_field'].values.size()"
>
> }
>
> }
>
> ]
>
> }
>
> }
>
> ]
>
> }
>
> },
>
> 'from': 20,
>
> 'size': 20
>
> }
>
>
2/ To get timelines and mentions:
{
> 'query': {
> 'match': {
> 'entities.mentions': {
> 'operator': 'or',
> 'query': 'userID',
> 'type': 'boolean'
> }
> }
> },
> '_source': True
> }
>
and
{
>
> 'query': {
>
> 'has_parent': {
>
> 'query': {
>
> 'match': {
>
> 'id': {
>
> 'operator': 'or',
>
> 'query': 'userID',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'type': 'user'
>
> }
>
> },
>
> '_source': True
>
> }
>
>
BUT. Scoring one page can take anywhere from a few seconds to several
minutes!!! I don't think that's normal, right? I profiled my script and
here's the verdict: the ES requests take far too long. Usually it's something
like 10-20 sec (which is still too long), but sometimes it can take up to
90 sec...
I have studied ES quite thoroughly, and I think I understand many things, but
here I don't know what I can do to change that... Any ideas? Thanks!
--
You received this message because you are subscribed to the Google Groups
"elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/elasticsearch/5342d476-eed6-40f9-9fa9-93dde23371b2%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.