Hi everyone !
I'm currently working on a tool with *ES and Twitter Streaming API*, in
which I try to find interesting profiles on Twitter, based on what they
tweet, RT and which of their interactions are shared/RT.
Anyway, I use ES to index and search among tweets. To do that, I get
Twitter stream data and put *users & tweets (2 types) in a single index*,
linked by the user id via a parent-child relation. I thought a lot about
my indexing scheme, and I believe this is the best way to do it.
- I need to update users very often (because I score them and because they
update their profiles quite often), so nesting the user inside the tweet is
not an option (too much data duplication)
- I could put user's tweets directly in the user object but I would have
huge objects and I don't really want that.
I work on a SoYouStart server: 4c/4t 3.2GHz, 32GB RAM, 4TB HDD.
My settings for the index are :
settings = {
>
> "index" : {
>
> "number_of_replicas" : 0,
>
> "refresh_interval" : '10s',
>
> "routing.allocation.disable_allocation": False
>
> },
>
> "analysis": {
>
> "analyzer": {
>
> "snowFrench":{
>
> "type": "snowball",
>
> "language": "French"
>
> },
>
> "snowEnglish":{
>
> "type": "snowball",
>
> "language": "English"
>
> },
>
> "snowGerman":{
>
> "type": "snowball",
>
> "language": "German"
>
> },
>
> "snowRussian":{
>
> "type": "snowball",
>
> "language": "Russian"
>
> },
>
> "snowSpanish":{
>
> "type": "snowball",
>
> "language": "Spanish"
>
> },
>
> "snowJapanese":{
>
> "type": "snowball",
>
> "language": "Japanese"
>
> },
>
> "edgeNGramAnalyzer":{
>
> "tokenizer": "myEdgeNGram"
>
> },
>
> "name_analyzer": {
>
> "tokenizer": "whitespace",
>
> "type": "custom",
>
> "filter": ["lowercase", "multi_words", "name_filter"]
>
> },
>
> "city_analyzer" : {
>
> "type" : "snowball",
>
> "language" : "English"
>
> }
>
> },
>
> "tokenizer" : {
>
> "myEdgeNGram" : {
>
> "type" : "edgeNGram",
>
> "min_gram" : 2,
>
> "max_gram" : 5
>
> },
>
> "name_tokenizer": {
>
> "type": "edgeNGram",
>
> "max_gram": 100,
>
> "min_gram": 4
>
> }
>
> },
>
> "filter": {
>
> "multi_words": {
>
> "type": "shingle",
>
> "min_shingle_size": 2,
>
> "max_shingle_size": 10
>
> },
>
> "name_filter": {
>
> "type": "edgeNGram",
>
> "max_gram": 100,
>
> "min_gram": 4
>
> }
>
> }
>
> }
>
> }
>
>
And my mappings are :
> tweet_mapping = {
>
> "_all" : {
> "enabled" : False
> },
> "_ttl" : {
> "enabled" : True,
> "default" : "400d"
> },
> "_parent" : {
> "type" : 'user'
> },
> "properties": {
> "textfr": {
> 'type': 'string',
> '_analyzer': 'snowFrench',
> 'copy_to': 'text'
> },
> "texten": {
> 'type': 'string',
> '_analyzer': 'snowEnglish',
> 'copy_to': 'text'
> },
> "textde": {
> 'type': 'string',
> '_analyzer': 'snowGerman',
> 'copy_to': 'text'
> },
> "textja": {
> 'type': 'string',
> '_analyzer': 'snowJapanese',
> 'copy_to': 'text'
> },
> "textru": {
> 'type': 'string',
> '_analyzer': 'snowRussian',
> 'copy_to': 'text'
> },
> "textes": {
> 'type': 'string',
> '_analyzer': 'snowSpanish',
> 'copy_to': 'text'
> },
> "text": {
> 'type': 'string',
> 'null_value': '',
> 'index': 'analyzed',
> 'store': 'yes'
> },
> "entities": {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> 'properties': {
> "hashtags": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "mentions": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'long',
> 'precision_step': 64
> }
> }
> },
> "lang": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "created_at": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'date',
> 'format' : 'dd-MM-YYYY HH:mm:ss'
> }
> }
> }
> user_mapping = {
> "_all" : {
> "enabled" : False
> },
> "_ttl" : {
> "enabled" : True,
> "default" : "600d"
> },
> "properties": {
> "lang": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "name": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "screen_name": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> "_analyzer": "edgeNGramAnalyzer"
> },
> "descfr": {
> 'type': 'string',
> '_analyzer': 'snowFrench',
> 'copy_to': 'description'
> },
> "descen": {
> 'type': 'string',
> '_analyzer': 'snowEnglish',
> 'copy_to': 'description'
> },
> "descde": {
> 'type': 'string',
> '_analyzer': 'snowGerman',
> 'copy_to': 'description'
> },
> "descja": {
> 'type': 'string',
> '_analyzer': 'snowJapanese',
> 'copy_to': 'description'
> },
> "descru": {
> 'type': 'string',
> '_analyzer': 'snowRussian',
> 'copy_to': 'description'
> },
> "desces": {
> 'type': 'string',
> '_analyzer': 'snowSpanish',
> 'copy_to': 'description'
> },
> "description": {
> 'type': 'string',
> 'null_value': '',
> 'index': 'analyzed',
> 'store': 'yes'
> },
> "created_at": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'date',
> 'format' : 'dd-MM-YYYY HH:mm:ss'
> },
> "profile_image_url": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string'
> },
> "analysis": {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> 'properties': {
> "hashtags": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> },
> "relations": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> },
> "score": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object'
> }
> }
> },
> "location" : {
> 'type': 'object',
> 'index': 'analyzed',
> 'store': 'yes',
> "properties" : {
> "search_field": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'analyzer': 'city_analyzer',
> 'null_value': ''
> },
> "name": {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'null_value': ''
> },
> "city": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 3.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 3.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> },
> 'alternate_names': {
> 'boost': 2.0,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> }
> }
> },
> "admin2": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 1.5,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 1.5,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> }
> }
> },
> "admin1": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'boost': 1.2,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'full_name': {
> 'boost': 1.2,
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> }
> }
> },
> "country": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'object',
> 'properties': {
> 'name': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': ['location.search_field', 'location.name']
> },
> 'fips': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string',
> 'copy_to': 'location.search_field'
> },
> 'capital': {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'string'
> }
> }
> },
> "location": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'geo_point'
> },
> "population": {
> 'index': 'analyzed',
> 'store': 'yes',
> 'type': 'long'
> },
> 'capital': {
> 'index': 'not_analyzed',
> 'store': 'yes',
> 'type': 'boolean'
> }
> }
> }
> }
> }
>
>
Currently my cluster contains 60M docs (40M tweets, 20M users). I have only
one node, no replicas, because if I create another node, no data will go in
there... :/
When I index users, I geolocate them using the location string they put in
their profile (geolocation is actually present in only ~5% of tweets, so it's
not very useful). To do that, I indexed (in a separate index) the biggest
cities in the world, and I assign a city to each user.
Something else you should know: *I use Python and the PyES lib*.
*SO. Let's talk about the problem :*
My goal is to sort users by the relevance of their tweets. To do that, I
analyze each user's profile, their timeline, and the tweets in which they are
mentioned (RTs, messages).
So what happens in my script ?
I have a REST API based on Django REST Framework and a frontend with
AngularJS
1/ I type a keyword (for ex : java, python, nodejs) and a location (not
required, for ex: paris)
2/ I use count API to find every user that speaks about "java" in "paris"
3/ Then I get the 20 first results of this query.
4/ I do a multi_search query to get users' timeline and mentions
5/ I score them
6/ When they're scored, AngularJS displays the sorted results and sends
another request to the API to score page 2, and so on until there are no
more pages available.
The queries I do are :
1/ To get users :
> {
>
> 'query': {
>
> 'bool': {
>
> 'should': [
>
> {
>
> 'multi_match': {
>
> 'use_dis_max': True,
>
> 'query': 'java',
>
> 'type': 'boolean',
>
> 'operator': 'or',
>
> 'fields': [
>
> 'name',
>
> 'screen_name',
>
> 'description'
>
> ]
>
> }
>
> },
>
> {
>
> 'has_child': {
>
> 'query': {
>
> 'match': {
>
> 'text': {
>
> 'operator': 'or',
>
> 'query': 'java',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'type': 'tweet'
>
> }
>
> }
>
> ],
>
> 'minimum_number_should_match': 1,
>
> 'must': [
>
> {
>
> 'function_score': {
>
> 'query': {
>
> 'match': {
>
> 'location.search_field': {
>
> 'operator': 'or',
>
> 'query': 'paris',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'functions': [
>
> {
>
> 'script_score': {
>
> 'script': "_score *
>> (doc['capital'].value == 'T' ? 2 : 1)"
>
> }
>
> },
>
> {
>
> 'script_score': {
>
> 'script': "_score *
>> doc['search_field'].values.size()"
>
> }
>
> }
>
> ]
>
> }
>
> }
>
> ]
>
> }
>
> },
>
> 'from': 20,
>
> 'size': 20
>
> }
>
>
2/ To get timelines and mentions:
{
> 'query': {
> 'match': {
> 'entities.mentions': {
> 'operator': 'or',
> 'query': 'userID',
> 'type': 'boolean'
> }
> }
> },
> '_source': True
> }
>
and
{
>
> 'query': {
>
> 'has_parent': {
>
> 'query': {
>
> 'match': {
>
> 'id': {
>
> 'operator': 'or',
>
> 'query': 'userID',
>
> 'type': 'boolean'
>
> }
>
> }
>
> },
>
> 'type': 'user'
>
> }
>
> },
>
> '_source': True
>
> }
>
>
BUT. Scoring one page can take anywhere from a few seconds to several
minutes!!! I don't think that's normal, right? I profiled my script and
here's the verdict: the ES requests take far too long. Usually it's something
like 10-20 sec (which is still too long), but sometimes it can take up to
90 sec...
I have studied ES quite thoroughly, and I think I understand many things, but
here I don't know what I can do to change that... Any ideas? Thanks!
--
You received this message because you are subscribed to the Google Groups
"elasticsearch" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/elasticsearch/5342d476-eed6-40f9-9fa9-93dde23371b2%40googlegroups.com.
For more options, visit https://groups.google.com/d/optout.