#!/usr/bin/python

import os
import gzip
import json
from unidecode import unidecode
import string

####  File stems and names
out_prefix = "/mnt/TwitterOutput/Doug/"
in_prefix = "/mnt/TwitterData2/RTI-Collected-Twitter-Data/"
# Stop collecting once this many geolocated keyword tweets have been found.
num_extracted_to_dump = 200
percent_geoloc = 0.0

#### Get list of all input files, keeping only the gzipped ones.
# endswith(".gz") tests the actual extension; the old find("gz") > 0 test
# also matched any name merely containing "gz" anywhere after position 0.
files = [a for a in os.listdir(in_prefix) if a.endswith(".gz")]

# Define a line slice if you only want a few files (testing)
max_files = 100000000000
files_slice = files[0:max_files]

## Initialize counters
# geolocated tweet count
geoloc_tweet_count = 0
# Total tweets examined
tot_tweets = 0
# tweets-to-dump counter (reset after every dump)
tdc = 0
# number of dumps of geoloc tweets made
num_dumped = 0
# counts per file
fc = 0
file_counts = {}
line_error = False

# keywords to search for in text, any one of the below qualifies a tweet
keywords = ["influenza", "flu", "cold", "fever", "H1N1", "H3N2", "sneezing", "aching", "ache", "achy", "congested"]


#  Tweets to dump holder
tweets = []

# dates to dump
dateTime = []

# locations to dump
locations = []

# text to dump
tweetText = []

# Name of the first JSON output file.  The handle itself is opened at dump
# time inside the main loop; the old code opened it here too, which created
# an empty stray file and leaked the handle without ever writing to it.
json_dump_name = out_prefix + "Geoloc_extracted"+str(num_dumped)+".json"

###  For each file of tweets    
for fname in files_slice:
    if geoloc_tweet_count >= num_extracted_to_dump:
        break
    fc += 1
    print "Working on file: "+fname+" number: "+str(fc)
    f = gzip.open(in_prefix+fname,"rb")
    raw_lines = f.readlines()
    try:
        raw_lines.remove("\n")
    except:
        print "No empty line to remove in "+fname
        
    print " ... processing "+str(len(raw_lines)) +" lines"

    lc = 0
    for r in raw_lines:
        line_error = False
        if geoloc_tweet_count>=num_extracted_to_dump:
            break
        lc+=1
        try:
            tweet = json.loads(r)
        except:
            line_error = True
            print "Error in file "+fname+" on line "+str(lc)

        tot_tweets+=1
        if (not line_error) and tweet.has_key("delete"):
            found = False
        elif not line_error \
                and  (tweet.has_key("geo") and tweet["geo"] != None) \
                or (tweet.has_key("coordinates") and tweet["coordinates"] != None):
            if tweet.has_key("text"):
                for word in tweet["text"].split():
                    didFind = False
                    for find in keywords:
                        if find.upper() == word.upper():
                            didFind = True
                        if didFind is True:
                            locations.append(tweet["coordinates"]["coordinates"])
                            ##json.dump(tweet,out)
                            tweetText.append(tweet["text"])
                            tweets.append(tweet)
                            dateTime.append(tweet["created_at"])
                            geoloc_tweet_count+= 1
                            tdc+= 1
                            fc += 1
                            percent_geoloc = 100.0 * geoloc_tweet_count / tot_tweets
                            break
                            
        if lc % 50000 == 1:
            print "Tweets dump counter (tdc): ", tdc
            print "Total geoloc with keywords tweets dumped: ", geoloc_tweet_count
            print "Percent of total tweets with valid geolocation data with keywords:",  percent_geoloc
    
    ####  Report count found per file
    file_counts[fname] = fc
    print "Found "+str(fc)+" geolocated tweets in "+fname
    
    ####  Dump tweets once a certain size
    if tdc >= num_extracted_to_dump  or fname == files_slice[len(files_slice)-1]:
        ##out.close()
        print "**********Writing "+str(tdc)+" tweets to "+json_dump_name+ " ****************"

        ###  Dump json version of the tweets
        json_dump_name = out_prefix + "Geoloc_extracted_times_flu_SVS"+str(num_dumped)+".json"
        out = open(json_dump_name,"w")
        for tweetout in tweets:
            json.dump(tweetout,out)
        out.close()

        tdc = 0
        del tweets
        tweets=[]

        #writing out the dateTimes
        out_dateTime=open( out_prefix +"dateTime_test2.csv","w")
        for line in dateTime:
            out_line = line +"\n"
            out_dateTime.write(out_line)
        out_dateTime.close()
        
        #writing out the locations
        out_locations=open( out_prefix +"locations_test2.csv","w")
        for line in locations:
            for ll in line:
                out_line = str(ll) +"\n"
                out_locations.write(out_line)
        out_locations.close()
        
        #writing out the text
        out_tweetText=open( out_prefix +"text_test2.txt","w" )
        for line in tweetText:
            #print "Line: ",line
            filter(lambda x: x in string.printable, line)
            line1 = unidecode (line)
            line2 = line1.replace('"',' ')
            output_line = "\"" + str(line2) + "\"" + "\n"
            #print "Output_line: ",output_line
            out_tweetText.write(output_line)
        out_tweetText.close()

        num_dumped += 1

        
print "Found "+str(geoloc_tweet_count)+" geolocated tweets in "+str(fc)+ " files"
print "Total number of tweets examined:", tot_tweets
percent_geoloc = 100.0 * geoloc_tweet_count / tot_tweets
print "Percent of total tweets with valid geolocation data:",  percent_geoloc
