# Paste DVX8uMqw · 6 years ago · Nov 13, 2019, 11:30 PM
import json
import os
import pickle
import sys

import spacy
import tweepy

from classifier import *
8
# Twitter application credentials.
# NOTE(review): these literals are committed secrets and should be rotated and
# removed from source. Until then they are kept only as fallbacks so existing
# deployments keep working; set TWITTER_APP_KEY / TWITTER_APP_SECRET in the
# environment to override them.
app_key = os.environ.get('TWITTER_APP_KEY', 'qMFwUUdr9vmKVnFF8i2EeMYrZ')
app_secret = os.environ.get(
    'TWITTER_APP_SECRET',
    'qtiGPNnzf0GcteCCfmHCsGdlEDOk0XZUX6p04hom0ouSeJYJX7',
)

# Application-only authentication handler built from the key/secret pair.
auth = tweepy.AppAuthHandler(app_key, app_secret)

# Shared API client; the two flags ask tweepy to wait (and report it) when the
# rate limit is hit.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if not api:
    print('paila el api')

# Sentiment model (from the project's ``classifier`` module) and the Spanish
# spaCy pipeline, both loaded once at import time and reused by the functions
# below.
clf = SentimentClassifier()
nlp = spacy.load('es_core_news_sm')
24
def fun_keyw(text, di):
    """Add the content words of *text* to the frequency dict *di*.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is counted only when its string form is longer than one character, it is
    not a stop word, and it is purely alphabetic.

    Args:
        text: The text to tokenize.
        di: Mapping of word -> count; updated in place.

    Returns:
        The same ``di`` object, after the update.
    """
    for token in nlp(text):
        word = str(token)
        # Skip one-character tokens, stop words and anything non-alphabetic.
        if len(word) <= 1 or token.is_stop or not token.is_alpha:
            continue
        di[word] = di.get(word, 0) + 1
    return di
34
35
36
37
def _summarize_polarities(keywords_count):
    """Turn ``{keyword: [polarity, ...]}`` into one summary dict per keyword."""
    summaries = []
    for keyword, polarities in keywords_count.items():
        n = len(polarities)
        # A polarity strictly above 0.5 counts as positive, anything else as
        # negative.
        positive = sum(1 for polarity in polarities if polarity > 0.5)
        total = sum(polarities)
        summaries.append({
            "name": keyword,
            "positive": positive / n if n else 0,
            "negative": (n - positive) / n if n else 0,
            "average": total / n if n else 0,
        })
    return summaries


def get_tweets(query, geocode="4.115350699646785,-72.584710852785,150000km", langu='es', max_tweets=100):
    """Search for tweets matching *query* and score them for sentiment.

    Results are paged backwards with ``max_id`` until ``max_tweets`` have been
    collected, the search returns nothing, or tweepy raises ``TweepError``
    (which is printed and ends the paging). Whatever was collected up to that
    point is always returned.

    Args:
        query: Keywords separated by ``;``. Each one is quoted and the terms
            are joined with ``OR``; retweets are excluded.
        geocode: Accepted for interface compatibility but not sent to the
            search call.
        langu: Language code passed as the search ``lang`` parameter.
        max_tweets: Upper bound on the number of tweets to download.

    Returns:
        A 3-tuple ``(tweets, keyword_stats, word_cloud)``:
        ``tweets`` is a list of dicts (text, polaridad, user fields, counts)
        sorted by ``retweet_count`` descending; ``keyword_stats`` has one
        dict per keyword with ``name``, ``positive``, ``negative`` and
        ``average``; ``word_cloud`` is a list of ``{'text', 'value'}`` dicts
        with the word frequencies gathered by ``fun_keyw``.
    """
    print("por aca entro")
    # The processed query parameter: every keyword quoted, OR-ed together.
    search_query = ' OR '.join('"{}"'.format(x) for x in query.split(';'))

    print(search_query)

    tweets_per_query = 100  # Twitter does not allow more per request
    since_id = None  # never set; only echoed in the progress message
    max_id = -1  # id of the oldest tweet seen so far; <= 0 means "first page"
    tweet_count = 0
    total_tweets = []
    word_cloud = {}

    keywords = query.split(";")
    keywords_count = {keyword: [] for keyword in keywords}

    while tweet_count < max_tweets:
        search_kwargs = {
            "q": search_query + ' -filter:retweets',
            # Never ask for more than what is still missing.
            "count": min(tweets_per_query, max_tweets - tweet_count),
            "lang": langu,
            "tweet_mode": "extended",
        }
        if max_id > 0:
            # Continue strictly below the oldest tweet already downloaded.
            search_kwargs["max_id"] = str(max_id - 1)

        try:
            new_tweets = api.search(**search_kwargs)
        except tweepy.TweepError as e:
            print('err:' + str(e))
            break

        if not new_tweets:
            print('no more tweets found')
            break

        for status in new_tweets:
            n_tweet = status._json
            full_text = n_tweet['full_text']
            user = n_tweet['user']

            # The word cloud is built from the upper-cased text.
            fun_keyw(full_text.upper(), word_cloud)

            # presumably a score in [0, 1]; verify against classifier
            polaridad = clf.predict(full_text)

            # NOTE(review): this match is case-sensitive while the search
            # itself may not be - confirm whether that is intended.
            for keyword in keywords:
                if keyword in full_text:
                    keywords_count[keyword].append(polaridad)

            total_tweets.append({
                "text": full_text,
                "polaridad": polaridad,
                "username": user['screen_name'],
                "name": user['name'],
                "location": user['location'],
                "image": user['profile_image_url_https'],
                "retweet_count": n_tweet['retweet_count'],
                "favorite_count": n_tweet['favorite_count'],
                "followers": user['followers_count'],
            })

        tweet_count += len(new_tweets)
        max_id = new_tweets[-1].id
        print('downloaded {0} tweets, with since_id={1} and max_id={2} and tweets_count={3}'.format(len(new_tweets), since_id, max_id, tweet_count))

    # Aggregation happens once, after paging has finished, so every exit path
    # of the loop above still yields a well-formed result.
    keywords_final = _summarize_polarities(keywords_count)
    m = [{'text': key, 'value': word_cloud[key]} for key in word_cloud]
    ranked = sorted(total_tweets, key=lambda x: x['retweet_count'], reverse=True)
    return ranked, keywords_final, m
134        
135        
136        
if __name__ == "__main__":
    # Keywords to search for, one per line.
    s = """LaGranEncuesta
#QuieroUnaMedellínLibre#DeVotoDe
#EnMiCiudadGana
#ElDebatePorBogota
#QueremosUnaAlcaldesa
gobernacion
alcalde
concejal
diputado
ediles
#PorUnasEleccionesTransparentes
#CNEAlerta
#VotoResponsableyLibre
@Invamer
#EncuestasNoMeRepresentan
fraudeelectoral""".splitlines()

    # Search in batches of 15 keywords per query. Slicing past the end of a
    # list is safe, so the last (shorter) batch needs no special handling and
    # no keyword is left out.
    batch_size = 15
    total = []
    for start in range(0, len(s), batch_size):
        nuevos, _, _ = get_tweets(query=';'.join(s[start:start + batch_size]))
        total += nuevos

    # Persist every downloaded tweet dict in a single pickle file.
    with open("julian.pkl", 'wb') as fh:
        pickle.dump(total, fh)