# Paste-site header (not part of the program): posted Nov 13, 2019, 11:30 PM
1import sys
2import os
3import tweepy
4import json
5from classifier import *
6import spacy
7import pickle
8
# Twitter application credentials.
# NOTE(review): these literals are committed in plain text, so they should be
# treated as leaked: rotate them and supply the new values through the
# TWITTER_APP_KEY / TWITTER_APP_SECRET environment variables. The literals are
# kept only as a fallback so existing runs behave exactly as before.
app_key = os.environ.get('TWITTER_APP_KEY', 'qMFwUUdr9vmKVnFF8i2EeMYrZ')
app_secret = os.environ.get(
    'TWITTER_APP_SECRET',
    'qtiGPNnzf0GcteCCfmHCsGdlEDOk0XZUX6p04hom0ouSeJYJX7',
)

# Application-only authentication handler built from the key/secret pair.
auth = tweepy.AppAuthHandler(app_key, app_secret)

# Shared API client used by get_tweets(); configured to wait (and notify)
# when the rate limit is reached instead of failing.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# NOTE(review): a freshly constructed client object is presumably always
# truthy, so this branch looks unreachable -- confirm before relying on it.
if not api:
    print('paila el api')

# Sentiment model whose predict() scores tweet text; presumably provided by
# the star import from the `classifier` module -- verify.
clf = SentimentClassifier()

# spaCy pipeline 'es_core_news_sm', used by fun_keyw() to tokenize text.
nlp = spacy.load('es_core_news_sm')
24
def fun_keyw(text,di):
    """Accumulate word frequencies from ``text`` into ``di``.

    The text is tokenized with the module-level ``nlp`` pipeline. A token is
    counted only when its string form is longer than one character, it is not
    flagged as a stop word, and it is purely alphabetic.

    Args:
        text: The text to tokenize.
        di: Dictionary mapping word -> count; it is updated in place.

    Returns:
        The same ``di`` object, after the counts have been updated.
    """
    for token in nlp(text):
        word = str(token)

        # Skip single characters, stop words and anything non-alphabetic.
        if len(word) <= 1 or token.is_stop or not token.is_alpha:
            continue

        di[word] = di.get(word, 0) + 1

    return di
34
35
36
37
def _summarize_polarities(keywords_count):
    """Turn per-keyword polarity lists into positive/negative/average summaries."""
    summaries = []
    for keyword, polarities in keywords_count.items():
        n = len(polarities)
        # A polarity strictly above 0.5 counts as positive, anything else as negative.
        positive = sum(1 for polarity in polarities if polarity > 0.5)
        summaries.append({
            "name": keyword,
            'positive': positive / n if n else 0,
            'negative': (n - positive) / n if n else 0,
            'average': sum(polarities) / n if n else 0,
        })
    return summaries


def get_tweets(query,geocode="4.115350699646785,-72.584710852785,150000km",langu='es',max_tweets=100):
    """Search Twitter for the given keywords and score each tweet's sentiment.

    Args:
        query: Keywords separated by ``;``. Each keyword is wrapped in double
            quotes and the keywords are joined with ``OR`` to form the search
            query; retweets are excluded with ``-filter:retweets``.
        geocode: Accepted for backward compatibility. It is currently NOT
            forwarded to the search request, so it has no effect.
        langu: Language code passed to the search as ``lang``.
        max_tweets: Upper bound on the number of tweets to download. The
            function never requests more than the remaining amount, so the
            result does not exceed this value.

    Returns:
        A 3-tuple ``(tweets, keyword_stats, word_cloud)``:

        * ``tweets``: list of dicts (``text``, ``polaridad``, ``username``,
          ``name``, ``location``, ``image``, ``retweet_count``,
          ``favorite_count``, ``followers``) sorted by ``retweet_count``,
          highest first.
        * ``keyword_stats``: one dict per distinct keyword with ``name``,
          ``positive`` and ``negative`` (fractions of the matching tweets)
          and ``average`` (mean polarity); all zero when nothing matched.
        * ``word_cloud``: list of ``{'text': word, 'value': count}`` built
          from the word counts gathered by ``fun_keyw``.

    A ``tweepy.TweepError`` raised by the search is printed and stops the
    download; whatever was collected so far is still returned.
    """
    print("por aca entro")

    keywords = query.split(';')
    search_query = ' OR '.join('"{}"'.format(keyword) for keyword in keywords)

    print(search_query)

    tweets_per_query = 100  # Twitter does not allow more per request
    since_id = None  # never set; kept only because the progress message prints it
    max_id = -1  # id of the last tweet of the previous page (-1: no page yet)
    tweet_count = 0
    total_tweets = []
    word_cloud = {}

    # A dict keyed by keyword: duplicates in `query` collapse into one entry.
    keywords_count = {keyword: [] for keyword in keywords}

    while tweet_count < max_tweets:
        search_args = {
            'q': search_query + ' -filter:retweets',
            # Ask only for what is still missing so max_tweets is respected.
            'count': min(tweets_per_query, max_tweets - tweet_count),
            'lang': langu,
            'tweet_mode': "extended",
        }
        if max_id > 0:
            # Continue strictly below the last tweet already downloaded.
            search_args['max_id'] = str(max_id - 1)

        try:
            new_tweets = api.search(**search_args)
        except tweepy.TweepError as e:
            print('err:' + str(e))
            break

        if not new_tweets:
            print('no more tweets found')
            break

        for status in new_tweets:
            n_tweet = status._json
            full_text = n_tweet['full_text']
            user = n_tweet['user']

            # Word-cloud counts are gathered on the upper-cased text.
            fun_keyw(full_text.upper(), word_cloud)

            polaridad = clf.predict(full_text)

            total_tweets.append({
                "text": full_text,
                "polaridad": polaridad,
                "username": user['screen_name'],
                "name": user['name'],
                "location": user['location'],
                "image": user['profile_image_url_https'],
                "retweet_count": n_tweet['retweet_count'],
                "favorite_count": n_tweet['favorite_count'],
                "followers": user['followers_count'],
            })

            # Match keywords ignoring case: a tweet containing "Alcalde" must
            # count for the keyword "alcalde" as well.
            lowered_text = full_text.lower()
            for keyword, polarities in keywords_count.items():
                if keyword.lower() in lowered_text:
                    polarities.append(polaridad)

        tweet_count += len(new_tweets)
        max_id = new_tweets[-1].id
        print('downloaded {0} tweets, with since_id={1} and max_id={2} and tweets_count={3}'.format(len(new_tweets), since_id, max_id, tweet_count))

    keywords_final = _summarize_polarities(keywords_count)

    m = [{'text': key, 'value': count} for key, count in word_cloud.items()]

    ranked_tweets = sorted(total_tweets, key=lambda x: x['retweet_count'], reverse=True)
    return ranked_tweets, keywords_final, m
134
135
136
def keyword_batches(keywords, size=15):
    """Split ``keywords`` into consecutive lists of at most ``size`` items.

    Every item ends up in exactly one batch, in order; an empty input yields
    no batches. ``size`` must be a positive integer.
    """
    return [keywords[start:start + size] for start in range(0, len(keywords), size)]


if __name__=="__main__":
    # One keyword / hashtag / account per line.
    s = """LaGranEncuesta
#QuieroUnaMedellínLibre#DeVotoDe
#EnMiCiudadGana
#ElDebatePorBogota
#QueremosUnaAlcaldesa
gobernacion
alcalde
concejal
diputado
ediles
#PorUnasEleccionesTransparentes
#CNEAlerta
#VotoResponsableyLibre
@Invamer
#EncuestasNoMeRepresentan
fraudeelectoral""".splitlines()

    total = []
    # Query the keywords in batches of 15. Plain slicing is used so the last
    # keyword is included and no empty batch is ever sent as a query.
    for batch in keyword_batches(s, 15):
        nuevos, _, _ = get_tweets(query=';'.join(batch))
        total += nuevos

    # Persist every downloaded tweet once all batches are done.
    with open("julian.pkl", 'wb') as file:
        pickle.dump(total, file)