# Pasted source snapshot: Apr 20, 2018, 08:28 PM
1#Ryan Sherry, Cletus Andoh, Xin Tang, Vincent Zhang, Eric Liu
#CIS 400
3#Introduction to Social Media Data Mining
4#Final Project
5#4/4/2018
6
7from datetime import datetime
8import twitter
9import json
10import operator
11import json
12import networkx as nx
13import pandas as pd # handle data
14from textblob import TextBlob #sentiment analysis
15import re
16import IMDB_Data
17import matplotlib.pyplot as plt
18import numpy as np #for number computing
19from operator import neg
20
21import IMDB_Data
22
#global variables for sentiment

# Percentage of positive / neutral / negative tweets from the most recent
# searchTweets() run; written via the setgPos/setgNeut/setgNeg helpers below.
gPos = 0
gNeut = 0
gNeg = 0
28
def oauth_login():
    """Return an authenticated twitter.Twitter API handle.

    Credentials are read from the environment (TWITTER_CONSUMER_KEY,
    TWITTER_CONSUMER_SECRET, TWITTER_OAUTH_TOKEN, TWITTER_OAUTH_TOKEN_SECRET)
    when set, so they can be rotated without editing source. The original
    hard-coded literals remain only as a backward-compatible fallback.

    NOTE(review): these keys were committed to source control and must be
    considered leaked -- revoke them and rely on the environment variables.
    """
    import os  # local import: keeps the file-level import block unchanged

    CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY',
                                  '4A34OfnrNnjRU3LYoDszUgTnM')
    CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET',
                                     'LFlM8myxCr5FicvBdX6QTn93D0iuk79z82kbyscIDDUaG5lOuX')
    OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN',
                                 '913439754-YnDyGZaxD8TGJhbCLKQtSQb9e5Kpegcnl0JHJaXI')
    OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET',
                                        'XH4OLERcapvG9BGryUWDDj904qRj4VPEMoncD6FN9rBaJ')

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)
    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api
39
40from functools import partial
41from sys import maxint
42
# code from https://dev.to/rodolfoferro/sentiment-analysis-on-trumpss-tweets-using-python-
# code used from there includes clean_tweet and analize_sentiment

def clean_tweet(tweet):
    '''
    Utility function to clean the text in a tweet by removing
    links, @-mentions and special characters using regex.
    Runs of whitespace in the result are collapsed to single spaces.
    '''
    # Raw string literal: in a plain string '\w' and '\S' are invalid escape
    # sequences (DeprecationWarning on modern Pythons). The redundant '\/'
    # escapes are dropped -- '/' needs no escaping in a Python regex.
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", tweet).split())
52
def analize_sentiment(tweet):
    '''
    Utility function to classify the polarity of a tweet
    using textblob: 1 for positive, 0 for neutral, -1 for negative.
    '''
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity > 0:
        return 1
    return 0 if polarity == 0 else -1
65
#original code from cookbook for CIS 400SU
def twitter_search(twitter_api, q, max_results=200, **kw):
    """Search Twitter for query *q* and return a list of status dicts.

    Follows the 'next_results' pagination cursor until *max_results*
    statuses (capped at 1000) are collected or no more pages exist.

    See https://dev.twitter.com/docs/api/1.1/get/search/tweets and
    https://dev.twitter.com/docs/using-search for advanced search
    criteria that may be useful for keyword arguments.
    """
    search_results = twitter_api.search.tweets(q=q, count=100, **kw)
    statuses = search_results['statuses']

    # OAuth users can "only" make 180 search queries per 15-minute interval
    # (https://dev.twitter.com/docs/rate-limiting/1.1/limits), so enforce a
    # reasonable ceiling; ~1000 results may not exist for all queries anyway.
    max_results = min(1000, max_results)

    for _ in range(10):  # 10 pages * 100 tweets = 1000
        try:
            next_results = search_results['search_metadata']['next_results']
        # 'except KeyError, e' was Python-2-only syntax and the bound
        # exception was never used; this form parses on Python 2 and 3.
        except KeyError:  # no more results when next_results doesn't exist
            break

        # next_results has the form:
        # ?max_id=313519052523986943&q=NCAA&include_entities=1
        kwargs = dict(kv.split('=') for kv in next_results[1:].split("&"))

        search_results = twitter_api.search.tweets(**kwargs)
        statuses += search_results['statuses']

        if len(statuses) > max_results:
            break

    return statuses
106
#usage of code
# NOTE(review): module-level side effect -- this authenticates against the
# Twitter API at import time; consider moving it into main() instead.
twitter_api = oauth_login()
109
110
def searchTweets():
    """Prompt for a movie and a tweet count, run sentiment analysis.

    Gathers tweets via twitter_search(), classifies each with
    analize_sentiment(), stores the positive/neutral/negative percentages
    in the gPos/gNeut/gNeg module globals, and shows a pie chart.
    """
    q = raw_input("What movie would you like to search?")
    # int(raw_input(...)) instead of input(): on Python 2, input() eval()s
    # arbitrary user-typed expressions, which is fragile and unsafe.
    tCount = int(raw_input("How many tweets would you like to gather?"))

    results = twitter_search(twitter_api, q, max_results=tCount)

    if not results:
        # Avoid a ZeroDivisionError below when the search comes back empty.
        print("No tweets found for '%s'" % q)
        return

    # The search API may return fewer tweets than requested; clamp so the
    # slice below never walks off the end (the old while-loop raised
    # IndexError in that case).
    tCount = min(tCount, len(results))

    # harvest the tweet text out of the raw status JSON
    result2 = [status['text'] for status in results[:tCount]]

    # create Data Frame: one row per tweet, plus length and sentiment columns
    data = pd.DataFrame(data=result2, columns=['Tweets'])
    data['len'] = np.array([len(text) for text in result2])
    data['SA'] = np.array([analize_sentiment(tweet) for tweet in data['Tweets']])

    # sentiment analysis for tweet: partition by classifier output
    pos_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] > 0]
    neu_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] == 0]
    neg_tweets = [tweet for index, tweet in enumerate(data['Tweets']) if data['SA'][index] < 0]

    # Use float division: the old integer expressions silently truncated
    # the percentages under Python 2.
    total = float(len(data['Tweets']))
    posCount = len(pos_tweets) * 100 / total
    neutCount = len(neu_tweets) * 100 / total
    negCount = len(neg_tweets) * 100 / total

    setgPos(posCount)
    setgNeut(neutCount)
    setgNeg(negCount)

    print("SENTIMENT ANALYSIS OF '%s' for '%s' number of tweets" % (q, tCount))
    print("Positive tweets: {}%".format(posCount))
    print("Neutral tweets: {}%".format(neutCount))
    print("Negative tweets: {}%".format(negCount))

    # pie chart of the sentiment breakdown
    labels = 'Positive', 'Neutral', 'Negative'
    sizes = [posCount, neutCount, negCount]
    colors = ['green', 'yellow', 'red']
    patches, texts = plt.pie(sizes, colors=colors, shadow=True, startangle=90)
    plt.legend(patches, labels, loc="best")
    plt.axis('equal')
    plt.tight_layout()
    plt.show()
173
174
def setgPos(n):
    # Store the positive-tweet percentage in the module global gPos.
    global gPos
    gPos = n
def setgNeut(n):
    # Store the neutral-tweet percentage in the module global gNeut.
    global gNeut
    gNeut = n
def setgNeg(n):
    # Store the negative-tweet percentage in the module global gNeg.
    global gNeg
    gNeg = n
184
def compareScores(IMDB_score, pos, neut, neg):
    """Compare the IMDB rating with each sentiment share.

    IMDB_score is on a 0-10 scale; pos/neut/neg are percentages (0-100).
    Returns [positive, neutral, negative] absolute deviations, each
    expressed as a percentage.
    """
    rating_fraction = float(IMDB_score) / 10
    return [abs(rating_fraction - float(share) / 100) * 100
            for share in (pos, neut, neg)]
198
199
200#-------------------------------------------------------------------------------------
def main():
    """Drive the pipeline: gather tweets, run SA, compare with IMDB."""
    print("This program fetches tweets about movies and performs SA on them")

    # Populates the gPos/gNeut/gNeg globals as a side effect.
    searchTweets()

    # Fetches the IMDB rating; the result is presumably exposed through
    # IMDB_Data.gIMDB (read below) -- verify against IMDB_Data module.
    IMDB_Data.getRating()

    variation = compareScores(IMDB_Data.gIMDB, gPos, gNeut, gNeg)

    print("variation between IMDB and Positive Sentiment : {}%".format(variation[0]))
    #print("variation between IMDB and Neutral Sentiment : {}%".format(variation[1]))
    #print("variation between IMDB and Negative Sentiment: {}%".format(variation[2]))
214
215
# Run the full pipeline only when executed as a script, not on import.
if __name__=='__main__':
    main()