import urllib.request
import json
import re
import sqlite3
import operator
import os

import matplotlib.pyplot as plt
from pymystem3 import Mystem
from flask import Flask
from flask import render_template

# VK API access token is read from the environment.
token = os.environ["TOKEN"]

# VK community (group) ids; negative ids denote communities in the VK API.
owner_id1 = -94208167  # Data Mining
owner_id2 = -91453124  # Data Science
owner_id3 = -49815762  # Data Mining Labs

# SQLite table names, one per community.
table1 = 'data_mining'
table2 = 'data_science'
table3 = 'data_mining_labs'

# Pieces of the VK wall.get request URL.
a = 'https://api.vk.com/method/'
b = 'wall.get?'
c = 'owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s'

# Filename prefixes for the raw and lemmatized text dumps.
d1 = 'data_unlemmatized_'
d2 = 'data_lemmatized_'

conn = sqlite3.connect('final_project_data.db')
c1 = conn.cursor()

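# Illustrative only: get_posts() below assembles wall.get requests of the form
#   https://api.vk.com/method/wall.get?owner_id=-94208167&offset=0&count=100&v=5.92&access_token=<TOKEN>
# with owner_id/offset/count filled from its arguments and <TOKEN> from the environment.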
def get_posts(how_many, id_local):
    """Download up to how_many wall posts of the community id_local via the VK
    wall.get API (100 posts per request) and return them as a dict
    {running index: raw post object}."""
    count = 0
    r = how_many % 100  # remainder to fetch after the full pages of 100
    local_posts = {}
    i = 0
    while i < how_many // 100:
        p1 = a + b + c
        req = urllib.request.Request(p1 % (id_local, 100 * i, 100, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)
        data_length = len(data1['response']['items'])

        for j in range(min(100, data_length)):
            local_posts[j + 100 * i] = data1['response']['items'][j]

        count = i + 1
        i = i + 1
        if data_length < 100:
            # The wall has no more posts: stop paging and skip the remainder request.
            r = 0
            break

    if r > 0:
        p2 = a + b + c
        req = urllib.request.Request(p2 % (id_local, 100 * count, r, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)

        # Guard against the wall holding fewer posts than requested.
        for i in range(min(r, len(data1['response']['items']))):
            local_posts[i + 100 * count] = data1['response']['items'][i]

    return local_posts

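# Illustrative call (not part of the original flow): the dict values are raw VK
# post objects, so e.g.
#   posts = get_posts(150, owner_id1)
#   posts[0]['id'], posts[0]['text']
# relies on the 'id' and 'text' fields of the wall.get response, as used below.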
def save_posts_data_to_database(posts_local, table):
    """Write the post texts to data_unlemmatized_<table>.txt (one post per line)
    and insert (post_id, text) rows into the SQLite table of the same name."""
    # Opening in 'w' truncates the file, so no separate clearing pass is needed.
    file_local = open(d1 + table + '.txt', 'w', encoding='utf-8')

    s1 = 'INSERT INTO ' + table + ' VALUES (?, ?)'
    for i in range(len(posts_local)):
        post_id = posts_local[i]['id']
        text = posts_local[i]['text']
        file_local.write(text + '\n')
        c1.execute(s1, (post_id, text))
    conn.commit()  # commit the whole batch once

    file_local.close()

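# Note: the table name is interpolated directly into the SQL string; that is
# acceptable here only because `table` always comes from the hard-coded
# table1/table2/table3 constants, never from user input.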
def save_lemmatized_data(texts_local, table):
    """Lemmatize the raw texts with Mystem and write the result to
    data_lemmatized_<table>.txt. Lines are fed to Mystem in small batches,
    since one call per post would be slow."""
    m = Mystem()
    file_local = open(d2 + table + '.txt', 'w', encoding='utf-8')

    text = ''
    count = 0
    for line_local in texts_local.split('\n'):
        text = text + line_local + ' '
        if count > 10:
            lemma = m.lemmatize(text)
            file_local.write(''.join(lemma))
            text = ''
            count = 0
        count += 1

    # Lemmatize whatever is left in the last, possibly incomplete batch.
    lemma = m.lemmatize(text)
    file_local.write(''.join(lemma))
    file_local.close()

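# Mystem here is pymystem3's wrapper around Yandex Mystem; m.lemmatize() returns
# a list of tokens/lemmas (whitespace included), which is why the result is
# re-joined with ''.join(lemma) before being written out.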
def get_stop_words_from_file(file_local):
    """Read a stop-word list from a file with one word per line."""
    stop_words = []
    with open(file_local, 'r', encoding='utf-8') as f:
        for x in f.readlines():
            stop_words.append(x.split('\n')[0])

    return stop_words

def get_top_words_from_file(file_local):
    """Read the whitelist of 'top' words from a file with one word per line
    (same format as the stop-word file)."""
    top_words = []
    with open(file_local, 'r', encoding='utf-8') as f:
        for x in f.readlines():
            top_words.append(x.split('\n')[0])

    return top_words

def get_data_from_file(table):
    file_local_1 = open(d1 + table + '.txt', 'r', encoding='utf-8')
    texts_local = file_local_1.read()
    file_local_1.close()

    return texts_local

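# Assumed file layout (from the loaders above): stop_ru.txt and the
# top_words_<table>.txt files contain one word per line; the raw dumps are the
# data_unlemmatized_<table>.txt files written by save_posts_data_to_database().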
def get_top_number_words_lemmatized(how_many, table, color_local):
    """Count word frequencies in the lemmatized dump for `table`, keep only the
    whitelisted (top_words) words that are not stop words, save a bar chart of
    the how_many most frequent ones to static/img/<table>.png, and return the
    ranks (X) and counts (Y)."""
    file_local = open(d2 + table + '.txt', 'r', encoding='utf-8')
    texts_local = file_local.read()
    file_local.close()
    # Sets make the membership tests below O(1) instead of scanning lists.
    stop_words = set(get_stop_words_from_file('stop_ru.txt'))
    top_words = set(get_top_words_from_file('top_words_' + table + '.txt'))
    words_count = dict()
    words = re.compile(r'\w+').findall(texts_local.lower())

    for word in words:
        if word not in stop_words and word in top_words:
            if words_count.get(word) is None:
                words_count[word] = 1
            else:
                words_count[word] += 1

    ky = operator.itemgetter(1)
    words_count1 = sorted(words_count.items(), key=ky, reverse=True)
    words_top = []
    X = []
    Y = []

    for m in range(min(how_many, len(words_count1))):
        words_top.append(words_count1[m][0])
        X.append(m + 1)
        Y.append(words_count1[m][1])

    plt.bar(X, Y, color=color_local)
    # Labels are in Russian: "Keywords", "Words", "Word frequency".
    plt.title("Ключевые слова")
    plt.xlabel("Слова")
    plt.ylabel("Частотность слов")
    plt.xticks(X, words_top, rotation=75)
    plt.subplots_adjust(bottom=0.3)
    plt.savefig('static/img/' + table + '.png')
    plt.clf()

    return X, Y

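# Example of the return shape (illustrative): for how_many=40, X is the rank
# list [1, 2, ..., n] and Y the matching counts, with n <= 40 depending on how
# many whitelisted words actually occur in the community's posts.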
def get_comparison_graph(x, y_1, y_2, y_3):
    """Plot the three frequency curves on one figure and save it to
    static/img/comparison_graph.png."""
    plt.plot(x, y_1, color='#89a203')
    plt.plot(x, y_2, color='#96f97b')
    plt.plot(x, y_3, color='#029386')

    lines = list()
    lines.append('Data Mining | Анализ данных')
    lines.append('Data Science')
    lines.append('Data Mining Labs')
    plt.legend(lines)

    # Title (in Russian): "Comparison graph of the frequency of the 18 most
    # frequent keywords across the three communities".
    s1 = 'График-сравнение частотности 18 самых '
    s2 = 'частотных ключевых слов по трем сообществам'
    plt.title(s1 + s2)

    # Axis labels (in Russian): "Word rank" / "Word frequency".
    plt.xlabel("Рейтинг слов")
    plt.ylabel("Частотность слов")
    plt.xticks(x)
    plt.subplots_adjust(bottom=0.1)

    fig = plt.gcf()
    fig.set_size_inches(9, 5)

    plt.savefig('static/img/' + 'comparison_graph' + '.png', dpi=100)
    plt.clf()

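# Caveat (observation only, no behaviour change): plt.plot(x, y_i) assumes all
# three y-lists have the same length as x, i.e. each community yielded the full
# number of ranked words; with fewer matches matplotlib raises a length error.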
def rewrite_all_data_and_run(how_many):
    """Destructive full refresh: drop and recreate the three tables, re-download
    how_many posts per community, and rebuild the raw and lemmatized dumps."""
    c1.execute('DROP TABLE IF EXISTS data_mining')
    c1.execute('DROP TABLE IF EXISTS data_science')
    c1.execute('DROP TABLE IF EXISTS data_mining_labs')

    s1 = 'CREATE TABLE IF NOT EXISTS '
    pq1 = s1 + 'data_mining(post_id integer, text text)'
    pq2 = s1 + 'data_science(post_id integer, text text)'
    pq3 = s1 + 'data_mining_labs(post_id integer, text text)'

    c1.execute(pq1)
    c1.execute(pq2)
    c1.execute(pq3)

    posts1 = get_posts(how_many, owner_id1)  # WARNING: hits the VK API
    posts2 = get_posts(how_many, owner_id2)  # WARNING: hits the VK API
    posts3 = get_posts(how_many, owner_id3)  # WARNING: hits the VK API

    save_posts_data_to_database(posts1, table1)  # WARNING: overwrites dump and table
    save_posts_data_to_database(posts2, table2)  # WARNING: overwrites dump and table
    save_posts_data_to_database(posts3, table3)  # WARNING: overwrites dump and table

    save_lemmatized_data(get_data_from_file(table1), table1)  # WARNING: slow (Mystem)
    save_lemmatized_data(get_data_from_file(table2), table2)  # WARNING: slow (Mystem)
    save_lemmatized_data(get_data_from_file(table3), table3)  # WARNING: slow (Mystem)


# Uncomment to rebuild everything from scratch (network calls + lemmatization).
# rewrite_all_data_and_run(300)

# Regenerate the per-community bar charts and the comparison graph on startup.
x1, y1 = get_top_number_words_lemmatized(40, table1, '#89a203')
x2, y2 = get_top_number_words_lemmatized(40, table2, '#96f97b')
x3, y3 = get_top_number_words_lemmatized(40, table3, '#029386')

get_comparison_graph(x1, y1, y2, y3)

app = Flask(__name__)

@app.after_request
def add_header(response):
    # Disable client-side caching so freshly regenerated charts are always shown.
    response.cache_control.max_age = 0
    return response


@app.route('/')
def welcome_page():
    return render_template('welcome_page.html')


@app.route('/data_mining')
def data_mining_page():
    return render_template('data_mining_page.html')


@app.route('/data_science')
def data_science_page():
    return render_template('data_science_page.html')


@app.route('/data_mining_labs')
def data_mining_labs_page():
    return render_template('data_mining_labs_page.html')


@app.route('/comparison')
def comparison_page():
    return render_template('comparison_page.html')

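# The templates (welcome_page.html, data_mining_page.html, etc.) are expected in
# the standard Flask templates/ directory, and the charts saved above land in
# static/img/, which those templates presumably reference.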
'''
if __name__ == '__main__':
    app.run(debug=False)
'''

if __name__ == '__main__':
    app.debug = True
    port = int(os.environ.get("PORT", 5000))
    app.run(host='0.0.0.0', port=port)

conn.close()  # only reached once the Flask server has stopped