# Snippet metadata (from paste site): saved May 08, 2019, 07:20 PM
1import urllib.request
2import json
3import re
4import sqlite3
5from datetime import datetime
6from pymystem3 import Mystem
7import matplotlib.pyplot as plt
8import operator
9from wordcloud import WordCloud
10
11
token = ''  # VK Service token — must be filled in before running
owner_id = -84793390  # wall owner; negative id = community ("Yandex Academy")

# URL-building pieces for VK API calls (method prefix + query templates, v5.92)
a = 'https://api.vk.com/method/'
b = 'wall.get?'
b2 = 'wall.getComments?'
c = 'owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s'
c2 = 'post_id=%i&owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s'

# Module-level SQLite connection/cursor shared by every function below
conn = sqlite3.connect('hw6_data.db')
c1 = conn.cursor()
23
24
def get_stop_words_from_file(file_local):
    """Read a newline-separated stop-word list from *file_local*.

    Returns a list of words with line terminators stripped.
    """
    # 'with' guarantees the handle is closed (original leaked it);
    # splitlines() strips '\n' exactly like the old x.split('\n')[0],
    # and also copes with a missing trailing newline on the last line.
    with open(file_local, 'r', encoding='utf-8') as f:
        return f.read().splitlines()
33
34
def get_sex_by_user_id(user_id):
    """Query VK users.get for the 'sex' field of *user_id*.

    Returns 1 (female), 2 (male), 0 (hidden), or -1 when the id is a
    community (<= 0) or VK omits the field entirely.
    """
    if user_id <= 0:
        # Non-positive ids denote communities, which have no sex field.
        return -1
    url = ('https://api.vk.com/method/users.get?'
           'v=%s&access_token=%s&user_ids=%s&fields=sex' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    user = json.loads(raw.decode('utf-8'))['response'][0]
    return user['sex'] if 'sex' in user else -1
50
51
def get_is_private_name_sex_by_user_id(user_id):
    """Fetch privacy flag, full name and sex for *user_id* via users.get.

    Returns a tuple (is_private, "First Last", sex); sex is -1 when VK
    omits it, is_private is 0 for deactivated/open accounts and 1 for
    closed profiles.
    """
    url = ('https://api.vk.com/method/users.get?'
           'v=%s&access_token=%s&user_ids=%s&fields=sex' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    user = json.loads(raw.decode('utf-8'))['response'][0]

    if 'deactivated' in user:
        # Deactivated accounts are treated as open profiles.
        is_private = 0
    elif user['is_closed'] is True:
        is_private = 1
    else:
        is_private = 0

    name = '%s %s' % (user['first_name'], user['last_name'])
    sex = user.get('sex', -1)
    return is_private, name, sex
72
73
def get_number_of_friends_by_id(user_id):
    """Return the friend count for *user_id* via friends.get, or -1.

    -1 covers community ids and any API error (closed profile,
    deactivated account, bad token, ...).
    """
    if user_id < 0:
        return -1
    url = ('https://api.vk.com/method/friends.get?'
           'v=%s&access_token=%s&user_id=%s' % (5.92, token, user_id))
    raw = urllib.request.urlopen(urllib.request.Request(url)).read()
    data = json.loads(raw.decode('utf-8'))
    # VK signals an error by omitting the 'response' key.
    if 'response' not in data:
        return -1
    return data['response']['count']
88
89
def get_posts(how_many):
    """Download up to *how_many* posts from the wall of `owner_id`.

    Pages through wall.get in batches of 100 and returns a dict mapping
    a running 0-based index to the raw post item from the API.  Stops
    early (and skips the partial-page request) when the wall runs out of
    posts before *how_many* are collected.
    """
    count = 0                      # number of full pages fetched so far
    r = how_many % 100             # size of the trailing partial page
    local_posts = {}
    i = 0
    while i < how_many//100:
        p1 = a + b + c             # wall.get URL template
        req = urllib.request.Request(p1 % (owner_id, 100*i, 100, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)
        data_length = len(data1['response']['items'])

        for j in range(min(100, data_length)):
            local_posts[j+100*i] = data1['response']['items'][j]

        count = i + 1
        i = i + 1
        # Short page means the wall is exhausted: force loop exit and
        # cancel the partial-page request below.
        if data_length < 100:
            i = how_many
            r = 0

    if r > 0:
        # Fetch the final partial page (offset = full pages * 100).
        p2 = a + b + c
        req = urllib.request.Request(p2 % (owner_id, 100*count, r, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data1 = json.loads(result)

        for i in range(r):
            local_posts[i+100*count] = data1['response']['items'][i]

    return local_posts
123
124
def get_thread_comments(local_comments, main_comment, offset):
    """Fetch every reply in *main_comment*'s thread into *local_comments*.

    Pages through wall.getComments with comment_id in batches of 100.
    Deleted replies are stored with user_id -1 and a placeholder text.
    Dict keys are assigned consecutively starting at *offset*.
    Returns the updated dict.
    """
    c_id = main_comment["id"]
    count_thread = main_comment["thread"]["count"]
    deleted_text = "This comment has been deleted"

    url = ('https://api.vk.com/method/wall.getComments?'
           'comment_id=%i&owner_id=%i&offset=%i&count=%i&v=5.92&access_token=%s')

    count = 0  # full pages fetched so far
    for i in range(count_thread//100):
        req = urllib.request.Request(url % (c_id, owner_id, 100*count, 100, token))
        data = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))

        for j in range(100):
            item = data['response']['items'][j]
            # BUG FIX: original keyed entries with offset+i*count+j; since
            # count == i while page i is processed, that yields offset+i*i+j
            # and keys collide/skip.  Pages are 100 comments wide.
            key = offset + 100*i + j
            if "deleted" in item:
                local_comments[key] = {"text": deleted_text, "user_id": -1,
                                       'id': item["id"]}
            else:
                local_comments[key] = {"text": item["text"],
                                       "user_id": item["from_id"],
                                       'id': item["id"]}
        count += 1

    # Trailing partial page (the request is issued even when rg == 0,
    # matching the original control flow).
    rg = count_thread % 100
    # BUG FIX: original wrote `p4 + p5 % (...)` — precedence applied % to
    # p5 alone (4 placeholders, 5 arguments -> TypeError at runtime).
    # The full URL must be formatted as one string.
    req = urllib.request.Request(url % (c_id, owner_id, 100*count, rg, token))
    data = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
    for j in range(rg):
        item = data['response']['items'][j]
        key = offset + 100*count + j
        # BUG FIX: original read item["text"]/["from_id"] before checking
        # "deleted", raising KeyError on deleted replies in this loop.
        if "deleted" in item:
            local_comments[key] = {"text": deleted_text, "user_id": -1,
                                   'id': item["id"]}
        else:
            local_comments[key] = {"text": item["text"],
                                   "user_id": item["from_id"],
                                   'id': item["id"]}

    return local_comments
173
174
def get_comments(post_local):
    """Download all top-level comments (plus their reply threads) of a post.

    Pages through wall.getComments in batches of 100; each comment is
    stored as {'text', 'user_id', 'id'} under a running integer key.
    Deleted comments get user_id -1 and a placeholder text.  Finally the
    VK mention prefix '[idNNN|' is stripped from every stored text.
    """
    count = 0
    local_comments = {}
    post_id = post_local["id"]
    number_of_comments = post_local['comments']['count']
    r = number_of_comments % 100       # size of the trailing partial page

    for i in range(number_of_comments//100):
        p1 = a + b2 + c2               # wall.getComments URL template
        l = 100*i
        req = urllib.request.Request(p1 % (post_id, owner_id, l, 100, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data2 = json.loads(result)

        for j in range(100):
            s1 = "text"
            s2 = "This comment has been deleted"
            s3 = "user_id"
            s4 = data2['response']['items'][j]["id"]
            if "deleted" in data2['response']['items'][j]:
                local_comments[j+100*i] = {s1: s2, s3: -1, 'id': s4}
            else:
                s5 = data2['response']['items'][j]["text"]
                s6 = data2['response']['items'][j]["from_id"]
                local_comments[j+100*i] = {s1: s5, s3: s6, 'id': s4}

            # Recurse into the reply thread, inserting replies right
            # after this comment's key.
            n = data2["response"]['items'][j]["thread"]["count"]
            if n > 0:
                s7 = local_comments
                s8 = data2["response"]['items'][j]
                local_comments = get_thread_comments(s7, s8, j+100*i + 1)
                # NOTE(review): this `j += n` is a no-op — `for` rebinds
                # j on the next iteration, so thread replies and later
                # comments can share keys.  Verify intended numbering.
                j += n

        count = i + 1

    if r > 0:
        # Final partial page; k indexes API items, j tracks output keys
        # (j advances past thread replies, k does not).
        j1 = a + b2 + c2
        i1 = 100*count
        req = urllib.request.Request(j1 % (post_id, owner_id, i1, r, token))
        response = urllib.request.urlopen(req)
        result = response.read().decode('utf-8')
        data2 = json.loads(result)
        k, j = 0, 0

        while j < r:
            s1 = "text"
            s2 = "This comment has been deleted"
            s3 = "user_id"
            s4 = data2['response']['items'][k]["id"]
            if "deleted" in data2['response']['items'][k]:
                local_comments[100*count + j] = {s1: s2, s3: -1, 'id': s4}
            else:
                s5 = data2['response']['items'][k]["text"]
                s6 = data2['response']['items'][k]["from_id"]
                local_comments[100*count + j] = {s1: s5, s3: s6, 'id': s4}

            n = data2["response"]['items'][k]["thread"]["count"]
            if n > 0:
                u1 = data2["response"]['items'][k]
                e1 = 100*count + j + 1
                local_comments = get_thread_comments(local_comments, u1, e1)
                j += n
            k += 1
            j += 1

    # Strip VK user-mention markup ("[id123|Name]" prefix) from all texts.
    for k in range(len(local_comments)):
        t111 = local_comments[k]['text']
        local_comments[k]['text'] = re.sub(r'\[id\d+\|', '', t111)

    return local_comments
246
247
def get_post_or_comment_length(text_local):
    """Count word tokens (regex ``\\w+``) in a post/comment text.

    The deleted-comment placeholder counts as length 0.
    """
    if text_local == "This comment has been deleted":
        return 0
    # Raw string avoids the invalid-escape warning of '\w+'; re.findall
    # uses the module's pattern cache, so no explicit compile is needed.
    return len(re.findall(r'\w+', text_local))
253
254
def get_average_comments_lengths(local_comments):
    """Average word count of comment rows, skipping deleted ones.

    *local_comments* is a sequence of DB rows where index 3 holds the
    comment text.  Returns 0 when every comment is deleted or the
    sequence is empty.
    """
    word_re = re.compile(r'\w+')       # hoisted: compile once, not per row
    total_words = 0
    counted = 0
    for row in local_comments:
        text = row[3]
        if text != "This comment has been deleted":
            total_words += len(word_re.findall(text))
            counted += 1

    return total_words / counted if counted > 0 else 0
270
271
def get_time_of_post_in_hours(unix_time):
    """Convert a unix timestamp to hours since local midnight (float)."""
    moment = datetime.fromtimestamp(unix_time)
    seconds_into_day = moment.hour * 3600 + moment.minute * 60 + moment.second
    return seconds_into_day / 3600
276
277
def get_post_weekday(post_local):
    """Return the weekday of the post's 'date' field: 1 = Monday ... 7 = Sunday."""
    moment = datetime.fromtimestamp(post_local['date'])
    return moment.weekday() + 1
283
284
def get_data_from_file():
    """Return the full contents of data_unlemmatized.txt as one string."""
    # Context manager closes the handle even if read() raises.
    with open("data_unlemmatized.txt", "r", encoding='utf-8') as fh:
        return fh.read()
292
293
def save_lemmatized_data(texts_local):
    """Lemmatize *texts_local* with Mystem and write data_lemmatized.txt.

    Input lines are accumulated into batches of ~11 lines before each
    lemmatize call (mirrors the original batching to keep per-call text
    small), then the final partial batch is flushed.
    """
    m = Mystem()
    # One 'w' open both truncates and writes — replaces the original
    # truncate-then-reopen-for-append dance, and 'with' closes the handle.
    with open("data_lemmatized.txt", "w", encoding='utf-8') as out:
        batch = ''
        count = 0
        for line_local in texts_local.split('\n'):
            batch = batch + line_local + ' '
            if count > 10:
                out.write(''.join(m.lemmatize(batch)))
                batch = ''
                count = 0
            count += 1

        # Flush whatever is left in the final partial batch.
        out.write(''.join(m.lemmatize(batch)))
315
316
def save_posts_data_to_database(posts_local):
    """Insert every post into the 'posts' table and rewrite the text dumps.

    Truncates data_lemmatized.txt and data_unlemmatized.txt, writes each
    post's text to data_unlemmatized.txt, and stores a row
    (post_id, owner_id, weekday, comment_count, text, date) in SQLite.
    """
    # Truncate the lemmatized dump; save_lemmatized_data rebuilds it later.
    with open("data_lemmatized.txt", "w", encoding='utf-8'):
        pass

    insert_sql = 'INSERT INTO posts VALUES (?, ?, ?, ?, ?, ?)'
    # 'w' truncates and writes with a single handle (the original opened
    # the file twice and never used a context manager).
    with open('data_unlemmatized.txt', 'w', encoding='utf-8') as dump:
        for i in range(len(posts_local)):
            post = posts_local[i]
            dump.write(post['text'] + '\n')
            c1.execute(insert_sql, (post['id'], post['owner_id'],
                                    get_post_weekday(post),
                                    post['comments']['count'],
                                    post['text'], post['date']))
            conn.commit()
341
342
def save_comments_data_to_database(posts_local):
    """Fetch comments for every post and persist them (and their authors).

    Appends each comment text to data_unlemmatized.txt, inserts a row
    (comment_id, user_id, post_id, text) into 'comments', and records
    real users (user_id > 0) via save_users_data_to_database.
    """
    insert_sql = 'INSERT INTO comments VALUES (?, ?, ?, ?)'
    # 'with' guarantees the dump file is closed even if the API errors out.
    with open("data_unlemmatized.txt", "a", encoding='utf-8') as dump:
        for i in range(len(posts_local)):
            post_id = posts_local[i]['id']
            comments_local = get_comments(posts_local[i])
            for j in range(len(comments_local)):
                comment = comments_local[j]
                dump.write(comment['text'] + '\n')
                c1.execute(insert_sql, (comment['id'], comment['user_id'],
                                        post_id, comment['text']))
                conn.commit()
                if comment['user_id'] > 0:
                    save_users_data_to_database(comment['user_id'])
362
def save_users_data_to_database(user_id):
    """Insert profile data for *user_id* into 'users' unless already stored.

    Stores (user_id, is_private, name, sex, number_of_friends); community
    ids (user_id <= 0) are ignored.
    """
    # Parameterized query instead of string concatenation — standard
    # SQL-injection hygiene, even though user_id is an int here.
    c1.execute('SELECT * FROM users WHERE user_id = ?', (user_id,))
    if c1.fetchone() is None and user_id > 0:
        is_private, name, sex = get_is_private_name_sex_by_user_id(user_id)
        nf = get_number_of_friends_by_id(user_id)
        c1.execute('INSERT INTO users VALUES (?, ?, ?, ?, ?)',
                   (user_id, is_private, name, sex, nf))
        conn.commit()
371
372# ################### Graphs ###################
373
374
def graph_1_posts_and_comments_lengths(posts_local):
    """Scatter plot: average comment length vs. post length (in words).

    *posts_local* is a list of 'posts' table rows; index 0 is post_id,
    index 4 is the post text.  Comment rows are pulled from SQLite.
    """
    len_posts = len(posts_local)
    # Placeholder list; every slot is reassigned below, so the shared-{}
    # aliasing from list multiplication is harmless here.
    data_x_y = [{}]*len_posts
    s1 = 'SELECT * FROM comments WHERE post_id = '
    for h in range(len_posts):
        c1.execute(s1 + str(posts_local[h][0]))
        comments_local = c1.fetchall()
        x = get_post_or_comment_length(posts_local[h][4])
        y = get_average_comments_lengths(comments_local)
        data_x_y[h] = {'x': x, 'y': y}

    # Sort points by post length so the scatter reads left-to-right.
    data_x_y = sorted(data_x_y, key=lambda k: k['x'])
    X = [0]*len_posts
    Y = [0]*len_posts

    for i in range(len_posts):
        X[i] = data_x_y[i]['x']
        Y[i] = data_x_y[i]['y']

    plt.scatter(X, Y, s=30, c="#ceaefa", marker="o")
    t1 = "График завиÑимоÑти Ñредней длины комментариев от длины поÑта"
    plt.title(t1)
    plt.xlabel("Длина поÑта, Ñлова")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° комментариÑ, Ñлова")
    plt.xlim(-3, 300)
    plt.ylim(-0.5, 50)
    plt.show()
403
404
def graph_2_post_time_and_length(posts_local):
    """Scatter plot: average post length (words) per hour of publication.

    Posts are binned into 24 hour-buckets by local publication time;
    buckets with no posts are skipped on the x-axis.
    """
    len_posts = len(posts_local)
    # data_x_y[hour] = [total words, post count]
    data_x_y = list([])
    for h in range(24):
        data_x_y.append([0, 0])

    s1 = 'SELECT date FROM posts WHERE post_id = '
    for h in range(len_posts):
        c1.execute(s1 + str(posts_local[h][0]))
        date_local = c1.fetchall()[0][0]
        post_time = get_time_of_post_in_hours(date_local)
        post_length = get_post_or_comment_length(posts_local[h][4])
        decision_hat = int(post_time//1)      # hour bucket 0..23
        data_x_y[decision_hat][0] += post_length
        data_x_y[decision_hat][1] += 1

    X = []
    Y = []
    l_time = list()

    count_local = 0
    for i in range(24):
        g = i
        if i == 23:
            g = -1               # wrap the 23:01-24:00 label to ...-0:00
        y_count = data_x_y[i][1]
        if y_count != 0:         # only plot hours that have posts
            X.append(count_local+1)
            Y.append(data_x_y[i][0]//y_count)
            l_time.append('%d:01-%d:00' % (i, g + 1))
            count_local += 1

    t1 = "График завиÑимоÑти Ñредней длины поÑта от времени публикации поÑта"
    plt.scatter(X, Y, s=60, c="orange", marker="o")
    plt.title(t1)
    plt.xlabel("Ð’Ñ€ÐµÐ¼Ñ Ð¿ÑƒÐ±Ð»Ð¸ÐºÐ°Ñ†Ð¸Ð¸ поÑта, чаÑÑ‹")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° поÑта, Ñлова")
    plt.subplots_adjust(bottom=0.3)
    plt.xticks(X, l_time, rotation=45)
    plt.xlim(0, count_local + 1)
    plt.ylim(-3, 150)
    plt.show()
448
449
def graph_3_post_weekday_and_length(posts_local):
    """Line plot: average post length (words) per weekday of publication.

    *posts_local* rows carry the weekday at index 2 (1 = Monday) and the
    post text at index 4.
    """
    # sums[d] = [total words, post count] for weekday d+1
    sums = [[0, 0] for _ in range(7)]
    for row in posts_local:
        weekday = row[2]
        sums[weekday-1][0] += get_post_or_comment_length(row[4])
        sums[weekday-1][1] += 1

    X = list(range(1, 8))
    Y = []
    for total, n_posts in sums:
        # Avoid dividing by zero for weekdays without posts.
        Y.append(total / max(n_posts, 1))

    plt.plot(X, Y, c="#ff6163")
    plt.title("График завиÑимоÑти Ñредней длины поÑта от Ð´Ð½Ñ Ð½ÐµÐ´ÐµÐ»Ð¸ публикации")
    plt.xlabel("День недели публикации поÑта")
    plt.ylabel("Длина поÑта, Ñлова")
    days_list = ['Понедельник', 'Вторник', 'Среда', 'Четверг', 'ПÑтница',
                 'Суббота', 'ВоÑкреÑенье']
    plt.xticks(X, days_list, rotation=45)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0.5, 7.5)
    plt.ylim(0, 110)
    plt.show()
482
483
def graph_4_sex_and_comments_length(posts_local):
    """Bar chart: average comment length (words) by commenter sex.

    Looks up each comment author's sex in the 'users' table; comments by
    communities/deleted authors (user_id < 1) or users whose sex VK hides
    (sex <= 0) are skipped.
    """
    male_sum = 0
    male_count = 0
    female_sum = 0
    female_count = 0

    for h in range(len(posts_local)):
        # Parameterized queries instead of string concatenation.
        c1.execute('SELECT * FROM comments WHERE post_id = ?',
                   (posts_local[h][0],))
        comments_local = c1.fetchall()
        for comment in comments_local:
            user_id = comment[1]
            if user_id < 1:
                continue
            length = get_post_or_comment_length(comment[3])
            c1.execute('SELECT sex FROM users WHERE user_id = ?', (user_id,))
            sex = c1.fetchone()[0]
            if sex == 2:
                male_sum += length
                male_count += 1
            elif sex > 0:
                female_sum += length
                female_count += 1

    # BUG FIX: original divided unconditionally and raised
    # ZeroDivisionError when one sex had no counted comments.
    X = [1, 2]
    Y = [female_sum/female_count if female_count else 0,
         male_sum/male_count if male_count else 0]
    plt.bar(X[0], Y[0], color='#9d0759')
    plt.bar(X[1], Y[1], color='#3a18b1')
    plt.xticks(X, ['ЖенÑкий', 'МужÑкой'], rotation=0)
    t0 = "График завиÑимоÑти Ñредней длины комментариев"
    t1 = t0 + " от пола автора комментариÑ"
    plt.title(t1)
    plt.xlabel("Пол")
    plt.ylabel("СреднÑÑ Ð´Ð»Ð¸Ð½Ð° комментариÑ, Ñлова")
    plt.subplots_adjust(bottom=0.2)
    plt.xlim(0, 3)
    plt.ylim(0, 25)
    plt.show()
527
528
def graph_5_number_of_friends_and_comments_length(posts_local):
    """Line plot: average comment length vs. author's friend count.

    Friend counts are binned in widths of 30 (bin 10 collects >300);
    comments by communities (user_id < 1) or users whose friend count is
    unknown (< 0) are skipped.
    """
    len_posts = len(posts_local)
    z1 = []
    for uu in range(11):
        z1.append([0, 0])
    data_x_y = list(z1)  # data_x_y[bin] = [word sum, comment count]
    s1 = 'SELECT number_of_friends FROM users WHERE user_id = '
    for h in range(len_posts):
        post_id = posts_local[h][0]
        c1.execute('SELECT * FROM comments WHERE post_id = ' + str(post_id))
        comments_local = c1.fetchall()
        for i in range(len(comments_local)):
            user_id = comments_local[i][1]
            if user_id < 1:
                continue
            c1.execute(s1 + str(user_id))
            number_of_friends = c1.fetchone()[0]
            if number_of_friends < 0:
                continue
            comment_length = get_post_or_comment_length(comments_local[i][3])
            # NOTE(review): always true after the `< 0` continue above —
            # redundant guard kept as-is.
            if number_of_friends > -1:
                decision_hat = number_of_friends//30   # 30-wide bins
                if decision_hat > 10:
                    decision_hat = 10                  # clamp to '>300'
                data_x_y[decision_hat][0] += comment_length
                data_x_y[decision_hat][1] += 1

    length_data = len(data_x_y)
    X = []
    Y = []

    for i in range(length_data):
        X.append(i+1)
        if data_x_y[i][1] == 0:
            data_x_y[i][1] = 1     # avoid division by zero for empty bins
        Y.append(data_x_y[i][0]/data_x_y[i][1])

    plt.plot(X, Y, c='#fed0fc', linewidth=3)
    t1 = "График завиÑимоÑти Ñредней длины комментариев от количеÑтва друзей"
    plt.title(t1)
    list_hours = list(['0-30', '31-60', '61-90', '91-120', '121-150'])
    list_hours.append('151-180')
    list_hours.append('181-210')
    list_hours.append('211-240')
    list_hours.append('241-270')
    list_hours.append('271-300')
    list_hours.append('>300')

    plt.xticks(X, list_hours, rotation=45)
    plt.xlim(0.5, 11.5)
    plt.ylim(0, 25)
    plt.xlabel("КоличеÑтво друзей")
    plt.ylabel("Длина комментариÑ, Ñлова")
    plt.subplots_adjust(bottom=0.2)
    plt.show()
585
586
def graph_6_top_30_words_unlemmatized():
    """Bar chart of the 30 most frequent words in data_unlemmatized.txt.

    Text is lowercased, tokenized with ``\\w+`` and filtered against the
    stop-word list in stop_ru.txt.
    """
    with open("data_unlemmatized.txt", "r", encoding='utf-8') as fh:
        texts_local = fh.read()
    # A set gives O(1) membership tests; the original scanned a list
    # once per word.
    stop_words = set(get_stop_words_from_file('stop_ru.txt'))

    words_count = {}
    for word in re.findall(r'\w+', texts_local.lower()):
        if word not in stop_words:
            words_count[word] = words_count.get(word, 0) + 1

    ranked = sorted(words_count.items(), key=operator.itemgetter(1),
                    reverse=True)
    words_top = []
    X = []
    Y = []

    for m in range(min(30, len(ranked))):
        words_top.append(ranked[m][0])
        X.append(m + 1)
        Y.append(ranked[m][1])

    plt.bar(X, Y, color='#fffe71')
    plt.title("Топ 30 Ñлов по нелемматизированным текÑтам")
    plt.xlabel("Слова")
    plt.ylabel("ЧаÑтотноÑть Ñлов")
    plt.xticks(X, words_top, rotation=60)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0, 31)
    plt.ylim(0, 150)
    plt.show()
622
623
def graph_7_and_cloud_top_30_words_lemmatized():
    """Bar chart + word cloud of the top 30 lemmatized words.

    Reads data_lemmatized.txt, filters stop words, shows the bar chart,
    then renders a WordCloud and saves it to words_cloud.png.
    """
    with open("data_lemmatized.txt", "r", encoding='utf-8') as fh:
        texts_local = fh.read()
    # Set lookup is O(1) per word vs. the original list scan.
    stop_words = set(get_stop_words_from_file('stop_ru.txt'))

    words_count = dict()
    for word in re.findall(r'\w+', texts_local.lower()):
        if word not in stop_words:
            words_count[word] = words_count.get(word, 0) + 1

    ranked = sorted(words_count.items(), key=operator.itemgetter(1),
                    reverse=True)
    words_top = []
    X = []
    Y = []

    for m in range(min(30, len(ranked))):
        words_top.append(ranked[m][0])
        X.append(m + 1)
        Y.append(ranked[m][1])

    plt.bar(X, Y, color='#89a203')
    plt.title("Топ 30 Ñлов по лемматизированным текÑтам")
    plt.xlabel("Слова")
    plt.ylabel("ЧаÑтотноÑть Ñлов")
    plt.xticks(X, words_top, rotation=60)
    plt.subplots_adjust(bottom=0.3)
    plt.xlim(0, 31)
    plt.ylim(0, 300)
    plt.show()

    # The cloud takes the full frequency dict and caps itself at 30 words.
    cloud = WordCloud(background_color="white", max_words=30)
    cloud.generate_from_frequencies(words_count)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

    cloud.to_file("words_cloud.png")
668
669
670# ##############################################################
671
672
def run_on_existing_data():
    """Draw all seven graphs from data already stored in the database."""
    c1.execute('SELECT * FROM posts')
    posts_all = c1.fetchall()

    # Graphs 1-5 all take the post rows; 6 and 7 read the text dumps.
    for draw in (graph_1_posts_and_comments_lengths,
                 graph_2_post_time_and_length,
                 graph_3_post_weekday_and_length,
                 graph_4_sex_and_comments_length,
                 graph_5_number_of_friends_and_comments_length):
        draw(posts_all)
    graph_6_top_30_words_unlemmatized()
    graph_7_and_cloud_top_30_words_lemmatized()
684
685
def rewrite_all_data_and_run():
    """Drop and rebuild all tables, re-download everything, draw all graphs.

    WARNING: destroys the local database and re-fetches posts, comments
    and user profiles from the VK API (slow; needs a valid token).
    """
    c1.execute('DROP TABLE IF EXISTS posts')
    c1.execute('DROP TABLE IF EXISTS comments')
    c1.execute('DROP TABLE IF EXISTS users')

    c1.execute("CREATE TABLE IF NOT EXISTS posts(post_id integer, owner_id integer"
               ", weekday integer, comments integer, text text, date integer)")
    c1.execute("CREATE TABLE IF NOT EXISTS comments(id integer, user_id integer"
               ", post_id integer, text text)")
    c1.execute("CREATE TABLE IF NOT EXISTS users(user_id integer, is_private integer"
               ", name text, sex integer, number_of_friends integer)")

    posts = get_posts(151)                       # WARNING!!!
    save_posts_data_to_database(posts)           # WARNING!!!
    save_comments_data_to_database(posts)        # WARNING!!!
    save_lemmatized_data(get_data_from_file())   # WARNING!!!

    # The original duplicated the fetch-posts-and-draw-graphs tail of
    # run_on_existing_data(); reuse it instead of copy-paste.
    run_on_existing_data()
716
717
# Script entry point: draw the graphs from existing database contents.
# NOTE(review): this also runs on import — an `if __name__ == '__main__':`
# guard would be the conventional fix, but it would change import behaviour.
run_on_existing_data()

conn.close()